P4 to Git Change 1364923 by gandryey@gera-lnx-rcf-lc on 2017/01/23 11:48:45

SWDEV-110996 - OCL to use the blit manager instead of ROCr implementing the copyRect API

	- Implement the blit manager functionality in the ROCm backend. This check-in also fixes SWDEV-95079, SWDEV-95068, SWDEV-95069, SWDEV-95071

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdefs.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#35 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.hpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.hpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#6 edit


[ROCm/clr commit: 454621b7f1]
This commit is contained in:
foreman
2017-01-23 11:59:51 -05:00
orang tua f483250946
melakukan 8b58936282
11 mengubah file dengan 2820 tambahan dan 1527 penghapusan
File diff ditekan karena terlalu besar Load Diff
@@ -1,5 +1,5 @@
//
// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#pragma once
@@ -8,12 +8,13 @@
#include "platform/commandqueue.hpp"
#include "device/device.hpp"
#include "device/blit.hpp"
#include "device/rocm/rocdefs.hpp"
/*! \addtogroup HSA Blit Implementation
/*! \addtogroup ROC Blit Implementation
* @{
*/
//! HSA Blit Manager Implementation
//! ROC Blit Manager Implementation
namespace roc {
class Device;
@@ -22,221 +23,29 @@ class Memory;
class VirtualGPU;
//! DMA Blit Manager
class HsaBlitManager : public device::HostBlitManager
class DmaBlitManager : public device::HostBlitManager
{
public:
//! Constructor
HsaBlitManager(
device::VirtualDevice& vdev, //!< Virtual GPU to be used for blits
Setup setup = Setup() //!< Specifies HW accelerated blits
);
//! Destructor
virtual ~HsaBlitManager() {
if (completion_signal_.handle != 0) {
hsa_signal_destroy(completion_signal_);
}
}
//! Creates HostBlitManager object
virtual bool create(amd::Device& device) {
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, NULL, &completion_signal_)) {
return false;
}
return true;
}
//! Copies a buffer object to system memory
virtual bool readBuffer(
device::Memory& srcMemory, //!< Source memory object
void* dstHost, //!< Destination host memory
const amd::Coord3D& origin, //!< Source origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies a buffer object to system memory
virtual bool readBufferRect(
device::Memory& srcMemory, //!< Source memory object
void* dstHost, //!< Destinaiton host memory
const amd::BufferRect& bufRect, //!< Source rectangle
const amd::BufferRect& hostRect, //!< Destination rectangle
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies an image object to system memory
virtual bool readImage(
device::Memory& srcMemory, //!< Source memory object
void* dstHost, //!< Destination host memory
const amd::Coord3D& origin, //!< Source origin
const amd::Coord3D& size, //!< Size of the copy region
size_t rowPitch, //!< Row pitch for host memory
size_t slicePitch, //!< Slice pitch for host memory
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies system memory to a buffer object
virtual bool writeBuffer(
const void* srcHost, //!< Source host memory
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies system memory to a buffer object
virtual bool writeBufferRect(
const void* srcHost, //!< Source host memory
device::Memory& dstMemory, //!< Destination memory object
const amd::BufferRect& hostRect, //!< Destination rectangle
const amd::BufferRect& bufRect, //!< Source rectangle
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies system memory to an image object
virtual bool writeImage(
const void* srcHost, //!< Source host memory
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
size_t rowPitch, //!< Row pitch for host memory
size_t slicePitch, //!< Slice pitch for host memory
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies a buffer object to another buffer object
virtual bool copyBuffer(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies a buffer object to another buffer object
virtual bool copyBufferRect(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::BufferRect& srcRect, //!< Source rectangle
const amd::BufferRect& dstRect, //!< Destination rectangle
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies an image object to a buffer object
virtual bool copyImageToBuffer(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false, //!< Entire buffer will be updated
size_t rowPitch = 0, //!< Pitch for buffer
size_t slicePitch = 0 //!< Slice for buffer
) const;
//! Copies a buffer object to an image object
virtual bool copyBufferToImage(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false, //!< Entire buffer will be updated
size_t rowPitch = 0, //!< Pitch for buffer
size_t slicePitch = 0 //!< Slice for buffer
) const;
//! Copies an image object to another image object
virtual bool copyImage(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Fills a buffer memory with a pattern data
virtual bool fillBuffer(
device::Memory& memory, //!< Memory object to fill with pattern
const void* pattern, //!< Pattern data
size_t patternSize, //!< Pattern size
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Fills an image memory with a pattern data
virtual bool fillImage(
device::Memory& dstMemory, //!< Memory object to fill with pattern
const void* pattern, //!< Pattern data
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
protected:
//! Returns the virtual GPU object
VirtualGPU& gpu() const { return static_cast<VirtualGPU&>(vDev_); }
private:
//! Handle of Hsa Device object
const roc::Device& roc_device_;
hsa_signal_t completion_signal_;
//! Assits in transferring data from Host to Local or vice versa
//! taking into account the Hsail profile supported by Hsa Agent
bool hsaCopy(
const void *hostSrc, //!< Contains source data to be copied
void *hostDst, //!< Destination buffer address for copying
uint32_t size, //!< Size of data to copy in bytes
bool hostToDev //!< True if data is copied from Host To Device
) const;
//! Disable copy constructor
HsaBlitManager(const HsaBlitManager&);
//! Disable operator=
HsaBlitManager& operator=(const HsaBlitManager&);
};
//! Kernel Blit Manager
//class KernelBlitManager : public HsaBlitManager
class KernelBlitManager : public HsaBlitManager
{
private:
VirtualGPU& gpu() const { return static_cast<VirtualGPU&>(vDev_); }
public:
enum {
BlitCopyImage = 0,
BlitCopyImage1DA,
BlitCopyImageToBuffer,
BlitCopyBufferToImage,
BlitCopyBufferRect,
BlitCopyBufferRectAligned,
BlitCopyBuffer,
BlitCopyBufferAligned,
FillBuffer,
FillImage,
BlitTotal
};
//! Constructor
KernelBlitManager(
device::VirtualDevice& vdev, //!< Virtual GPU to be used for blits
Setup setup = Setup() //!< Specifies HW accelerated blits
DmaBlitManager(
VirtualGPU& gpu, //!< Virtual GPU to be used for blits
Setup setup = Setup() //!< Specifies HW accelerated blits
);
//! Destructor
virtual ~KernelBlitManager();
virtual ~DmaBlitManager() {
if (completion_signal_.handle != 0) {
hsa_signal_destroy(completion_signal_);
}
}
//! Creates HostBlitManager object
virtual bool create(amd::Device& device);
//! Creates DmaBlitManager object
//!
//! Creates the HSA signal stored in completion_signal_ (the destructor
//! destroys it when its handle is non-zero).
//! \param device Unused by this implementation
//! \return true on success, false if hsa_signal_create() fails
virtual bool create(amd::Device& device) {
    if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, NULL, &completion_signal_)) {
        // Report the failure to the caller; a bare "false;" statement here
        // would be discarded and the function would report success.
        return false;
    }
    return true;
}
//! Copies a buffer object to system memory
virtual bool readBuffer(
@@ -352,6 +161,230 @@ public:
bool entire = false //!< Entire buffer will be updated
) const;
protected:
const static uint MaxPinnedBuffers = 4;
//! Synchronizes the blit operations if necessary
inline void synchronize() const;
//! Returns the virtual GPU object
VirtualGPU& gpu() const { return static_cast<VirtualGPU&>(vDev_); }
//! Returns the ROC device object
const Device& dev() const { return static_cast<const Device&>(dev_); };
inline Memory& gpuMem(device::Memory& mem) const;
//! Pins host memory for GPU access
amd::Memory* pinHostMemory(
const void* hostMem, //!< Host memory pointer
size_t pinSize, //!< Host memory size
size_t& partial //!< Extra offset for memory alignment
) const;
//! Assists in transferring data from Host to Local or vice versa
//! taking into account the Hsail profile supported by Hsa Agent
bool hsaCopy(
const Memory& srcMemory,
const Memory& dstMemory,
const amd::Coord3D& srcOrigin,
const amd::Coord3D& dstOrigin,
const amd::Coord3D& size,
bool enableCopyRect = false,
bool flushDMA = true) const;
const size_t MinSizeForPinnedTransfer;
bool completeOperation_; //!< DMA blit manager must complete operation
amd::Context* context_; //!< A dummy context
private:
//! Disable copy constructor
DmaBlitManager(const DmaBlitManager&);
//! Disable operator=
DmaBlitManager& operator=(const DmaBlitManager&);
//! Reads video memory, using a staged buffer
bool readMemoryStaged(
Memory& srcMemory, //!< Source memory object
void* dstHost, //!< Destination host memory
Memory& xferBuf, //!< Staged buffer for read
size_t origin, //!< Original offset in the source memory
size_t& offset, //!< Offset for the current copy pointer
size_t& totalSize, //!< Total size for copy region
size_t xferSize //!< Transfer size
) const;
//! Write into video memory, using a staged buffer
bool writeMemoryStaged(
const void* srcHost, //!< Source host memory
Memory& dstMemory, //!< Destination memory object
Memory& xferBuf, //!< Staged buffer for write
size_t origin, //!< Original offset in the destination memory
size_t& offset, //!< Offset for the current copy pointer
size_t& totalSize, //!< Total size for the copy region
size_t xferSize //!< Transfer size
) const;
//! HSA completion signal (created in create(), destroyed in the destructor)
hsa_signal_t completion_signal_;
//! Assists in transferring data from Host to Local or vice versa
//! taking into account the Hsail profile supported by Hsa Agent
bool hsaCopyStaged(
const_address hostSrc, //!< Contains source data to be copied
address hostDst, //!< Destination buffer address for copying
size_t size, //!< Size of data to copy in bytes
address staging, //!< Staging resource
bool hostToDev //!< True if data is copied from Host To Device
) const;
};
//! Kernel Blit Manager
class KernelBlitManager : public DmaBlitManager
{
public:
enum {
BlitCopyImage = 0,
BlitCopyImage1DA,
BlitCopyImageToBuffer,
BlitCopyBufferToImage,
BlitCopyBufferRect,
BlitCopyBufferRectAligned,
BlitCopyBuffer,
BlitCopyBufferAligned,
FillBuffer,
FillImage,
BlitTotal
};
//! Constructor
KernelBlitManager(
VirtualGPU& gpu, //!< Virtual GPU to be used for blits
Setup setup = Setup() //!< Specifies HW accelerated blits
);
//! Destructor
virtual ~KernelBlitManager();
//! Creates KernelBlitManager object
virtual bool create(amd::Device& device);
//! Copies a buffer object to another buffer object
virtual bool copyBufferRect(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::BufferRect& srcRectIn, //!< Source rectangle
const amd::BufferRect& dstRectIn, //!< Destination rectangle
const amd::Coord3D& sizeIn, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies a buffer object to system memory
virtual bool readBuffer(
device::Memory& srcMemory, //!< Source memory object
void* dstHost, //!< Destination host memory
const amd::Coord3D& origin, //!< Source origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies a buffer object to system memory
virtual bool readBufferRect(
device::Memory& srcMemory, //!< Source memory object
void* dstHost, //!< Destinaiton host memory
const amd::BufferRect& bufRect, //!< Source rectangle
const amd::BufferRect& hostRect, //!< Destination rectangle
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies system memory to a buffer object
virtual bool writeBuffer(
const void* srcHost, //!< Source host memory
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies system memory to a buffer object
virtual bool writeBufferRect(
const void* srcHost, //!< Source host memory
device::Memory& dstMemory, //!< Destination memory object
const amd::BufferRect& hostRect, //!< Destination rectangle
const amd::BufferRect& bufRect, //!< Source rectangle
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies a buffer object to another buffer object
virtual bool copyBuffer(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies a buffer object to an image object
virtual bool copyBufferToImage(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false, //!< Entire buffer will be updated
size_t rowPitch = 0, //!< Pitch for buffer
size_t slicePitch = 0 //!< Slice for buffer
) const;
//! Copies an image object to a buffer object
virtual bool copyImageToBuffer(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false, //!< Entire buffer will be updated
size_t rowPitch = 0, //!< Pitch for buffer
size_t slicePitch = 0 //!< Slice for buffer
) const;
//! Copies an image object to another image object
virtual bool copyImage(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies an image object to system memory
virtual bool readImage(
device::Memory& srcMemory, //!< Source memory object
void* dstHost, //!< Destination host memory
const amd::Coord3D& origin, //!< Source origin
const amd::Coord3D& size, //!< Size of the copy region
size_t rowPitch, //!< Row pitch for host memory
size_t slicePitch, //!< Slice pitch for host memory
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies system memory to an image object
virtual bool writeImage(
const void* srcHost, //!< Source host memory
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
size_t rowPitch, //!< Row pitch for host memory
size_t slicePitch, //!< Slice pitch for host memory
bool entire = false //!< Entire buffer will be updated
) const;
//! Fills a buffer memory with a pattern data
virtual bool fillBuffer(
device::Memory& memory, //!< Memory object to fill with pattern
@@ -372,26 +405,56 @@ public:
) const;
private:
//! Disable copy constructor
KernelBlitManager(const KernelBlitManager&);
static const size_t MaxXferBuffers = 2;
static const uint TransferSplitSize = 1;
//! Disable operator=
KernelBlitManager& operator=(const KernelBlitManager&);
//! Copies a buffer object to an image object
bool copyBufferToImageKernel(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false, //!< Entire buffer will be updated
size_t rowPitch = 0, //!< Pitch for buffer
size_t slicePitch = 0 //!< Slice for buffer
) const;
//! Copies an image object to a buffer object
bool copyImageToBufferKernel(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false, //!< Entire buffer will be updated
size_t rowPitch = 0, //!< Pitch for buffer
size_t slicePitch = 0 //!< Slice for buffer
) const;
//! Creates a program for all blit operations
bool createProgram(
Device& device //!< Device object
);
amd::Image::Format filterFormat(amd::Image::Format oldFormat) const;
//! Creates a view memory object
Memory* createView(
const Memory& parent, //!< Parent memory object
const cl_image_format format //!< The new format for a view
) const;
device::Memory *createImageView(
device::Memory &parent,
amd::Image::Format newFormat) const;
//! Disable copy constructor
KernelBlitManager(const KernelBlitManager&);
amd::Context *context_; //!< A dummy context
amd::Program *program_; //!< GPU program obejct
amd::Kernel *kernels_[BlitTotal]; //!< GPU kernels for blit
//! Disable operator=
KernelBlitManager& operator=(const KernelBlitManager&);
amd::Program* program_; //!< GPU program obejct
amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit
amd::Memory* constantBuffer_; //!< An internal CB for blits
amd::Memory* xferBuffers_[MaxXferBuffers]; //!< Transfer buffers for images
size_t xferBufferSize_; //!< Transfer buffer size
amd::Monitor* lockXferOps_; //!< Lock transfer operation
};
static const char* BlitName[KernelBlitManager::BlitTotal] = {
@@ -404,9 +467,8 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = {
"copyBuffer",
"copyBufferAligned",
"fillBuffer",
"fillImage"
"fillImage",
};
/*@}*/
} // namespace roc
/*@}*/} // namespace roc
@@ -4,6 +4,9 @@
namespace roc {
//! Alignment restriction for the pinned memory
const static size_t PinnedMemoryAlignment = 4 * Ki;
typedef uint HsaDeviceId;
struct AMDDeviceInfo {
@@ -164,9 +164,7 @@ bool NullDevice::create(const AMDDeviceInfo& deviceInfo) {
settings_ = new Settings();
roc::Settings* hsaSettings = static_cast<roc::Settings*>(settings_);
if ((hsaSettings == NULL) ||
// @Todo sramalin Use double precision from constsant
!hsaSettings->create((true) & 0x1)) {
if ((hsaSettings == NULL) || !hsaSettings->create(false)) {
LogError("Error creating settings for NULL HSA device");
return false;
}
@@ -189,6 +187,8 @@ Device::Device(hsa_agent_t bkendDevice)
, alloc_granularity_(0)
, context_(nullptr)
, xferQueue_(nullptr)
, xferRead_(nullptr)
, xferWrite_(nullptr)
, numOfVgpus_(0)
{
group_segment_.handle = 0;
@@ -208,6 +208,10 @@ Device::~Device()
delete mapCache_;
delete mapCacheOps_;
// Destroy temporary buffers for read/write
delete xferRead_;
delete xferWrite_;
// Destroy transfer queue
if (xferQueue_ && xferQueue_->terminate()) {
delete xferQueue_;
@@ -363,6 +367,85 @@ Device::loaderQueryHostAddress(const void* device, const void** host)
: HSA_STATUS_ERROR;
}
Device::XferBuffers::~XferBuffers()
{
    // Drain the free list front to back, destroying every transfer buffer
    // that is still parked on it.
    while (!freeBuffers_.empty()) {
        delete freeBuffers_.front();
        freeBuffers_.pop_front();
    }
}
bool
Device::XferBuffers::create()
{
Memory* xferBuf = nullptr;
bool result = false;
// Create a buffer object
xferBuf = new Buffer(dev(), bufSize_);
// Try to allocate memory for the transfer buffer
if ((nullptr == xferBuf) || !xferBuf->create()) {
delete xferBuf;
xferBuf = nullptr;
LogError("Couldn't allocate a transfer buffer!");
}
else {
result = true;
freeBuffers_.push_back(xferBuf);
}
return result;
}
Memory&
Device::XferBuffers::acquire()
{
    // Serialize all operations on the staged buffer list
    amd::ScopedLock l(lock_);

    Memory* buffer = nullptr;

    // Nothing free: try to grow the pool with a freshly allocated buffer
    if (freeBuffers_.empty()) {
        buffer = new Buffer(dev(), bufSize_);
        if ((buffer != nullptr) && buffer->create()) {
            ++acquiredCnt_;
            return *buffer;
        }
        delete buffer;
        LogError("Couldn't allocate a transfer buffer!");
    }

    // Hand out the buffer at the head of the free list.
    // NOTE(review): when the allocation above failed the list is still empty
    // at this point, so reading its head is undefined behaviour -- the
    // Memory& return type leaves no way to report the failure; confirm how
    // callers are expected to cope.
    buffer = freeBuffers_.front();
    freeBuffers_.pop_front();
    ++acquiredCnt_;

    return *buffer;
}
void
Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer)
{
    // The next acquire can come from a different queue, so the buffer should
    // not be busy on the current VirtualGPU. The wait is currently disabled:
    // buffer.wait(gpu);

    Memory* const returned = &buffer;

    // Put the buffer back on the free list under the list lock
    amd::ScopedLock l(lock_);
    freeBuffers_.push_back(returned);
    --acquiredCnt_;
}
bool Device::init()
{
#if defined(__linux__)
@@ -550,6 +633,28 @@ Device::create()
// Use just 1 entry by default for the map cache
mapCache_->push_back(NULL);
if (settings().stagedXferSize_ != 0) {
// Initialize staged write buffers
if (settings().stagedXferWrite_) {
xferWrite_ = new XferBuffers(*this,
amd::alignUp(settings().stagedXferSize_, 4 * Ki));
if ((xferWrite_ == nullptr) || !xferWrite_->create()) {
LogError("Couldn't allocate transfer buffer objects for read");
return false;
}
}
// Initialize staged read buffers
if (settings().stagedXferRead_) {
xferRead_ = new XferBuffers(*this,
amd::alignUp(settings().stagedXferSize_, 4 * Ki));
if ((xferRead_ == nullptr) || !xferRead_->create()) {
LogError("Couldn't allocate transfer buffer objects for write");
return false;
}
}
}
xferQueue();
return true;
@@ -568,11 +673,17 @@ Device::createProgram(amd::option::Options* options) {
bool
Device::mapHSADeviceToOpenCLDevice(hsa_agent_t dev)
{
if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice,
HSA_AGENT_INFO_PROFILE,
&agent_profile_)) {
return false;
}
// Create HSA settings
settings_ = new Settings();
roc::Settings* hsaSettings = static_cast<roc::Settings*>(settings_);
if ((hsaSettings == NULL) ||
!hsaSettings->create((true) & 0x1)) {
!hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL))) {
return false;
}
@@ -712,12 +823,6 @@ Device::populateOCLDeviceConstants()
::strcpy(info_.boardName_, device_name);
}
if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice,
HSA_AGENT_INFO_PROFILE,
&agent_profile_)) {
return false;
}
if (HSA_STATUS_SUCCESS !=
hsa_agent_get_info(
_bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
@@ -883,7 +988,7 @@ Device::populateOCLDeviceConstants()
if (agent_profile_ == HSA_PROFILE_FULL) { // full-profile = participating in coherent memory,
// base-profile = NUMA based non-coherent memory
info_.hostUnifiedMemory_ = CL_TRUE;
info_.hostUnifiedMemory_ = CL_TRUE;
}
info_.memBaseAddrAlign_ = 8 * (flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ?
sizeof(cl_long16) : MEMOBJ_BASE_ADDR_ALIGN);
@@ -1244,6 +1349,13 @@ Device::addMapTarget(amd::Memory* memory) const
return true;
}
Memory*
Device::getRocMemory(amd::Memory* mem) const
{
    // Look up the device memory object associated with this device, then
    // downcast it to the ROC backend type.
    auto* devMem = mem->getDeviceMemory(*this);
    return static_cast<roc::Memory*>(devMem);
}
device::Memory*
Device::createMemory(amd::Memory &owner) const
{
@@ -1302,9 +1414,9 @@ Device::createMemory(amd::Memory &owner) const
imageView->replaceDeviceMemory(this, devImageView);
result = xferMgr().writeImage(owner.getHostMem(), *devImageView,
amd::Coord3D(0), imageView->getRegion(),
imageView->getRowPitch(),
imageView->getSlicePitch(), true);
amd::Coord3D(0, 0, 0), imageView->getRegion(),
0,
0, true);
imageView->release();
}
@@ -219,6 +219,54 @@ private:
//! A HSA device ordinal (physical HSA device)
class Device : public NullDevice {
public:
//! Transfer buffers
//!
//! A pool of same-sized Memory objects kept on a free list. create() puts one
//! buffer on the list, acquire() removes the head of the list (allocating a
//! new buffer when the list is empty) and release() appends a buffer back.
//! acquire() and release() take lock_ around the list operations.
class XferBuffers : public amd::HeapObject
{
public:
// NOTE(review): not referenced by the XferBuffers methods visible in this
// change; presumably an intended cap on the free list -- confirm.
static const size_t MaxXferBufListSize = 8;
//! Constructor: records the device and the per-buffer size; no buffer is
//! allocated until create() or acquire() is called
XferBuffers(const Device& device, size_t bufSize)
: bufSize_(bufSize)
, acquiredCnt_(0)
, gpuDevice_(device)
{}
//! Destructor: deletes the buffers that are on the free list
~XferBuffers();
//! Creates the xfer buffers object (allocates the first buffer);
//! returns false if that allocation fails
bool create();
//! Acquires an instance of the transfer buffers
Memory& acquire();
//! Releases transfer buffer back to the free list
void release(
VirtualGPU& gpu, //!< Virtual GPU object used with the buffer
Memory& buffer //!< Transfer buffer for release
);
//! Returns the buffer's size for transfer
size_t bufSize() const { return bufSize_; }
private:
//! Disable copy constructor
XferBuffers(const XferBuffers&);
//! Disable assignment operator
XferBuffers& operator=(const XferBuffers&);
//! Get device object
const Device& dev() const { return gpuDevice_; }
size_t bufSize_; //!< Staged buffer size
std::list<Memory*> freeBuffers_; //!< The list of free buffers
amd::Atomic<uint> acquiredCnt_; //!< The total number of acquired buffers
amd::Monitor lock_; //!< Staged buffer acquire/release lock
const Device& gpuDevice_; //!< GPU device object
};
//! Initialise the whole HSA device subsystem (CAL init, device enumeration, etc).
static bool init();
static void tearDown();
@@ -354,6 +402,17 @@ public:
//! Adds a map target to the cache
bool addMapTarget(amd::Memory* memory) const;
//! Returns transfer buffer object
XferBuffers& xferWrite() const { return *xferWrite_; }
//! Returns transfer buffer object
XferBuffers& xferRead() const { return *xferRead_; }
//! Returns a ROC memory object from AMD memory object
roc::Memory* getRocMemory(
amd::Memory* mem //!< Pointer to AMD memory object
) const;
private:
static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
@@ -379,6 +438,9 @@ private:
VirtualGPU* xferQueue() const;
XferBuffers* xferRead_; //!< Transfer buffers read
XferBuffers* xferWrite_; //!< Transfer buffers write
public:
amd::Atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
}; // class roc::Device
@@ -25,10 +25,18 @@ namespace roc {
/////////////////////////////////roc::Memory//////////////////////////////
Memory::Memory(const roc::Device &dev, amd::Memory &owner)
: device::Memory(owner),
dev_(dev),
deviceMemory_(NULL),
kind_(MEMORY_KIND_NORMAL)
: device::Memory(owner)
, dev_(dev)
, deviceMemory_(NULL)
, kind_(MEMORY_KIND_NORMAL)
{
}
// Constructs a ROC memory object from a device and a size (no amd::Memory
// argument). The device pointer starts out null and the kind is
// MEMORY_KIND_NORMAL.
Memory::Memory(const roc::Device &dev, size_t size)
    : device::Memory(size), dev_(dev), deviceMemory_(nullptr), kind_(MEMORY_KIND_NORMAL)
{}
@@ -64,8 +72,8 @@ Memory::allocateMapMemory(size_t allocationSize)
roc::Memory* hsaMapMemory = reinterpret_cast<roc::Memory *>(
mapMemory->getDeviceMemory(dev_));
if (hsaMapMemory == nullptr) {
mapMemory->release();
return false;
mapMemory->release();
return false;
}
}
@@ -191,7 +199,7 @@ bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metada
return false;
#else
assert(owner()->isInterop() && "Object is not an interop object.");
mesa_glinterop_export_in in;
mesa_glinterop_export_out out;
@@ -213,7 +221,7 @@ bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metada
if(!dev_.mesa().Export(in, out))
return false;
size_t size;
hsa_agent_t agent=dev_.getBackendDevice();
hsa_status_t status=hsa_amd_interop_map_buffer(1, &agent, out.dmabuf_fd, 0, &size, &deviceMemory_, metadata_size, (const void**)metadata);
@@ -242,9 +250,18 @@ Buffer::Buffer(const roc::Device &dev, amd::Memory &owner)
: roc::Memory(dev, owner)
{}
// Constructs a buffer from a device and a size only, forwarding both to
// roc::Memory(dev, size).
// NOTE(review): presumably this leaves owner() null, which is the case
// Buffer::create() and ~Buffer() test for -- confirm in device::Memory(size).
Buffer::Buffer(const roc::Device &dev, size_t size)
: roc::Memory(dev, size)
{}
Buffer::~Buffer()
{
destroy();
if (owner() == nullptr) {
dev_.hostFree(deviceMemory_, size());
}
else {
destroy();
}
}
void
@@ -288,6 +305,15 @@ Buffer::destroy()
bool
Buffer::create()
{
if (owner() == nullptr) {
deviceMemory_ = dev_.hostAlloc(size(), 1, false);
if (deviceMemory_ != nullptr) {
flags_ |= HostMemoryDirectAccess;
return true;
}
return false;
}
//Interop buffer
if(owner()->isInterop())
return createInteropBuffer(GL_ARRAY_BUFFER, 0, NULL, NULL);
@@ -303,8 +329,7 @@ Buffer::create()
}
const size_t offset = owner()->getOrigin();
deviceMemory_ =
static_cast<char *>(parentBuffer->getDeviceMemory()) + offset;
deviceMemory_ = parentBuffer->getDeviceMemory() + offset;
flags_ |= SubMemoryObject;
flags_ |=
@@ -562,10 +587,10 @@ Image::createInteropImage()
{
auto obj=owner()->getInteropObj()->asGLObject();
assert(obj->getCLGLObjectType()!=CL_GL_OBJECT_BUFFER && "Non-image OpenGL object used with interop image API.");
const hsa_amd_image_descriptor_t* meta;
size_t size=0;
GLenum glTarget = obj->getGLTarget();
if (glTarget == GL_TEXTURE_CUBE_MAP) {
glTarget = obj->getCubemapFace();
@@ -593,13 +618,13 @@ Image::createInteropImage()
if (obj->getGLTarget()==GL_TEXTURE_CUBE_MAP)
desc.setFace(obj->getCubemapFace());
originalDeviceMemory_=deviceMemory_;
hsa_status_t err=hsa_amd_image_create(dev_.getBackendDevice(), &imageDescriptor_, amdImageDesc_, originalDeviceMemory_, permission_, &hsaImageObject_);
if(err!=HSA_STATUS_SUCCESS)
return false;
BufferGuard.Dismiss();
DescGuard.Dismiss();
return true;
@@ -672,13 +697,13 @@ Image::create()
}
bool
Image::createView(Memory &parent)
Image::createView(const Memory &parent)
{
deviceMemory_ = parent.getDeviceMemory();
originalDeviceMemory_ = (parent.owner()->asBuffer() != NULL)
? deviceMemory_
: static_cast<Image &>(parent).originalDeviceMemory_;
: static_cast<const Image&>(parent).originalDeviceMemory_;
kind_=parent.getKind();
@@ -18,10 +18,12 @@ class Memory : public device::Memory {
Memory(const roc::Device &dev, amd::Memory &owner);
Memory(const roc::Device &dev, size_t size);
virtual ~Memory();
// Getter for deviceMemory_.
void *getDeviceMemory() const { return deviceMemory_; }
// Getter for deviceMemory_
address getDeviceMemory() const { return reinterpret_cast<address>(deviceMemory_); }
// Gets a pointer to a region of host-visible memory for use as the target
// of an indirect map for a given memory object
@@ -41,7 +43,7 @@ class Memory : public device::Memory {
Unimplemented();
return true;
}
// Immediate blocking write from device cache to owner's backing store.
// Marks owner as "current" by resetting the last writer to NULL.
virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags())
@@ -112,6 +114,7 @@ class Memory : public device::Memory {
class Buffer : public roc::Memory {
public:
Buffer(const roc::Device &dev, amd::Memory &owner);
Buffer(const roc::Device &dev, size_t size);
virtual ~Buffer();
@@ -143,7 +146,7 @@ public:
virtual bool create();
//! Create an image view
bool createView(Memory &parent);
bool createView(const Memory &parent);
//! Gets a pointer to a region of host-visible memory for use as the target
//! of an indirect map for a given memory object
@@ -53,14 +53,38 @@ Settings::Settings()
enablePartialDispatch_ = (partialDispatch) ? false : true;
partialDispatch_ = (partialDispatch) ? false : true;
commandQueues_ = 100; //!< Field value set to maximum number
//!< concurrent Virtual GPUs for ROCm backend
//!< concurrent Virtual GPUs for ROCm backend
// Disable image DMA by default (ROCM runtime doesn't support it)
imageDMA_ = false;
stagedXferRead_ = true;
stagedXferWrite_ = true;
stagedXferSize_ = GPU_STAGING_BUFFER_SIZE * Ki;
// Initialize transfer buffer size to 1MB by default
xferBufSize_ = 1024 * Ki;
const static size_t MaxPinnedXferSize = 32;
pinnedXferSize_ = std::min(GPU_PINNED_XFER_SIZE, MaxPinnedXferSize) * Mi;
pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_);
}
bool
Settings::create(bool doublePrecision)
Settings::create(bool fullProfile)
{
customHostAllocator_ = true;
if (fullProfile) {
pinnedXferSize_ = 0;
stagedXferSize_ = 0;
xferBufSize_ = 0;
}
else {
pinnedXferSize_ = std::max(pinnedXferSize_, pinnedMinXferSize_);
stagedXferSize_ = std::max(stagedXferSize_, pinnedMinXferSize_ + 4 * Ki);
}
// Enable extensions
enableExtension(ClKhrByteAddressableStore);
enableExtension(ClKhrGlobalInt32BaseAtomics);
@@ -72,21 +96,16 @@ Settings::create(bool doublePrecision)
enableExtension(ClKhr3DImageWrites);
enableExtension(ClAmdMediaOps);
enableExtension(ClAmdMediaOps2);
if(MesaInterop::Supported())
enableExtension(ClKhrGlSharing);
// Make sure device supports doubles
doublePrecision_ &= doublePrecision;
if (doublePrecision_) {
// Enable KHR double precision extension
enableExtension(ClKhrFp64);
#if !defined(WITH_LIGHTNING_COMPILER)
// Also enable AMD double precision extension?
enableExtension(ClAmdFp64);
#endif // !defined(WITH_LIGHTNING_COMPILER)
if(MesaInterop::Supported()) {
enableExtension(ClKhrGlSharing);
}
// Enable KHR double precision extension
enableExtension(ClKhrFp64);
#if !defined(WITH_LIGHTNING_COMPILER)
// Also enable AMD double precision extension?
enableExtension(ClAmdFp64);
#endif // !defined(WITH_LIGHTNING_COMPILER)
enableExtension(ClKhrSubGroups);
enableExtension(ClKhrDepthImages);
@@ -109,6 +128,18 @@ Settings::override()
if (!flagIsDefault(GPU_MAX_COMMAND_QUEUES)) {
commandQueues_ = GPU_MAX_COMMAND_QUEUES;
}
// Apply the GPU_IMAGE_DMA flag to the image DMA setting when the flag was
// changed from its default. The value must land in imageDMA_; writing it to
// commandQueues_ would clobber the command queue limit set just above and
// leave the image DMA setting untouched.
if (!flagIsDefault(GPU_IMAGE_DMA)) {
    imageDMA_ = GPU_IMAGE_DMA;
}
if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) {
xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki;
}
if (!flagIsDefault(GPU_PINNED_MIN_XFER_SIZE)) {
pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_);
}
}
} // namespace roc
@@ -26,7 +26,10 @@ public:
uint enableImageHandle_: 1; //!< Use HSAIL image/sampler pointer
uint enableNCMode_: 1; //!< Enable Non Coherent mode for system memory
uint enablePartialDispatch_: 1; //!< Enable support for Partial Dispatch
uint reserved_: 26;
uint imageDMA_: 1; //!< Enable direct image DMA transfers
uint stagedXferRead_: 1; //!< Uses a staged buffer read
uint stagedXferWrite_: 1; //!< Uses a staged buffer write
uint reserved_: 22;
};
uint value_;
};
@@ -46,11 +49,16 @@ public:
uint kernargPoolSize_;
uint signalPoolSize_;
size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
size_t stagedXferSize_; //!< Staged buffer size
size_t pinnedXferSize_; //!< Pinned buffer size for transfer
size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
//! Default constructor
Settings();
//! Creates settings
bool create(bool doublePrecision);
bool create(bool fullProfile);
private:
//! Disable copy constructor
@@ -446,6 +446,9 @@ bool VirtualGPU::releaseGpuMemoryFence() {
hasPendingDispatch_ = false;
// Release all transfer buffers on this command queue
releaseXferWrite();
// Release all memory dependencies
memoryDependency().clear();
@@ -1774,8 +1777,66 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd)
profilingEnd(vcmd);
}
void VirtualGPU::flush(amd::Command *list, bool wait) {
void VirtualGPU::flush(amd::Command *list, bool wait)
{
releaseGpuMemoryFence();
updateCommandsState(list);
// Release all pinned memory
releasePinnedMem();
}
//! Appends a staged write buffer to the deferred-release list.
//! When the list already holds more than seven entries, the oldest entry is
//! first released through dev().xferWrite() and dropped from the list.
void
VirtualGPU::addXferWrite(Memory& memory)
{
    const bool listIsFull = !(xferWriteBuffers_.size() <= 7);
    if (listIsFull) {
        auto oldest = xferWriteBuffers_.begin();
        dev().xferWrite().release(*this, **oldest);
        xferWriteBuffers_.erase(oldest);
    }

    // Keep the buffer around; it is released later rather than right away
    xferWriteBuffers_.push_back(&memory);
}
void
VirtualGPU::releaseXferWrite()
{
for (auto& memory : xferWriteBuffers_) {
dev().xferWrite().release(*this, *memory);
}
xferWriteBuffers_.resize(0);
}
//! Adds a pinned memory object to the pinned list unless an entry with the
//! same host pointer and a sufficient size is already present.
//! When the list already holds more than seven entries, the oldest entry is
//! released and removed before the new one is appended.
void
VirtualGPU::addPinnedMem(amd::Memory* mem)
{
    // A matching entry is already in the list: nothing to add
    if (findPinnedMem(mem->getHostMem(), mem->getSize()) != nullptr) {
        return;
    }

    if (pinnedMems_.size() > 7) {
        auto oldest = pinnedMems_.begin();
        (*oldest)->release();
        pinnedMems_.erase(oldest);
    }

    // Keep the object around; it is released later rather than right away
    pinnedMems_.push_back(mem);
}
//! Calls release() on every memory object in the pinned list and then
//! empties the list.
void
VirtualGPU::releasePinnedMem()
{
    for (auto it = pinnedMems_.begin(); it != pinnedMems_.end(); ++it) {
        (*it)->release();
    }
    pinnedMems_.clear();
}
//! Searches the pinned list for an entry whose host pointer equals \p addr
//! and whose size is at least \p size.
//! Returns the first such entry in list order, or nullptr when none matches.
amd::Memory*
VirtualGPU::findPinnedMem(void* addr, size_t size)
{
    for (size_t idx = 0; idx < pinnedMems_.size(); ++idx) {
        amd::Memory* candidate = pinnedMems_[idx];
        if (candidate->getHostMem() != addr) {
            continue;
        }
        if (!(size <= candidate->getSize())) {
            continue;
        }
        return candidate;
    }
    return nullptr;
}
} // End of roc namespace
@@ -149,7 +149,7 @@ public:
void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd);
void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd);
void submitPerfCounter(amd::PerfCounterCommand& cmd){};
void flush(amd::Command* list = NULL, bool wait = false);
void submitFillMemory(amd::FillMemoryCommand& cmd);
void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
@@ -193,10 +193,24 @@ public:
bool processMemObjects(
const amd::Kernel& kernel, //!< AMD kernel object for execution
const_address params //!< Pointer to the param's store
);
);
//Return the virtual gpu unique index
uint index() const { return index_; }
//! Adds a stage write buffer into a list
void addXferWrite(Memory& memory);
//! Releases stage write buffers
void releaseXferWrite();
//! Adds a pinned memory object into a list
void addPinnedMem(amd::Memory* mem);
//! Release pinned memory objects
void releasePinnedMem();
//! Finds if pinned memory is cached
amd::Memory* findPinnedMem(void* addr, size_t size);
// } roc OpenCL integration
private:
@@ -219,6 +233,9 @@ private:
//! Updates AQL header for the upcoming dispatch
void setAqlHeader(uint16_t header) { aqlHeader_ = header; }
std::vector<Memory*> xferWriteBuffers_; //!< Stage write buffers
std::vector<amd::Memory*> pinnedMems_; //!< Pinned memory list
/**
* @brief Maintains the list of sampler allocated for one or more kernel
* submissions.
@@ -231,16 +248,16 @@ private:
*/
bool hasPendingDispatch_;
Timestamp* timestamp_;
hsa_agent_t gpu_device_; //!< Physical device
hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu
hsa_agent_t gpu_device_; //!< Physical device
hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu
hsa_barrier_and_packet_t barrier_packet_;
hsa_signal_t barrier_signal_;
uint32_t dispatch_id_; //!< This variable must be updated atomically.
Device& roc_device_; //!< roc device object
uint32_t dispatch_id_; //!< This variable must be updated atomically.
Device& roc_device_; //!< roc device object
void * tools_lib_;
PrintfDbg* printfdbg_;
MemoryDependency memoryDependency_; //!< Memory dependency class
uint16_t aqlHeader_; //!< AQL header for dispatch
uint16_t aqlHeader_; //!< AQL header for dispatch
char* kernarg_pool_base_;
size_t kernarg_pool_size_;