P4 to Git Change 1364923 by gandryey@gera-lnx-rcf-lc on 2017/01/23 11:48:45
SWDEV-110996 - OCL to use the blit manager instead of ROCr implementing the copyRect API
- Implement the blit manager functionality in the ROCm backend. This check-in also fixes SWDEV-95079, SWDEV-95068, SWDEV-95069, SWDEV-95071
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdefs.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#35 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.hpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.hpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#6 edit
[ROCm/clr commit: 454621b7f1]
This commit is contained in:
File diff ditekan karena terlalu besar
Load Diff
@@ -1,5 +1,5 @@
|
||||
//
|
||||
// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#pragma once
|
||||
|
||||
@@ -8,12 +8,13 @@
|
||||
#include "platform/commandqueue.hpp"
|
||||
#include "device/device.hpp"
|
||||
#include "device/blit.hpp"
|
||||
#include "device/rocm/rocdefs.hpp"
|
||||
|
||||
/*! \addtogroup HSA Blit Implementation
|
||||
/*! \addtogroup ROC Blit Implementation
|
||||
* @{
|
||||
*/
|
||||
|
||||
//! HSA Blit Manager Implementation
|
||||
//! ROC Blit Manager Implementation
|
||||
namespace roc {
|
||||
|
||||
class Device;
|
||||
@@ -22,221 +23,29 @@ class Memory;
|
||||
class VirtualGPU;
|
||||
|
||||
//! DMA Blit Manager
|
||||
class HsaBlitManager : public device::HostBlitManager
|
||||
class DmaBlitManager : public device::HostBlitManager
|
||||
{
|
||||
public:
|
||||
//! Constructor
|
||||
HsaBlitManager(
|
||||
device::VirtualDevice& vdev, //!< Virtual GPU to be used for blits
|
||||
Setup setup = Setup() //!< Specifies HW accelerated blits
|
||||
);
|
||||
|
||||
//! Destructor
|
||||
virtual ~HsaBlitManager() {
|
||||
if (completion_signal_.handle != 0) {
|
||||
hsa_signal_destroy(completion_signal_);
|
||||
}
|
||||
}
|
||||
|
||||
//! Creates HostBlitManager object
|
||||
virtual bool create(amd::Device& device) {
|
||||
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, NULL, &completion_signal_)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
//! Copies a buffer object to system memory
|
||||
virtual bool readBuffer(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
void* dstHost, //!< Destination host memory
|
||||
const amd::Coord3D& origin, //!< Source origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies a buffer object to system memory
|
||||
virtual bool readBufferRect(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
void* dstHost, //!< Destinaiton host memory
|
||||
const amd::BufferRect& bufRect, //!< Source rectangle
|
||||
const amd::BufferRect& hostRect, //!< Destination rectangle
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies an image object to system memory
|
||||
virtual bool readImage(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
void* dstHost, //!< Destination host memory
|
||||
const amd::Coord3D& origin, //!< Source origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
size_t rowPitch, //!< Row pitch for host memory
|
||||
size_t slicePitch, //!< Slice pitch for host memory
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies system memory to a buffer object
|
||||
virtual bool writeBuffer(
|
||||
const void* srcHost, //!< Source host memory
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& origin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies system memory to a buffer object
|
||||
virtual bool writeBufferRect(
|
||||
const void* srcHost, //!< Source host memory
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::BufferRect& hostRect, //!< Destination rectangle
|
||||
const amd::BufferRect& bufRect, //!< Source rectangle
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies system memory to an image object
|
||||
virtual bool writeImage(
|
||||
const void* srcHost, //!< Source host memory
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& origin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
size_t rowPitch, //!< Row pitch for host memory
|
||||
size_t slicePitch, //!< Slice pitch for host memory
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies a buffer object to another buffer object
|
||||
virtual bool copyBuffer(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies a buffer object to another buffer object
|
||||
virtual bool copyBufferRect(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::BufferRect& srcRect, //!< Source rectangle
|
||||
const amd::BufferRect& dstRect, //!< Destination rectangle
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies an image object to a buffer object
|
||||
virtual bool copyImageToBuffer(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
size_t rowPitch = 0, //!< Pitch for buffer
|
||||
size_t slicePitch = 0 //!< Slice for buffer
|
||||
) const;
|
||||
|
||||
//! Copies a buffer object to an image object
|
||||
virtual bool copyBufferToImage(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
size_t rowPitch = 0, //!< Pitch for buffer
|
||||
size_t slicePitch = 0 //!< Slice for buffer
|
||||
) const;
|
||||
|
||||
//! Copies an image object to another image object
|
||||
virtual bool copyImage(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Fills a buffer memory with a pattern data
|
||||
virtual bool fillBuffer(
|
||||
device::Memory& memory, //!< Memory object to fill with pattern
|
||||
const void* pattern, //!< Pattern data
|
||||
size_t patternSize, //!< Pattern size
|
||||
const amd::Coord3D& origin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Fills an image memory with a pattern data
|
||||
virtual bool fillImage(
|
||||
device::Memory& dstMemory, //!< Memory object to fill with pattern
|
||||
const void* pattern, //!< Pattern data
|
||||
const amd::Coord3D& origin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
protected:
|
||||
//! Returns the virtual GPU object
|
||||
VirtualGPU& gpu() const { return static_cast<VirtualGPU&>(vDev_); }
|
||||
|
||||
private:
|
||||
//! Handle of Hsa Device object
|
||||
const roc::Device& roc_device_;
|
||||
|
||||
hsa_signal_t completion_signal_;
|
||||
|
||||
//! Assits in transferring data from Host to Local or vice versa
|
||||
//! taking into account the Hsail profile supported by Hsa Agent
|
||||
bool hsaCopy(
|
||||
const void *hostSrc, //!< Contains source data to be copied
|
||||
void *hostDst, //!< Destination buffer address for copying
|
||||
uint32_t size, //!< Size of data to copy in bytes
|
||||
bool hostToDev //!< True if data is copied from Host To Device
|
||||
) const;
|
||||
|
||||
//! Disable copy constructor
|
||||
HsaBlitManager(const HsaBlitManager&);
|
||||
|
||||
//! Disable operator=
|
||||
HsaBlitManager& operator=(const HsaBlitManager&);
|
||||
};
|
||||
|
||||
//! Kernel Blit Manager
|
||||
//class KernelBlitManager : public HsaBlitManager
|
||||
class KernelBlitManager : public HsaBlitManager
|
||||
{
|
||||
private:
|
||||
VirtualGPU& gpu() const { return static_cast<VirtualGPU&>(vDev_); }
|
||||
public:
|
||||
enum {
|
||||
BlitCopyImage = 0,
|
||||
BlitCopyImage1DA,
|
||||
BlitCopyImageToBuffer,
|
||||
BlitCopyBufferToImage,
|
||||
BlitCopyBufferRect,
|
||||
BlitCopyBufferRectAligned,
|
||||
BlitCopyBuffer,
|
||||
BlitCopyBufferAligned,
|
||||
FillBuffer,
|
||||
FillImage,
|
||||
BlitTotal
|
||||
};
|
||||
|
||||
//! Constructor
|
||||
KernelBlitManager(
|
||||
device::VirtualDevice& vdev, //!< Virtual GPU to be used for blits
|
||||
Setup setup = Setup() //!< Specifies HW accelerated blits
|
||||
DmaBlitManager(
|
||||
VirtualGPU& gpu, //!< Virtual GPU to be used for blits
|
||||
Setup setup = Setup() //!< Specifies HW accelerated blits
|
||||
);
|
||||
|
||||
//! Destructor
|
||||
virtual ~KernelBlitManager();
|
||||
//! Destroys the completion signal when its handle is non-zero
virtual ~DmaBlitManager() {
    const bool hasSignal = (completion_signal_.handle != 0);
    if (hasSignal) {
        hsa_signal_destroy(completion_signal_);
    }
}
|
||||
|
||||
//! Creates HostBlitManager object
|
||||
virtual bool create(amd::Device& device);
|
||||
//! Creates DmaBlitManager object
|
||||
virtual bool create(amd::Device& device) {
|
||||
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, NULL, &completion_signal_)) {
|
||||
false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
//! Copies a buffer object to system memory
|
||||
virtual bool readBuffer(
|
||||
@@ -352,6 +161,230 @@ public:
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
protected:
|
||||
const static uint MaxPinnedBuffers = 4;
|
||||
|
||||
//! Synchronizes the blit operations if necessary
|
||||
inline void synchronize() const;
|
||||
|
||||
//! Returns the virtual GPU object
|
||||
VirtualGPU& gpu() const { return static_cast<VirtualGPU&>(vDev_); }
|
||||
|
||||
//! Returns the ROC device object
|
||||
const Device& dev() const { return static_cast<const Device&>(dev_); };
|
||||
|
||||
inline Memory& gpuMem(device::Memory& mem) const;
|
||||
|
||||
//! Pins host memory for GPU access
|
||||
amd::Memory* pinHostMemory(
|
||||
const void* hostMem, //!< Host memory pointer
|
||||
size_t pinSize, //!< Host memory size
|
||||
size_t& partial //!< Extra offset for memory alignment
|
||||
) const;
|
||||
|
||||
//! Assists in transferring data from Host to Local or vice versa
|
||||
//! taking into account the Hsail profile supported by Hsa Agent
|
||||
bool hsaCopy(
|
||||
const Memory& srcMemory,
|
||||
const Memory& dstMemory,
|
||||
const amd::Coord3D& srcOrigin,
|
||||
const amd::Coord3D& dstOrigin,
|
||||
const amd::Coord3D& size,
|
||||
bool enableCopyRect = false,
|
||||
bool flushDMA = true) const;
|
||||
|
||||
const size_t MinSizeForPinnedTransfer;
|
||||
bool completeOperation_; //!< DMA blit manager must complete operation
|
||||
amd::Context* context_; //!< A dummy context
|
||||
|
||||
private:
|
||||
|
||||
//! Disable copy constructor
|
||||
DmaBlitManager(const DmaBlitManager&);
|
||||
|
||||
//! Disable operator=
|
||||
DmaBlitManager& operator=(const DmaBlitManager&);
|
||||
|
||||
//! Reads video memory, using a staged buffer
|
||||
bool readMemoryStaged(
|
||||
Memory& srcMemory, //!< Source memory object
|
||||
void* dstHost, //!< Destination host memory
|
||||
Memory& xferBuf, //!< Staged buffer for read
|
||||
size_t origin, //!< Original offset in the source memory
|
||||
size_t& offset, //!< Offset for the current copy pointer
|
||||
size_t& totalSize, //!< Total size for copy region
|
||||
size_t xferSize //!< Transfer size
|
||||
) const;
|
||||
|
||||
//! Write into video memory, using a staged buffer
|
||||
bool writeMemoryStaged(
|
||||
const void* srcHost, //!< Source host memory
|
||||
Memory& dstMemory, //!< Destination memory object
|
||||
Memory& xferBuf, //!< Staged buffer for write
|
||||
size_t origin, //!< Original offset in the destination memory
|
||||
size_t& offset, //!< Offset for the current copy pointer
|
||||
size_t& totalSize, //!< Total size for the copy region
|
||||
size_t xferSize //!< Transfer size
|
||||
) const;
|
||||
|
||||
//! Handle of ROC Device object
|
||||
hsa_signal_t completion_signal_;
|
||||
|
||||
//! Assists in transferring data from Host to Local or vice versa
|
||||
//! taking into account the Hsail profile supported by Hsa Agent
|
||||
bool hsaCopyStaged(
|
||||
const_address hostSrc, //!< Contains source data to be copied
|
||||
address hostDst, //!< Destination buffer address for copying
|
||||
size_t size, //!< Size of data to copy in bytes
|
||||
address staging, //!< Staging resource
|
||||
bool hostToDev //!< True if data is copied from Host To Device
|
||||
) const;
|
||||
};
|
||||
|
||||
//! Kernel Blit Manager
|
||||
class KernelBlitManager : public DmaBlitManager
|
||||
{
|
||||
public:
|
||||
enum {
|
||||
BlitCopyImage = 0,
|
||||
BlitCopyImage1DA,
|
||||
BlitCopyImageToBuffer,
|
||||
BlitCopyBufferToImage,
|
||||
BlitCopyBufferRect,
|
||||
BlitCopyBufferRectAligned,
|
||||
BlitCopyBuffer,
|
||||
BlitCopyBufferAligned,
|
||||
FillBuffer,
|
||||
FillImage,
|
||||
BlitTotal
|
||||
};
|
||||
|
||||
//! Constructor
|
||||
KernelBlitManager(
|
||||
VirtualGPU& gpu, //!< Virtual GPU to be used for blits
|
||||
Setup setup = Setup() //!< Specifies HW accelerated blits
|
||||
);
|
||||
|
||||
//! Destructor
|
||||
virtual ~KernelBlitManager();
|
||||
|
||||
//! Creates DmaBlitManager object
|
||||
virtual bool create(amd::Device& device);
|
||||
|
||||
//! Copies a buffer object to another buffer object
|
||||
virtual bool copyBufferRect(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::BufferRect& srcRectIn, //!< Source rectangle
|
||||
const amd::BufferRect& dstRectIn, //!< Destination rectangle
|
||||
const amd::Coord3D& sizeIn, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies a buffer object to system memory
|
||||
virtual bool readBuffer(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
void* dstHost, //!< Destination host memory
|
||||
const amd::Coord3D& origin, //!< Source origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies a buffer object to system memory
|
||||
virtual bool readBufferRect(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
void* dstHost, //!< Destinaiton host memory
|
||||
const amd::BufferRect& bufRect, //!< Source rectangle
|
||||
const amd::BufferRect& hostRect, //!< Destination rectangle
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies system memory to a buffer object
|
||||
virtual bool writeBuffer(
|
||||
const void* srcHost, //!< Source host memory
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& origin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies system memory to a buffer object
|
||||
virtual bool writeBufferRect(
|
||||
const void* srcHost, //!< Source host memory
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::BufferRect& hostRect, //!< Destination rectangle
|
||||
const amd::BufferRect& bufRect, //!< Source rectangle
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies a buffer object to an image object
|
||||
virtual bool copyBuffer(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies a buffer object to an image object
|
||||
virtual bool copyBufferToImage(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
size_t rowPitch = 0, //!< Pitch for buffer
|
||||
size_t slicePitch = 0 //!< Slice for buffer
|
||||
) const;
|
||||
|
||||
//! Copies an image object to a buffer object
|
||||
virtual bool copyImageToBuffer(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
size_t rowPitch = 0, //!< Pitch for buffer
|
||||
size_t slicePitch = 0 //!< Slice for buffer
|
||||
) const;
|
||||
|
||||
//! Copies an image object to another image object
|
||||
virtual bool copyImage(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies an image object to system memory
|
||||
virtual bool readImage(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
void* dstHost, //!< Destination host memory
|
||||
const amd::Coord3D& origin, //!< Source origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
size_t rowPitch, //!< Row pitch for host memory
|
||||
size_t slicePitch, //!< Slice pitch for host memory
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies system memory to an image object
|
||||
virtual bool writeImage(
|
||||
const void* srcHost, //!< Source host memory
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& origin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
size_t rowPitch, //!< Row pitch for host memory
|
||||
size_t slicePitch, //!< Slice pitch for host memory
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Fills a buffer memory with a pattern data
|
||||
virtual bool fillBuffer(
|
||||
device::Memory& memory, //!< Memory object to fill with pattern
|
||||
@@ -372,26 +405,56 @@ public:
|
||||
) const;
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
KernelBlitManager(const KernelBlitManager&);
|
||||
static const size_t MaxXferBuffers = 2;
|
||||
static const uint TransferSplitSize = 1;
|
||||
|
||||
//! Disable operator=
|
||||
KernelBlitManager& operator=(const KernelBlitManager&);
|
||||
//! Copies a buffer object to an image object
|
||||
bool copyBufferToImageKernel(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
size_t rowPitch = 0, //!< Pitch for buffer
|
||||
size_t slicePitch = 0 //!< Slice for buffer
|
||||
) const;
|
||||
|
||||
//! Copies an image object to a buffer object
|
||||
bool copyImageToBufferKernel(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
size_t rowPitch = 0, //!< Pitch for buffer
|
||||
size_t slicePitch = 0 //!< Slice for buffer
|
||||
) const;
|
||||
|
||||
//! Creates a program for all blit operations
|
||||
bool createProgram(
|
||||
Device& device //!< Device object
|
||||
);
|
||||
|
||||
amd::Image::Format filterFormat(amd::Image::Format oldFormat) const;
|
||||
//! Creates a view memory object
|
||||
Memory* createView(
|
||||
const Memory& parent, //!< Parent memory object
|
||||
const cl_image_format format //!< The new format for a view
|
||||
) const;
|
||||
|
||||
device::Memory *createImageView(
|
||||
device::Memory &parent,
|
||||
amd::Image::Format newFormat) const;
|
||||
//! Disable copy constructor
|
||||
KernelBlitManager(const KernelBlitManager&);
|
||||
|
||||
amd::Context *context_; //!< A dummy context
|
||||
amd::Program *program_; //!< GPU program obejct
|
||||
amd::Kernel *kernels_[BlitTotal]; //!< GPU kernels for blit
|
||||
//! Disable operator=
|
||||
KernelBlitManager& operator=(const KernelBlitManager&);
|
||||
|
||||
amd::Program* program_; //!< GPU program obejct
|
||||
amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit
|
||||
amd::Memory* constantBuffer_; //!< An internal CB for blits
|
||||
amd::Memory* xferBuffers_[MaxXferBuffers]; //!< Transfer buffers for images
|
||||
size_t xferBufferSize_; //!< Transfer buffer size
|
||||
amd::Monitor* lockXferOps_; //!< Lock transfer operation
|
||||
};
|
||||
|
||||
static const char* BlitName[KernelBlitManager::BlitTotal] = {
|
||||
@@ -404,9 +467,8 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = {
|
||||
"copyBuffer",
|
||||
"copyBufferAligned",
|
||||
"fillBuffer",
|
||||
"fillImage"
|
||||
"fillImage",
|
||||
};
|
||||
|
||||
/*@}*/
|
||||
} // namespace roc
|
||||
/*@}*/} // namespace roc
|
||||
|
||||
|
||||
@@ -4,6 +4,9 @@
|
||||
|
||||
namespace roc {
|
||||
|
||||
//! Alignment restriction for the pinned memory
|
||||
const static size_t PinnedMemoryAlignment = 4 * Ki;
|
||||
|
||||
typedef uint HsaDeviceId;
|
||||
|
||||
struct AMDDeviceInfo {
|
||||
|
||||
@@ -164,9 +164,7 @@ bool NullDevice::create(const AMDDeviceInfo& deviceInfo) {
|
||||
|
||||
settings_ = new Settings();
|
||||
roc::Settings* hsaSettings = static_cast<roc::Settings*>(settings_);
|
||||
if ((hsaSettings == NULL) ||
|
||||
// @Todo sramalin Use double precision from constsant
|
||||
!hsaSettings->create((true) & 0x1)) {
|
||||
if ((hsaSettings == NULL) || !hsaSettings->create(false)) {
|
||||
LogError("Error creating settings for NULL HSA device");
|
||||
return false;
|
||||
}
|
||||
@@ -189,6 +187,8 @@ Device::Device(hsa_agent_t bkendDevice)
|
||||
, alloc_granularity_(0)
|
||||
, context_(nullptr)
|
||||
, xferQueue_(nullptr)
|
||||
, xferRead_(nullptr)
|
||||
, xferWrite_(nullptr)
|
||||
, numOfVgpus_(0)
|
||||
{
|
||||
group_segment_.handle = 0;
|
||||
@@ -208,6 +208,10 @@ Device::~Device()
|
||||
delete mapCache_;
|
||||
delete mapCacheOps_;
|
||||
|
||||
// Destroy temporary buffers for read/write
|
||||
delete xferRead_;
|
||||
delete xferWrite_;
|
||||
|
||||
// Destroy transfer queue
|
||||
if (xferQueue_ && xferQueue_->terminate()) {
|
||||
delete xferQueue_;
|
||||
@@ -363,6 +367,85 @@ Device::loaderQueryHostAddress(const void* device, const void** host)
|
||||
: HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
Device::XferBuffers::~XferBuffers()
{
    // Delete every staging buffer that is still parked in the free list,
    // then drop the (now dangling) pointers from the list itself.
    for (auto it = freeBuffers_.begin(); it != freeBuffers_.end(); ++it) {
        delete *it;
    }
    freeBuffers_.clear();
}
|
||||
|
||||
bool
|
||||
Device::XferBuffers::create()
|
||||
{
|
||||
Memory* xferBuf = nullptr;
|
||||
bool result = false;
|
||||
|
||||
// Create a buffer object
|
||||
xferBuf = new Buffer(dev(), bufSize_);
|
||||
|
||||
// Try to allocate memory for the transfer buffer
|
||||
if ((nullptr == xferBuf) || !xferBuf->create()) {
|
||||
delete xferBuf;
|
||||
xferBuf = nullptr;
|
||||
LogError("Couldn't allocate a transfer buffer!");
|
||||
}
|
||||
else {
|
||||
result = true;
|
||||
freeBuffers_.push_back(xferBuf);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
//! Hands out a transfer buffer: takes the first entry of the free list, or
//! allocates a fresh buffer when the list is empty. Each successful hand-out
//! increments acquiredCnt_.
Memory&
Device::XferBuffers::acquire()
{
    // Serialize all accesses to the free list
    amd::ScopedLock guard(lock_);

    Memory* staged = nullptr;

    // Nothing parked in the list: try to allocate a new staging buffer
    if (freeBuffers_.empty()) {
        staged = new Buffer(dev(), bufSize_);

        if ((staged != nullptr) && staged->create()) {
            ++acquiredCnt_;
        }
        else {
            delete staged;
            staged = nullptr;
            LogError("Couldn't allocate a transfer buffer!");
        }
    }

    // No freshly allocated buffer: reuse the first free one.
    // NOTE(review): if the list was empty AND the allocation above failed,
    // this reads the front of an empty list. The reference return type leaves
    // no way to report the failure - confirm how callers expect this handled.
    if (staged == nullptr) {
        staged = freeBuffers_.front();
        freeBuffers_.pop_front();
        ++acquiredCnt_;
    }

    return *staged;
}
|
||||
|
||||
void
|
||||
Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer)
|
||||
{
|
||||
// Make sure buffer isn't busy on the current VirtualGPU, because
|
||||
// the next aquire can come from different queue
|
||||
// buffer.wait(gpu);
|
||||
// Lock the operations with the staged buffer list
|
||||
amd::ScopedLock l(lock_);
|
||||
freeBuffers_.push_back(&buffer);
|
||||
--acquiredCnt_;
|
||||
}
|
||||
|
||||
bool Device::init()
|
||||
{
|
||||
#if defined(__linux__)
|
||||
@@ -550,6 +633,28 @@ Device::create()
|
||||
// Use just 1 entry by default for the map cache
|
||||
mapCache_->push_back(NULL);
|
||||
|
||||
// Staged transfers are enabled only when a non-zero staging size is configured.
// Each pool is sized to stagedXferSize_ rounded up to a 4 KiB multiple.
if (settings().stagedXferSize_ != 0) {
    // Initialize staged write buffers
    if (settings().stagedXferWrite_) {
        xferWrite_ = new XferBuffers(*this,
            amd::alignUp(settings().stagedXferSize_, 4 * Ki));
        if ((xferWrite_ == nullptr) || !xferWrite_->create()) {
            // Message now names the pool that actually failed (was "for read")
            LogError("Couldn't allocate transfer buffer objects for write");
            return false;
        }
    }

    // Initialize staged read buffers
    if (settings().stagedXferRead_) {
        xferRead_ = new XferBuffers(*this,
            amd::alignUp(settings().stagedXferSize_, 4 * Ki));
        if ((xferRead_ == nullptr) || !xferRead_->create()) {
            // Message now names the pool that actually failed (was "for write")
            LogError("Couldn't allocate transfer buffer objects for read");
            return false;
        }
    }
}
|
||||
|
||||
xferQueue();
|
||||
|
||||
return true;
|
||||
@@ -568,11 +673,17 @@ Device::createProgram(amd::option::Options* options) {
|
||||
bool
|
||||
Device::mapHSADeviceToOpenCLDevice(hsa_agent_t dev)
|
||||
{
|
||||
if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice,
|
||||
HSA_AGENT_INFO_PROFILE,
|
||||
&agent_profile_)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Create HSA settings
|
||||
settings_ = new Settings();
|
||||
roc::Settings* hsaSettings = static_cast<roc::Settings*>(settings_);
|
||||
if ((hsaSettings == NULL) ||
|
||||
!hsaSettings->create((true) & 0x1)) {
|
||||
!hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -712,12 +823,6 @@ Device::populateOCLDeviceConstants()
|
||||
::strcpy(info_.boardName_, device_name);
|
||||
}
|
||||
|
||||
if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice,
|
||||
HSA_AGENT_INFO_PROFILE,
|
||||
&agent_profile_)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
hsa_agent_get_info(
|
||||
_bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
|
||||
@@ -883,7 +988,7 @@ Device::populateOCLDeviceConstants()
|
||||
|
||||
if (agent_profile_ == HSA_PROFILE_FULL) { // full-profile = participating in coherent memory,
|
||||
// base-profile = NUMA based non-coherent memory
|
||||
info_.hostUnifiedMemory_ = CL_TRUE;
|
||||
info_.hostUnifiedMemory_ = CL_TRUE;
|
||||
}
|
||||
info_.memBaseAddrAlign_ = 8 * (flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ?
|
||||
sizeof(cl_long16) : MEMOBJ_BASE_ADDR_ALIGN);
|
||||
@@ -1244,6 +1349,13 @@ Device::addMapTarget(amd::Memory* memory) const
|
||||
return true;
|
||||
}
|
||||
|
||||
//! Looks up the device memory object that `mem` holds for this device and
//! returns it downcast to roc::Memory.
Memory*
Device::getRocMemory(amd::Memory* mem) const
{
    auto* devMem = mem->getDeviceMemory(*this);
    return static_cast<roc::Memory*>(devMem);
}
|
||||
|
||||
|
||||
device::Memory*
|
||||
Device::createMemory(amd::Memory &owner) const
|
||||
{
|
||||
@@ -1302,9 +1414,9 @@ Device::createMemory(amd::Memory &owner) const
|
||||
imageView->replaceDeviceMemory(this, devImageView);
|
||||
|
||||
result = xferMgr().writeImage(owner.getHostMem(), *devImageView,
|
||||
amd::Coord3D(0), imageView->getRegion(),
|
||||
imageView->getRowPitch(),
|
||||
imageView->getSlicePitch(), true);
|
||||
amd::Coord3D(0, 0, 0), imageView->getRegion(),
|
||||
0,
|
||||
0, true);
|
||||
|
||||
imageView->release();
|
||||
}
|
||||
|
||||
@@ -219,6 +219,54 @@ private:
|
||||
//! A HSA device ordinal (physical HSA device)
|
||||
class Device : public NullDevice {
|
||||
public:
|
||||
//! Transfer buffers
|
||||
class XferBuffers : public amd::HeapObject
|
||||
{
|
||||
public:
|
||||
static const size_t MaxXferBufListSize = 8;
|
||||
|
||||
//! Default constructor
|
||||
XferBuffers(const Device& device, size_t bufSize)
|
||||
: bufSize_(bufSize)
|
||||
, acquiredCnt_(0)
|
||||
, gpuDevice_(device)
|
||||
{}
|
||||
|
||||
//! Default destructor
|
||||
~XferBuffers();
|
||||
|
||||
//! Creates the xfer buffers object
|
||||
bool create();
|
||||
|
||||
//! Acquires an instance of the transfer buffers
|
||||
Memory& acquire();
|
||||
|
||||
//! Releases transfer buffer
|
||||
void release(
|
||||
VirtualGPU& gpu, //!< Virual GPU object used with the buffer
|
||||
Memory& buffer //!< Transfer buffer for release
|
||||
);
|
||||
|
||||
//! Returns the buffer's size for transfer
|
||||
size_t bufSize() const { return bufSize_; }
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
XferBuffers(const XferBuffers&);
|
||||
|
||||
//! Disable assignment operator
|
||||
XferBuffers& operator=(const XferBuffers&);
|
||||
|
||||
//! Get device object
|
||||
const Device& dev() const { return gpuDevice_; }
|
||||
|
||||
size_t bufSize_; //!< Staged buffer size
|
||||
std::list<Memory*> freeBuffers_; //!< The list of free buffers
|
||||
amd::Atomic<uint> acquiredCnt_; //!< The total number of acquired buffers
|
||||
amd::Monitor lock_; //!< Stgaed buffer acquire/release lock
|
||||
const Device& gpuDevice_; //!< GPU device object
|
||||
};
|
||||
|
||||
//! Initialise the whole HSA device subsystem (CAL init, device enumeration, etc).
|
||||
static bool init();
|
||||
static void tearDown();
|
||||
@@ -354,6 +402,17 @@ public:
|
||||
//! Adds a map target to the cache
|
||||
bool addMapTarget(amd::Memory* memory) const;
|
||||
|
||||
//! Returns transfer buffer object
|
||||
XferBuffers& xferWrite() const { return *xferWrite_; }
|
||||
|
||||
//! Returns transfer buffer object
|
||||
XferBuffers& xferRead() const { return *xferRead_; }
|
||||
|
||||
//! Returns a ROC memory object from AMD memory object
|
||||
roc::Memory* getRocMemory(
|
||||
amd::Memory* mem //!< Pointer to AMD memory object
|
||||
) const;
|
||||
|
||||
private:
|
||||
static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
|
||||
|
||||
@@ -379,6 +438,9 @@ private:
|
||||
|
||||
VirtualGPU* xferQueue() const;
|
||||
|
||||
XferBuffers* xferRead_; //!< Transfer buffers read
|
||||
XferBuffers* xferWrite_; //!< Transfer buffers write
|
||||
|
||||
public:
|
||||
amd::Atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
|
||||
}; // class roc::Device
|
||||
|
||||
@@ -25,10 +25,18 @@ namespace roc {
|
||||
|
||||
/////////////////////////////////roc::Memory//////////////////////////////
|
||||
Memory::Memory(const roc::Device &dev, amd::Memory &owner)
|
||||
: device::Memory(owner),
|
||||
dev_(dev),
|
||||
deviceMemory_(NULL),
|
||||
kind_(MEMORY_KIND_NORMAL)
|
||||
: device::Memory(owner)
|
||||
, dev_(dev)
|
||||
, deviceMemory_(NULL)
|
||||
, kind_(MEMORY_KIND_NORMAL)
|
||||
{
|
||||
}
|
||||
|
||||
//! Constructs a ROC memory object from a plain size instead of an
//! amd::Memory owner. No device memory is attached yet.
Memory::Memory(const roc::Device &dev, size_t size)
    : device::Memory(size),
      dev_(dev),
      deviceMemory_(nullptr),
      kind_(MEMORY_KIND_NORMAL)
{}
|
||||
|
||||
@@ -64,8 +72,8 @@ Memory::allocateMapMemory(size_t allocationSize)
|
||||
roc::Memory* hsaMapMemory = reinterpret_cast<roc::Memory *>(
|
||||
mapMemory->getDeviceMemory(dev_));
|
||||
if (hsaMapMemory == nullptr) {
|
||||
mapMemory->release();
|
||||
return false;
|
||||
mapMemory->release();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -191,7 +199,7 @@ bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metada
|
||||
return false;
|
||||
#else
|
||||
assert(owner()->isInterop() && "Object is not an interop object.");
|
||||
|
||||
|
||||
mesa_glinterop_export_in in;
|
||||
mesa_glinterop_export_out out;
|
||||
|
||||
@@ -213,7 +221,7 @@ bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metada
|
||||
|
||||
if(!dev_.mesa().Export(in, out))
|
||||
return false;
|
||||
|
||||
|
||||
size_t size;
|
||||
hsa_agent_t agent=dev_.getBackendDevice();
|
||||
hsa_status_t status=hsa_amd_interop_map_buffer(1, &agent, out.dmabuf_fd, 0, &size, &deviceMemory_, metadata_size, (const void**)metadata);
|
||||
@@ -242,9 +250,18 @@ Buffer::Buffer(const roc::Device &dev, amd::Memory &owner)
|
||||
: roc::Memory(dev, owner)
|
||||
{}
|
||||
|
||||
//! Constructs a buffer from a plain size; all state is set up by the
//! roc::Memory(dev, size) base constructor.
Buffer::Buffer(const roc::Device &dev, size_t size) : roc::Memory(dev, size)
{
}
|
||||
|
||||
Buffer::~Buffer()
|
||||
{
|
||||
destroy();
|
||||
if (owner() == nullptr) {
|
||||
dev_.hostFree(deviceMemory_, size());
|
||||
}
|
||||
else {
|
||||
destroy();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
@@ -288,6 +305,15 @@ Buffer::destroy()
|
||||
bool
|
||||
Buffer::create()
|
||||
{
|
||||
if (owner() == nullptr) {
|
||||
deviceMemory_ = dev_.hostAlloc(size(), 1, false);
|
||||
if (deviceMemory_ != nullptr) {
|
||||
flags_ |= HostMemoryDirectAccess;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
//Interop buffer
|
||||
if(owner()->isInterop())
|
||||
return createInteropBuffer(GL_ARRAY_BUFFER, 0, NULL, NULL);
|
||||
@@ -303,8 +329,7 @@ Buffer::create()
|
||||
}
|
||||
|
||||
const size_t offset = owner()->getOrigin();
|
||||
deviceMemory_ =
|
||||
static_cast<char *>(parentBuffer->getDeviceMemory()) + offset;
|
||||
deviceMemory_ = parentBuffer->getDeviceMemory() + offset;
|
||||
|
||||
flags_ |= SubMemoryObject;
|
||||
flags_ |=
|
||||
@@ -562,10 +587,10 @@ Image::createInteropImage()
|
||||
{
|
||||
auto obj=owner()->getInteropObj()->asGLObject();
|
||||
assert(obj->getCLGLObjectType()!=CL_GL_OBJECT_BUFFER && "Non-image OpenGL object used with interop image API.");
|
||||
|
||||
|
||||
const hsa_amd_image_descriptor_t* meta;
|
||||
size_t size=0;
|
||||
|
||||
|
||||
GLenum glTarget = obj->getGLTarget();
|
||||
if (glTarget == GL_TEXTURE_CUBE_MAP) {
|
||||
glTarget = obj->getCubemapFace();
|
||||
@@ -593,13 +618,13 @@ Image::createInteropImage()
|
||||
|
||||
if (obj->getGLTarget()==GL_TEXTURE_CUBE_MAP)
|
||||
desc.setFace(obj->getCubemapFace());
|
||||
|
||||
|
||||
originalDeviceMemory_=deviceMemory_;
|
||||
|
||||
hsa_status_t err=hsa_amd_image_create(dev_.getBackendDevice(), &imageDescriptor_, amdImageDesc_, originalDeviceMemory_, permission_, &hsaImageObject_);
|
||||
if(err!=HSA_STATUS_SUCCESS)
|
||||
return false;
|
||||
|
||||
|
||||
BufferGuard.Dismiss();
|
||||
DescGuard.Dismiss();
|
||||
return true;
|
||||
@@ -672,13 +697,13 @@ Image::create()
|
||||
}
|
||||
|
||||
bool
|
||||
Image::createView(Memory &parent)
|
||||
Image::createView(const Memory &parent)
|
||||
{
|
||||
deviceMemory_ = parent.getDeviceMemory();
|
||||
|
||||
originalDeviceMemory_ = (parent.owner()->asBuffer() != NULL)
|
||||
? deviceMemory_
|
||||
: static_cast<Image &>(parent).originalDeviceMemory_;
|
||||
: static_cast<const Image&>(parent).originalDeviceMemory_;
|
||||
|
||||
kind_=parent.getKind();
|
||||
|
||||
|
||||
@@ -18,10 +18,12 @@ class Memory : public device::Memory {
|
||||
|
||||
Memory(const roc::Device &dev, amd::Memory &owner);
|
||||
|
||||
Memory(const roc::Device &dev, size_t size);
|
||||
|
||||
virtual ~Memory();
|
||||
|
||||
// Getter for deviceMemory_.
|
||||
void *getDeviceMemory() const { return deviceMemory_; }
|
||||
// Getter for deviceMemory_
|
||||
address getDeviceMemory() const { return reinterpret_cast<address>(deviceMemory_); }
|
||||
|
||||
// Gets a pointer to a region of host-visible memory for use as the target
|
||||
// of an indirect map for a given memory object
|
||||
@@ -41,7 +43,7 @@ class Memory : public device::Memory {
|
||||
Unimplemented();
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Immediate blocking write from device cache to owners's backing store.
|
||||
// Marks owner as "current" by resetting the last writer to NULL.
|
||||
virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags())
|
||||
@@ -112,6 +114,7 @@ class Memory : public device::Memory {
|
||||
class Buffer : public roc::Memory {
|
||||
public:
|
||||
Buffer(const roc::Device &dev, amd::Memory &owner);
|
||||
Buffer(const roc::Device &dev, size_t size);
|
||||
|
||||
virtual ~Buffer();
|
||||
|
||||
@@ -143,7 +146,7 @@ public:
|
||||
virtual bool create();
|
||||
|
||||
//! Create an image view
|
||||
bool createView(Memory &parent);
|
||||
bool createView(const Memory &parent);
|
||||
|
||||
//! Gets a pointer to a region of host-visible memory for use as the target
|
||||
//! of an indirect map for a given memory object
|
||||
|
||||
@@ -53,14 +53,38 @@ Settings::Settings()
|
||||
enablePartialDispatch_ = (partialDispatch) ? false : true;
|
||||
partialDispatch_ = (partialDispatch) ? false : true;
|
||||
commandQueues_ = 100; //!< Field value set to maximum number
|
||||
//!< concurrent Virtual GPUs for ROCm backend
|
||||
//!< concurrent Virtual GPUs for ROCm backend
|
||||
|
||||
// Disable image DMA by default (ROCM runtime doesn't support it)
|
||||
imageDMA_ = false;
|
||||
|
||||
stagedXferRead_ = true;
|
||||
stagedXferWrite_ = true;
|
||||
stagedXferSize_ = GPU_STAGING_BUFFER_SIZE * Ki;
|
||||
|
||||
// Initialize transfer buffer size to 1MB by default
|
||||
xferBufSize_ = 1024 * Ki;
|
||||
|
||||
const static size_t MaxPinnedXferSize = 32;
|
||||
pinnedXferSize_ = std::min(GPU_PINNED_XFER_SIZE, MaxPinnedXferSize) * Mi;
|
||||
pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_);
|
||||
}
|
||||
|
||||
bool
|
||||
Settings::create(bool doublePrecision)
|
||||
Settings::create(bool fullProfile)
|
||||
{
|
||||
customHostAllocator_ = true;
|
||||
|
||||
if (fullProfile) {
|
||||
pinnedXferSize_ = 0;
|
||||
stagedXferSize_ = 0;
|
||||
xferBufSize_ = 0;
|
||||
}
|
||||
else {
|
||||
pinnedXferSize_ = std::max(pinnedXferSize_, pinnedMinXferSize_);
|
||||
stagedXferSize_ = std::max(stagedXferSize_, pinnedMinXferSize_ + 4 * Ki);
|
||||
}
|
||||
|
||||
// Enable extensions
|
||||
enableExtension(ClKhrByteAddressableStore);
|
||||
enableExtension(ClKhrGlobalInt32BaseAtomics);
|
||||
@@ -72,21 +96,16 @@ Settings::create(bool doublePrecision)
|
||||
enableExtension(ClKhr3DImageWrites);
|
||||
enableExtension(ClAmdMediaOps);
|
||||
enableExtension(ClAmdMediaOps2);
|
||||
if(MesaInterop::Supported())
|
||||
enableExtension(ClKhrGlSharing);
|
||||
|
||||
// Make sure device supports doubles
|
||||
doublePrecision_ &= doublePrecision;
|
||||
|
||||
if (doublePrecision_) {
|
||||
// Enable KHR double precision extension
|
||||
enableExtension(ClKhrFp64);
|
||||
#if !defined(WITH_LIGHTNING_COMPILER)
|
||||
// Also enable AMD double precision extension?
|
||||
enableExtension(ClAmdFp64);
|
||||
#endif // !defined(WITH_LIGHTNING_COMPILER)
|
||||
if(MesaInterop::Supported()) {
|
||||
enableExtension(ClKhrGlSharing);
|
||||
}
|
||||
|
||||
// Enable KHR double precision extension
|
||||
enableExtension(ClKhrFp64);
|
||||
#if !defined(WITH_LIGHTNING_COMPILER)
|
||||
// Also enable AMD double precision extension?
|
||||
enableExtension(ClAmdFp64);
|
||||
#endif // !defined(WITH_LIGHTNING_COMPILER)
|
||||
enableExtension(ClKhrSubGroups);
|
||||
|
||||
enableExtension(ClKhrDepthImages);
|
||||
@@ -109,6 +128,18 @@ Settings::override()
|
||||
if (!flagIsDefault(GPU_MAX_COMMAND_QUEUES)) {
|
||||
commandQueues_ = GPU_MAX_COMMAND_QUEUES;
|
||||
}
|
||||
|
||||
// A user supplied GPU_IMAGE_DMA value overrides the image DMA setting.
// It must be stored in imageDMA_; writing it into commandQueues_ would
// clobber the command queue limit set just above.
if (!flagIsDefault(GPU_IMAGE_DMA)) {
    imageDMA_ = GPU_IMAGE_DMA;
}
|
||||
|
||||
if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) {
|
||||
xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki;
|
||||
}
|
||||
|
||||
if (!flagIsDefault(GPU_PINNED_MIN_XFER_SIZE)) {
|
||||
pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace roc
|
||||
|
||||
@@ -26,7 +26,10 @@ public:
|
||||
uint enableImageHandle_: 1; //!< Use HSAIL image/sampler pointer
|
||||
uint enableNCMode_: 1; //!< Enable Non Coherent mode for system memory
|
||||
uint enablePartialDispatch_: 1; //!< Enable support for Partial Dispatch
|
||||
uint reserved_: 26;
|
||||
uint imageDMA_: 1; //!< Enable direct image DMA transfers
|
||||
uint stagedXferRead_: 1; //!< Uses a staged buffer read
|
||||
uint stagedXferWrite_: 1; //!< Uses a staged buffer write
|
||||
uint reserved_: 22;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
@@ -46,11 +49,16 @@ public:
|
||||
uint kernargPoolSize_;
|
||||
uint signalPoolSize_;
|
||||
|
||||
size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
|
||||
size_t stagedXferSize_; //!< Staged buffer size
|
||||
size_t pinnedXferSize_; //!< Pinned buffer size for transfer
|
||||
size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
|
||||
|
||||
//! Default constructor
|
||||
Settings();
|
||||
|
||||
//! Creates settings
|
||||
bool create(bool doublePrecision);
|
||||
bool create(bool fullProfile);
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
|
||||
@@ -446,6 +446,9 @@ bool VirtualGPU::releaseGpuMemoryFence() {
|
||||
|
||||
hasPendingDispatch_ = false;
|
||||
|
||||
// Release all transfer buffers on this command queue
|
||||
releaseXferWrite();
|
||||
|
||||
// Release all memory dependencies
|
||||
memoryDependency().clear();
|
||||
|
||||
@@ -1774,8 +1777,66 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd)
|
||||
profilingEnd(vcmd);
|
||||
}
|
||||
|
||||
void VirtualGPU::flush(amd::Command *list, bool wait) {
|
||||
void VirtualGPU::flush(amd::Command *list, bool wait)
|
||||
{
|
||||
releaseGpuMemoryFence();
|
||||
updateCommandsState(list);
|
||||
// Release all pinned memory
|
||||
releasePinnedMem();
|
||||
}
|
||||
|
||||
void
|
||||
VirtualGPU::addXferWrite(Memory& memory)
|
||||
{
|
||||
if (xferWriteBuffers_.size() > 7) {
|
||||
dev().xferWrite().release(*this, *xferWriteBuffers_.front());
|
||||
xferWriteBuffers_.erase(xferWriteBuffers_.begin());
|
||||
}
|
||||
|
||||
// Delay destruction
|
||||
xferWriteBuffers_.push_back(&memory);
|
||||
}
|
||||
|
||||
void
|
||||
VirtualGPU::releaseXferWrite()
|
||||
{
|
||||
for (auto& memory : xferWriteBuffers_) {
|
||||
dev().xferWrite().release(*this, *memory);
|
||||
}
|
||||
xferWriteBuffers_.resize(0);
|
||||
}
|
||||
|
||||
void
|
||||
VirtualGPU::addPinnedMem(amd::Memory* mem)
|
||||
{
|
||||
if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
|
||||
if (pinnedMems_.size() > 7) {
|
||||
pinnedMems_.front()->release();
|
||||
pinnedMems_.erase(pinnedMems_.begin());
|
||||
}
|
||||
|
||||
// Delay destruction
|
||||
pinnedMems_.push_back(mem);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
VirtualGPU::releasePinnedMem()
|
||||
{
|
||||
for (auto& amdMemory : pinnedMems_) {
|
||||
amdMemory->release();
|
||||
}
|
||||
pinnedMems_.resize(0);
|
||||
}
|
||||
|
||||
//! Searches the pinned memory cache for an object whose host pointer equals
//! \a addr and whose size is at least \a size.
//! Returns the first match, or nullptr when there is none.
amd::Memory*
VirtualGPU::findPinnedMem(void* addr, size_t size)
{
    for (amd::Memory* candidate : pinnedMems_) {
        if (candidate->getHostMem() != addr) {
            continue;
        }
        if (size <= candidate->getSize()) {
            return candidate;
        }
    }
    return nullptr;
}
|
||||
} // End of roc namespace
|
||||
|
||||
@@ -149,7 +149,7 @@ public:
|
||||
void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd);
|
||||
void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd);
|
||||
void submitPerfCounter(amd::PerfCounterCommand& cmd){};
|
||||
|
||||
|
||||
void flush(amd::Command* list = NULL, bool wait = false);
|
||||
void submitFillMemory(amd::FillMemoryCommand& cmd);
|
||||
void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
|
||||
@@ -193,10 +193,24 @@ public:
|
||||
bool processMemObjects(
|
||||
const amd::Kernel& kernel, //!< AMD kernel object for execution
|
||||
const_address params //!< Pointer to the param's store
|
||||
);
|
||||
);
|
||||
//! Returns the virtual gpu unique index
|
||||
uint index() const { return index_; }
|
||||
|
||||
//! Adds a stage write buffer into a list
|
||||
void addXferWrite(Memory& memory);
|
||||
|
||||
//! Releases stage write buffers
|
||||
void releaseXferWrite();
|
||||
|
||||
//! Adds a pinned memory object into a map
|
||||
void addPinnedMem(amd::Memory* mem);
|
||||
|
||||
//! Release pinned memory objects
|
||||
void releasePinnedMem();
|
||||
|
||||
//! Finds if pinned memory is cached
|
||||
amd::Memory* findPinnedMem(void* addr, size_t size);
|
||||
|
||||
// } roc OpenCL integration
|
||||
private:
|
||||
@@ -219,6 +233,9 @@ private:
|
||||
//! Updates AQL header for the upcoming dispatch
|
||||
void setAqlHeader(uint16_t header) { aqlHeader_ = header; }
|
||||
|
||||
std::vector<Memory*> xferWriteBuffers_; //!< Stage write buffers
|
||||
std::vector<amd::Memory*> pinnedMems_; //!< Pinned memory list
|
||||
|
||||
/**
|
||||
* @brief Maintains the list of sampler allocated for one or more kernel
|
||||
* submissions.
|
||||
@@ -231,16 +248,16 @@ private:
|
||||
*/
|
||||
bool hasPendingDispatch_;
|
||||
Timestamp* timestamp_;
|
||||
hsa_agent_t gpu_device_; //!< Physical device
|
||||
hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu
|
||||
hsa_agent_t gpu_device_; //!< Physical device
|
||||
hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu
|
||||
hsa_barrier_and_packet_t barrier_packet_;
|
||||
hsa_signal_t barrier_signal_;
|
||||
uint32_t dispatch_id_; //!< This variable must be updated atomically.
|
||||
Device& roc_device_; //!< roc device object
|
||||
uint32_t dispatch_id_; //!< This variable must be updated atomically.
|
||||
Device& roc_device_; //!< roc device object
|
||||
void * tools_lib_;
|
||||
PrintfDbg* printfdbg_;
|
||||
MemoryDependency memoryDependency_; //!< Memory dependency class
|
||||
uint16_t aqlHeader_; //!< AQL header for dispatch
|
||||
uint16_t aqlHeader_; //!< AQL header for dispatch
|
||||
|
||||
char* kernarg_pool_base_;
|
||||
size_t kernarg_pool_size_;
|
||||
|
||||
Reference in New Issue
Block a user