P4 to Git Change 1191682 by gandryey@gera-dev-w7 on 2015/09/17 11:14:23
ECR #304775 - Remove EG/NI support - Remove the heap emulation (non-vm) Affected files ... ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_memobj.cpp#77 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_svm.cpp#12 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpusettings.cpp#31 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#186 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#253 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#118 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#523 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#148 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuheap.cpp#28 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuheap.hpp#16 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#297 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#116 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpumemory.cpp#122 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpumemory.hpp#48 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.cpp#227 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.hpp#83 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#329 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.hpp#94 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#379 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp#143 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLDevice.h#57 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsasettings.cpp#38 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsasettings.cpp#9 edit ... 
//depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#242 edit
此提交包含在:
@@ -10,8 +10,6 @@ namespace cpu {
|
||||
bool
|
||||
Settings::create()
|
||||
{
|
||||
largeHostMemAlloc_ = true;
|
||||
|
||||
// This code is temporary until cl_khr_fp64 is unconditional
|
||||
if (flagIsDefault(CL_KHR_FP64) || CL_KHR_FP64) {
|
||||
enableExtension(ClKhrFp64);
|
||||
|
||||
@@ -517,7 +517,6 @@ Settings::Settings()
|
||||
extensions_ = 0;
|
||||
partialDispatch_ = false;
|
||||
supportRA_ = true;
|
||||
largeHostMemAlloc_ = false;
|
||||
customHostAllocator_ = false;
|
||||
waitCommand_ = AMD_OCL_WAIT_COMMAND;
|
||||
supportDepthsRGB_ = false;
|
||||
|
||||
@@ -577,13 +577,12 @@ public:
|
||||
struct {
|
||||
uint partialDispatch_: 1; //!< Enables partial dispatch
|
||||
uint supportRA_: 1; //!< Support RA channel order format
|
||||
uint largeHostMemAlloc_: 1; //!< Allow large host mem allocations (> maxSingleAlloc)
|
||||
uint waitCommand_: 1; //!< Enables a wait for every submitted command
|
||||
uint customHostAllocator_: 1;//!< True if device has custom host allocator
|
||||
// that replaces generic OS allocation routines
|
||||
uint supportDepthsRGB_: 1; //!< Support DEPTH and sRGB channel order format
|
||||
uint enableHwDebug_: 1; //!< Enable HW debug support
|
||||
uint reserved_: 25;
|
||||
uint reserved_: 26;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
|
||||
@@ -1955,20 +1955,9 @@ KernelBlitManager::copyBufferRect(
|
||||
|
||||
// Fall into the CAL path for rejected transfers
|
||||
if (setup_.disableCopyBufferRect_ ||
|
||||
(gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) ||
|
||||
(!dev().heap()->isVirtual() &&
|
||||
((gpuMem(dstMemory).hb() == NULL) || (gpuMem(srcMemory).hb() == NULL)))) {
|
||||
// Copy data with CAL (no VM mode only)
|
||||
if (gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) {
|
||||
result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory,
|
||||
srcRectIn, dstRectIn, sizeIn, entire);
|
||||
}
|
||||
|
||||
if ((!dev().heap()->isVirtual() && ((gpuMem(dstMemory).hb() == NULL) || (gpuMem(srcMemory).hb() == NULL)))
|
||||
&& !result) {
|
||||
result = HostBlitManager::copyBufferRect(srcMemory, dstMemory,
|
||||
srcRectIn, dstRectIn, sizeIn, entire);
|
||||
}
|
||||
gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) {
|
||||
result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory,
|
||||
srcRectIn, dstRectIn, sizeIn, entire);
|
||||
|
||||
if (result) {
|
||||
synchronize();
|
||||
@@ -2395,11 +2384,9 @@ KernelBlitManager::copyBuffer(
|
||||
{
|
||||
amd::ScopedLock k(lockXferOps_);
|
||||
bool result = false;
|
||||
bool forceCal = !dev().heap()->isVirtual() &&
|
||||
((gpuMem(srcMemory).hb() == NULL) || (gpuMem(dstMemory).hb() == NULL));
|
||||
|
||||
if ((!forceCal && !gpuMem(srcMemory).isHostMemDirectAccess() &&
|
||||
!gpuMem(dstMemory).isHostMemDirectAccess())) {
|
||||
if (!gpuMem(srcMemory).isHostMemDirectAccess() &&
|
||||
!gpuMem(dstMemory).isHostMemDirectAccess()) {
|
||||
uint blitType = BlitCopyBuffer;
|
||||
size_t dim = 1;
|
||||
size_t globalWorkOffset[3] = { 0, 0, 0 };
|
||||
@@ -2489,7 +2476,6 @@ KernelBlitManager::copyBuffer(
|
||||
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
|
||||
}
|
||||
else {
|
||||
// Copy data with CAL (no VM mode only)
|
||||
result = DmaBlitManager::copyBuffer(
|
||||
srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
|
||||
}
|
||||
|
||||
+185
-476
@@ -173,7 +173,7 @@ NullDevice::create(CALtarget target)
|
||||
calAttr.localRAM = 512;
|
||||
|
||||
// Fill the device info structure
|
||||
fillDeviceInfo(calAttr, memInfo, 4096, 1, true);
|
||||
fillDeviceInfo(calAttr, memInfo, 4096, 1);
|
||||
|
||||
if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) {
|
||||
// Runtime doesn't know what local size could be on the real board
|
||||
@@ -225,9 +225,7 @@ void NullDevice::fillDeviceInfo(
|
||||
const CALdeviceattribs& calAttr,
|
||||
const gslMemInfo& memInfo,
|
||||
size_t maxTextureSize,
|
||||
uint numComputeRings,
|
||||
bool isVirtualMode
|
||||
)
|
||||
uint numComputeRings)
|
||||
{
|
||||
info_.type_ = CL_DEVICE_TYPE_GPU;
|
||||
info_.vendorId_ = 0x1002;
|
||||
@@ -276,56 +274,45 @@ void NullDevice::fillDeviceInfo(
|
||||
info_.globalMemCacheType_ = CL_NONE;
|
||||
}
|
||||
|
||||
if (isVirtualMode) {
|
||||
#if defined(ATI_OS_LINUX)
|
||||
info_.globalMemSize_ =
|
||||
(static_cast<cl_ulong>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
|
||||
// globalMemSize is the actual available size for app on Linux
|
||||
// Because Linux base driver doesn't support paging
|
||||
static_cast<cl_ulong>(memInfo.cardMemAvailableBytes + memInfo.cardExtMemAvailableBytes) / 100u);
|
||||
info_.globalMemSize_ =
|
||||
(static_cast<cl_ulong>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
|
||||
// globalMemSize is the actual available size for app on Linux
|
||||
// Because Linux base driver doesn't support paging
|
||||
static_cast<cl_ulong>(memInfo.cardMemAvailableBytes + memInfo.cardExtMemAvailableBytes) / 100u);
|
||||
#else
|
||||
info_.globalMemSize_ =
|
||||
(static_cast<cl_ulong>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
|
||||
static_cast<cl_ulong>(calAttr.localRAM) / 100u) * Mi;
|
||||
info_.globalMemSize_ =
|
||||
(static_cast<cl_ulong>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
|
||||
static_cast<cl_ulong>(calAttr.localRAM) / 100u) * Mi;
|
||||
#endif
|
||||
if (settings().apuSystem_) {
|
||||
info_.globalMemSize_ +=
|
||||
(static_cast<cl_ulong>(calAttr.uncachedRemoteRAM) * Mi * 75)/100;
|
||||
}
|
||||
if (settings().apuSystem_) {
|
||||
info_.globalMemSize_ +=
|
||||
(static_cast<cl_ulong>(calAttr.uncachedRemoteRAM) * Mi * 75)/100;
|
||||
}
|
||||
|
||||
// We try to calculate the largest available memory size from
|
||||
// the largest available block in either heap. In theory this
|
||||
// should be the size we can actually allocate at application
|
||||
// start. Note that it may not be a guarantee still as the
|
||||
// application progresses.
|
||||
info_.maxMemAllocSize_ = std::max(
|
||||
cl_ulong(memInfo.cardLargestFreeBlockBytes),
|
||||
cl_ulong(memInfo.cardExtLargestFreeBlockBytes));
|
||||
// We try to calculate the largest available memory size from
|
||||
// the largest available block in either heap. In theory this
|
||||
// should be the size we can actually allocate at application
|
||||
// start. Note that it may not be a guarantee still as the
|
||||
// application progresses.
|
||||
info_.maxMemAllocSize_ = std::max(
|
||||
cl_ulong(memInfo.cardLargestFreeBlockBytes),
|
||||
cl_ulong(memInfo.cardExtLargestFreeBlockBytes));
|
||||
|
||||
#if defined(ATI_OS_WIN)
|
||||
if (settings().apuSystem_) {
|
||||
info_.maxMemAllocSize_ = std::max(
|
||||
(static_cast<cl_ulong>(calAttr.uncachedRemoteRAM) * Mi * 75)/100,
|
||||
info_.maxMemAllocSize_);
|
||||
}
|
||||
if (settings().apuSystem_) {
|
||||
info_.maxMemAllocSize_ = std::max(
|
||||
(static_cast<cl_ulong>(calAttr.uncachedRemoteRAM) * Mi * 75)/100,
|
||||
info_.maxMemAllocSize_);
|
||||
}
|
||||
#endif
|
||||
info_.maxMemAllocSize_ = cl_ulong(info_.maxMemAllocSize_ *
|
||||
std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u);
|
||||
info_.maxMemAllocSize_ = cl_ulong(info_.maxMemAllocSize_ *
|
||||
std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u);
|
||||
|
||||
//! \note Force max single allocation size.
|
||||
//! 4GB limit for the blit kernels and 64 bit optimizations.
|
||||
info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_,
|
||||
static_cast<cl_ulong>(settings().maxAllocSize_));
|
||||
}
|
||||
else {
|
||||
uint maxHeapSize = flagIsDefault(GPU_MAX_HEAP_SIZE) ? 50 : GPU_MAX_HEAP_SIZE;
|
||||
info_.globalMemSize_ = (std::min(maxHeapSize, 100u)
|
||||
* calAttr.localRAM / 100u) * Mi;
|
||||
|
||||
uint maxAllocSize = flagIsDefault(GPU_SINGLE_ALLOC_PERCENT) ? 25 : GPU_SINGLE_ALLOC_PERCENT;
|
||||
info_.maxMemAllocSize_ = cl_ulong(info_.globalMemSize_ *
|
||||
std::min(maxAllocSize, 100u) / 100u);
|
||||
}
|
||||
//! \note Force max single allocation size.
|
||||
//! 4GB limit for the blit kernels and 64 bit optimizations.
|
||||
info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_,
|
||||
static_cast<cl_ulong>(settings().maxAllocSize_));
|
||||
|
||||
if (info_.maxMemAllocSize_ < cl_ulong(128 * Mi)) {
|
||||
LogError("We are unable to get a heap large enough to support the OpenCL minimum "\
|
||||
@@ -377,7 +364,7 @@ void NullDevice::fillDeviceInfo(
|
||||
info_.imagePitchAlignment_ = 256; // XXX: 256 pixel pitch alignment for now
|
||||
info_.imageBaseAddressAlignment_ = 256; // XXX: 256 byte base address alignment for now
|
||||
|
||||
info_.bufferFromImageSupport_ = (isVirtualMode) ? CL_TRUE : CL_FALSE;
|
||||
info_.bufferFromImageSupport_ = CL_TRUE;
|
||||
}
|
||||
|
||||
info_.errorCorrectionSupport_ = CL_FALSE;
|
||||
@@ -404,7 +391,7 @@ void NullDevice::fillDeviceInfo(
|
||||
::strcpy(info_.name_, hwInfo()->targetName_);
|
||||
::strcpy(info_.vendor_, "Advanced Micro Devices, Inc.");
|
||||
::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1,
|
||||
AMD_BUILD_STRING "%s", (isVirtualMode) ? " (VM)": "");
|
||||
AMD_BUILD_STRING "%s", " (VM)");
|
||||
|
||||
info_.profile_ = "FULL_PROFILE";
|
||||
if (settings().oclVersion_ == OpenCL20) {
|
||||
@@ -508,6 +495,25 @@ void NullDevice::fillDeviceInfo(
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
Device::Heap::create(Device& device)
|
||||
{
|
||||
// Create a new GPU resource
|
||||
resource_ = new Resource(device, 0, CM_SURF_FMT_R32I);
|
||||
if (resource_ == NULL) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!resource_->create(Resource::Heap)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!device.settings().hsail_) {
|
||||
baseAddress_ = resource_->gslResource()->getSurfaceAddress();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
Device::Engines::create(uint num, gslEngineDescriptor* desc, uint maxNumComputeRings)
|
||||
{
|
||||
@@ -670,7 +676,7 @@ Device::Device()
|
||||
, CALGSLDevice()
|
||||
, numOfVgpus_(0)
|
||||
, context_(NULL)
|
||||
, heap_(NULL)
|
||||
, heap_()
|
||||
, dummyPage_(NULL)
|
||||
, lockAsyncOps_(NULL)
|
||||
, lockAsyncOpsForInitHeap_(NULL)
|
||||
@@ -731,11 +737,6 @@ Device::~Device()
|
||||
dummyPage_->release();
|
||||
}
|
||||
|
||||
// Destroy global heap
|
||||
if (heap_ != NULL) {
|
||||
delete heap_;
|
||||
}
|
||||
|
||||
// Destroy resource cache
|
||||
delete resourceCache_;
|
||||
|
||||
@@ -837,26 +838,6 @@ Device::create(CALuint ordinal, CALuint numOfDevices)
|
||||
|
||||
size_t resourceCacheSize = settings().resourceCacheSize_;
|
||||
|
||||
// Allocate heap
|
||||
heapSize_ = settings().heapSize_;
|
||||
|
||||
// Check if BE supports virtual addressing mode
|
||||
if (isVmMode()) {
|
||||
heap_ = new VirtualHeap(*this);
|
||||
gpuSettings->largeHostMemAlloc_ = (NULL != heap_) ? true : false;
|
||||
}
|
||||
|
||||
// If virtual heap allocation failed, then try static allocation
|
||||
if (heap_ == NULL) {
|
||||
heap_ = new Heap(*this);
|
||||
// Disable resource cache if VM is disabled
|
||||
resourceCacheSize = 0;
|
||||
if (NULL == heap_) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#ifdef DEBUG
|
||||
std::stringstream message;
|
||||
if (settings().remoteAlloc_) {
|
||||
@@ -865,10 +846,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices)
|
||||
else {
|
||||
message << "Using *Local* memory";
|
||||
}
|
||||
if (!heap()->isVirtual()) {
|
||||
message << ": " << settings().heapSize_ / Mi << "MB, growth: " << \
|
||||
settings().heapSizeGrowth_ / Mi << "MB";
|
||||
}
|
||||
|
||||
message << std::endl;
|
||||
LogInfo(message.str().c_str());
|
||||
#endif // DEBUG
|
||||
@@ -883,8 +861,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices)
|
||||
// Fill the device info structure
|
||||
fillDeviceInfo(getAttribs(), getMemInfo(),
|
||||
static_cast<size_t>(getMaxTextureSize()),
|
||||
engines().numComputeRings(), heap()->isVirtual()
|
||||
);
|
||||
engines().numComputeRings());
|
||||
|
||||
if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) {
|
||||
if (NULL == hsaCompiler_) {
|
||||
@@ -955,7 +932,7 @@ Device::initializeHeapResources()
|
||||
}
|
||||
|
||||
// Complete initialization of the heap and other buffers
|
||||
if ((heap_ == NULL) || !heap_->create(heapSize_, settings().remoteAlloc_)) {
|
||||
if (!heap_.create(*this)) {
|
||||
LogError("Failed GPU heap creation");
|
||||
return false;
|
||||
}
|
||||
@@ -987,7 +964,7 @@ Device::initializeHeapResources()
|
||||
type = Resource::RemoteUSWC;
|
||||
}
|
||||
xferWrite_ = new XferBuffers(*this, type,
|
||||
amd::alignUp(settings().stagedXferSize_, heap()->granularityB()));
|
||||
amd::alignUp(settings().stagedXferSize_, 4 * Ki));
|
||||
if ((xferWrite_ == NULL) || !xferWrite_->create()) {
|
||||
LogError("Couldn't allocate transfer buffer objects for read");
|
||||
return false;
|
||||
@@ -997,7 +974,7 @@ Device::initializeHeapResources()
|
||||
// Initialize staged read buffers
|
||||
if (settings().stagedXferRead_) {
|
||||
xferRead_ = new XferBuffers(*this, Resource::Remote,
|
||||
amd::alignUp(settings().stagedXferSize_, heap()->granularityB()));
|
||||
amd::alignUp(settings().stagedXferSize_, 4 * Ki));
|
||||
if ((xferRead_ == NULL) || !xferRead_->create()) {
|
||||
LogError("Couldn't allocate transfer buffer objects for write");
|
||||
return false;
|
||||
@@ -1086,52 +1063,6 @@ Device::createVirtualDevice(
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
Device::reallocHeap(size_t size, bool remoteAlloc)
|
||||
{
|
||||
size_t heapSize = heapSize_ + ((size != 0) ?
|
||||
amd::alignUp(size, settings().heapSizeGrowth_) : 0);
|
||||
Heap* oldHeap = heap_;
|
||||
// Maximum heap limit size = reported size + internal memory
|
||||
size_t maxHeapLimit = static_cast<size_t>(info().globalMemSize_) +
|
||||
// an extra 10MB for the alignments of allocations,
|
||||
// since the conformance test doesn't expect any
|
||||
10 * Mi;
|
||||
|
||||
if ((settings().heapSizeGrowth_ == 0) ||
|
||||
// Allow the heap growth up to the global memory limit
|
||||
(heapSize_ + size > maxHeapLimit)) {
|
||||
return false;
|
||||
}
|
||||
heapSize = std::min(maxHeapLimit, heapSize);
|
||||
|
||||
heap_ = new Heap(*this);
|
||||
|
||||
// Make sure we have allocated a new global heap
|
||||
if (NULL == heap_) {
|
||||
heap_ = oldHeap;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!heap_->create(heapSize, remoteAlloc)) {
|
||||
delete heap_;
|
||||
heap_ = oldHeap;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Copy the old heap to the new one
|
||||
if (!oldHeap->copyTo(heap_)) {
|
||||
delete heap_;
|
||||
heap_ = oldHeap;
|
||||
return false;
|
||||
}
|
||||
|
||||
delete oldHeap;
|
||||
heapSize_ = heapSize;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
device::Program*
|
||||
Device::createProgram(int oclVer)
|
||||
{
|
||||
@@ -1288,65 +1219,6 @@ Device::tearDown()
|
||||
}
|
||||
}
|
||||
|
||||
//! @note This function must be lock protected by the caller
|
||||
HeapBlock*
|
||||
Device::allocHeapBlock(size_t size) const
|
||||
{
|
||||
HeapBlock* hb = NULL;
|
||||
|
||||
// Allocate the underlying heap block
|
||||
hb = heap_->alloc(size);
|
||||
|
||||
// Virtual heap should never fail allocation
|
||||
if ((hb == NULL) && (!heap_->isVirtual())) {
|
||||
// Queues can't process commands,
|
||||
// while the global heap reallocation occurs.
|
||||
// So stall all queues and then reallocate the global heap
|
||||
ScopedLockVgpus lock(*this);
|
||||
|
||||
// Wait for idle
|
||||
for (uint idx = 0; idx < vgpus().size(); ++idx) {
|
||||
vgpus()[idx]->waitAllEngines();
|
||||
}
|
||||
|
||||
// Account for memory alignment of the new allocation
|
||||
size_t extraSpace = heap_->granularityB();
|
||||
if (size >= heap_->freeSpace()) {
|
||||
// Required extra space = requested size - free space
|
||||
extraSpace += size - heap_->freeSpace();
|
||||
}
|
||||
|
||||
//! @note the const cast here looks bad, but the device object
|
||||
// is lock protected above. The rest of the code
|
||||
// doesn't change the device object.
|
||||
// So the const methods can be safely used everywhere else.
|
||||
// In general we should avoid changing the device object after initialization
|
||||
|
||||
// Try to reallocate the heap with the same memory type
|
||||
if (const_cast<Device*>(this)->reallocHeap(extraSpace, settings().remoteAlloc_)) {
|
||||
hb = heap_->alloc(size);
|
||||
}
|
||||
|
||||
if (hb == NULL) {
|
||||
// Use reversed memory type as a temporary storage
|
||||
bool remoteAlloc = settings().remoteAlloc_ ^ true;
|
||||
|
||||
// Try to reallocate the heap
|
||||
if (const_cast<Device*>(this)->reallocHeap(extraSpace, remoteAlloc)) {
|
||||
// Back to the default location of the global heap
|
||||
remoteAlloc ^= true;
|
||||
if (!const_cast<Device*>(this)->reallocHeap(0, remoteAlloc)) {
|
||||
LogWarning("New memory type for the \
|
||||
global heap after reallocation!");
|
||||
}
|
||||
hb = heap_->alloc(size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return hb;
|
||||
}
|
||||
|
||||
gpu::Memory*
|
||||
Device::getGpuMemory(amd::Memory* mem) const
|
||||
{
|
||||
@@ -1392,99 +1264,20 @@ Device::createScratchBuffer(size_t size) const
|
||||
{
|
||||
Memory* gpuMemory = NULL;
|
||||
|
||||
// Use virtual heap allocation
|
||||
if (heap()->isVirtual()) {
|
||||
// Create a memory object
|
||||
gpuMemory = new gpu::Memory(*this, size);
|
||||
if (NULL == gpuMemory || !gpuMemory->create(Resource::Local)) {
|
||||
delete gpuMemory;
|
||||
gpuMemory = NULL;
|
||||
}
|
||||
}
|
||||
else {
|
||||
// We have to lock the heap block allocation,
|
||||
// so possible reallocation won't occur twice or
|
||||
// another thread could destroy a heap block,
|
||||
// while we didn't finish allocation
|
||||
amd::ScopedLock k(lockAsyncOps());
|
||||
|
||||
HeapBlock* hb = allocHeapBlock(size);
|
||||
if (hb != NULL) {
|
||||
// wrap it
|
||||
gpuMemory = new gpu::Memory(*this, *hb);
|
||||
|
||||
// Create resource
|
||||
if (NULL != gpuMemory) {
|
||||
Resource::ViewParams params;
|
||||
params.offset_ = hb->offset_;
|
||||
params.size_ = hb->size_;
|
||||
params.resource_ = &(globalMem());
|
||||
params.memory_ = NULL;
|
||||
if (!gpuMemory->create(Resource::View, &params)) {
|
||||
delete gpuMemory;
|
||||
gpuMemory = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return gpuMemory;
|
||||
}
|
||||
|
||||
gpu::Memory*
|
||||
Device::createBufferFromHeap(amd::Memory& owner) const
|
||||
{
|
||||
size_t size = owner.getSize();
|
||||
gpu::Memory* gpuMemory;
|
||||
|
||||
// We have to lock the heap block allocation,
|
||||
// so possible reallocation won't occur twice or
|
||||
// another thread could destroy a heap block,
|
||||
// while we didn't finish allocation
|
||||
amd::ScopedLock k(lockAsyncOps());
|
||||
|
||||
HeapBlock* hb = allocHeapBlock(size);
|
||||
if (hb == NULL) {
|
||||
LogError("We don't have enough video memory!");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Create a memory object
|
||||
gpuMemory = new gpu::Memory(*this, owner, hb);
|
||||
if (NULL == gpuMemory) {
|
||||
hb->setMemory(NULL);
|
||||
hb->free();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
Resource::ViewParams params;
|
||||
params.owner_ = &owner;
|
||||
params.offset_ = hb->offset_;
|
||||
params.size_ = hb->size_;
|
||||
params.resource_ = &(globalMem());
|
||||
params.memory_ = NULL;
|
||||
|
||||
if (!gpuMemory->create(Resource::View, &params)) {
|
||||
gpuMemory = new gpu::Memory(*this, size);
|
||||
if (NULL == gpuMemory || !gpuMemory->create(Resource::Local)) {
|
||||
delete gpuMemory;
|
||||
return NULL;
|
||||
gpuMemory = NULL;
|
||||
}
|
||||
|
||||
// Check if owner is interop memory
|
||||
if (owner.isInterop()) {
|
||||
if (!gpuMemory->createInterop(Memory::InteropHwEmulation)) {
|
||||
LogError("HW interop creation failed!");
|
||||
delete gpuMemory;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
return gpuMemory;
|
||||
}
|
||||
|
||||
gpu::Memory*
|
||||
Device::createBuffer(
|
||||
amd::Memory& owner,
|
||||
bool directAccess,
|
||||
bool bufferAlloc) const
|
||||
bool directAccess) const
|
||||
{
|
||||
size_t size = owner.getSize();
|
||||
gpu::Memory* gpuMemory;
|
||||
@@ -1504,39 +1297,7 @@ Device::createBuffer(
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!heap()->isVirtual()) {
|
||||
bool uhpAlloc =
|
||||
(owner.parent()->getMemFlags() & CL_MEM_USE_HOST_PTR) ? true : false;
|
||||
|
||||
if (owner.parent()->getType() != CL_MEM_OBJECT_IMAGE1D_BUFFER) {
|
||||
//! \note This extra line is necessary to make sure that subbuffer
|
||||
//! allocation is a synch operation,
|
||||
//! due to a possible realloc of heap(no VM) or parent(UHP)
|
||||
amd::ScopedLock k(lockAsyncOps());
|
||||
|
||||
//! @note: For now make sure the parent is allocated in the global heap
|
||||
//! or if it's the UHP optimization for prepinned memory
|
||||
if (((gpuParent->hb() == NULL) || uhpAlloc) &&
|
||||
!owner.parent()->reallocedDeviceMemory(this)) {
|
||||
if (reallocMemory(*owner.parent())) {
|
||||
gpuParent = getGpuMemory(owner.parent());
|
||||
}
|
||||
else {
|
||||
LogError("Can't reallocate the owner object for subbuffer allocation");
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return gpuParent->createBufferView(owner);
|
||||
}
|
||||
else {
|
||||
gpuParent = getGpuMemory(owner.parent()->parent());
|
||||
return gpuParent->createBufferView(*owner.parent()->parent());
|
||||
}
|
||||
}
|
||||
else {
|
||||
return gpuParent->createBufferView(owner);
|
||||
}
|
||||
return gpuParent->createBufferView(owner);
|
||||
}
|
||||
|
||||
Resource::MemoryType type = (owner.forceSysMemAlloc() || (owner.getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) ?
|
||||
@@ -1550,138 +1311,123 @@ Device::createBuffer(
|
||||
}
|
||||
|
||||
// Use direct access if it's possible
|
||||
if (bufferAlloc || (type == Resource::Remote)) {
|
||||
bool forceHeapAlloc = false;
|
||||
bool remoteAlloc = false;
|
||||
// Internal means VirtualDevice!=NULL
|
||||
bool internalAlloc = ((owner.getMemFlags() & CL_MEM_USE_HOST_PTR) &&
|
||||
(owner.getVirtualDevice() != NULL)) ? true : false;
|
||||
bool remoteAlloc = false;
|
||||
// Internal means VirtualDevice!=NULL
|
||||
bool internalAlloc = ((owner.getMemFlags() & CL_MEM_USE_HOST_PTR) &&
|
||||
(owner.getVirtualDevice() != NULL)) ? true : false;
|
||||
|
||||
// Create a memory object
|
||||
gpuMemory = new gpu::Buffer(*this, owner, owner.getSize());
|
||||
if (NULL == gpuMemory) {
|
||||
return NULL;
|
||||
}
|
||||
// Create a memory object
|
||||
gpuMemory = new gpu::Buffer(*this, owner, owner.getSize());
|
||||
if (NULL == gpuMemory) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Check if owner is interop memory
|
||||
if (owner.isInterop()) {
|
||||
result = gpuMemory->createInterop(Memory::InteropDirectAccess);
|
||||
}
|
||||
else if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) {
|
||||
// Attempt to allocate from persistent heap
|
||||
result = gpuMemory->create(Resource::Persistent);
|
||||
}
|
||||
else if (directAccess || (type == Resource::Remote)) {
|
||||
// Check for system memory allocations
|
||||
if ((owner.getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_HOST_PTR))
|
||||
|| (settings().remoteAlloc_)) {
|
||||
// Allocate remote memory if AHP allocation and context has just 1 device
|
||||
if ((owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR) &&
|
||||
(owner.getContext().devices().size() == 1)) {
|
||||
if (owner.getMemFlags() & (CL_MEM_READ_ONLY |
|
||||
CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
|
||||
// GPU will be reading from this host memory buffer,
|
||||
// so assume Host write into it
|
||||
type = Resource::RemoteUSWC;
|
||||
remoteAlloc = true;
|
||||
}
|
||||
// Check if owner is interop memory
|
||||
if (owner.isInterop()) {
|
||||
result = gpuMemory->createInterop(Memory::InteropDirectAccess);
|
||||
}
|
||||
else if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) {
|
||||
// Attempt to allocate from persistent heap
|
||||
result = gpuMemory->create(Resource::Persistent);
|
||||
}
|
||||
else if (directAccess || (type == Resource::Remote)) {
|
||||
// Check for system memory allocations
|
||||
if ((owner.getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_HOST_PTR))
|
||||
|| (settings().remoteAlloc_)) {
|
||||
// Allocate remote memory if AHP allocation and context has just 1 device
|
||||
if ((owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR) &&
|
||||
(owner.getContext().devices().size() == 1)) {
|
||||
if (owner.getMemFlags() & (CL_MEM_READ_ONLY |
|
||||
CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
|
||||
// GPU will be reading from this host memory buffer,
|
||||
// so assume Host write into it
|
||||
type = Resource::RemoteUSWC;
|
||||
remoteAlloc = true;
|
||||
}
|
||||
// Make sure owner has a valid hostmem pointer and it's not COPY
|
||||
if (!remoteAlloc && (owner.getHostMem() != NULL)) {
|
||||
Resource::PinnedParams params;
|
||||
params.owner_ = &owner;
|
||||
params.gpu_ =
|
||||
reinterpret_cast<VirtualGPU*>(owner.getVirtualDevice());
|
||||
}
|
||||
// Make sure owner has a valid hostmem pointer and it's not COPY
|
||||
if (!remoteAlloc && (owner.getHostMem() != NULL)) {
|
||||
Resource::PinnedParams params;
|
||||
params.owner_ = &owner;
|
||||
params.gpu_ =
|
||||
reinterpret_cast<VirtualGPU*>(owner.getVirtualDevice());
|
||||
|
||||
params.hostMemRef_ = owner.getHostMemRef();
|
||||
params.size_ = owner.getHostMemRef()->size();
|
||||
if (0 == params.size_) {
|
||||
params.size_ = owner.getSize();
|
||||
}
|
||||
// Create memory object
|
||||
result = gpuMemory->create(Resource::Pinned, &params);
|
||||
params.hostMemRef_ = owner.getHostMemRef();
|
||||
params.size_ = owner.getHostMemRef()->size();
|
||||
if (0 == params.size_) {
|
||||
params.size_ = owner.getSize();
|
||||
}
|
||||
// Create memory object
|
||||
result = gpuMemory->create(Resource::Pinned, &params);
|
||||
|
||||
// If direct access failed
|
||||
if (!result) {
|
||||
// and VM off, then force a heap allocation
|
||||
if (!heap()->isVirtual()) {
|
||||
// Internal pinning doesn't need a heap allocation
|
||||
if (!internalAlloc) {
|
||||
forceHeapAlloc = true;
|
||||
}
|
||||
}
|
||||
// Don't use cached allocation
|
||||
// if size is bigger than max single alloc
|
||||
if (owner.getSize() > info().maxMemAllocSize_) {
|
||||
delete gpuMemory;
|
||||
return NULL;
|
||||
}
|
||||
// If direct access failed
|
||||
if (!result) {
|
||||
// Don't use cached allocation
|
||||
// if size is bigger than max single alloc
|
||||
if (owner.getSize() > info().maxMemAllocSize_) {
|
||||
delete gpuMemory;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!result && !forceHeapAlloc &&
|
||||
// Make sure it's not internal alloc
|
||||
!internalAlloc) {
|
||||
Resource::CreateParams params;
|
||||
params.owner_ = &owner;
|
||||
params.gpu_ = static_cast<VirtualGPU*>(owner.getVirtualDevice());
|
||||
if (!result &&
|
||||
// Make sure it's not internal alloc
|
||||
!internalAlloc) {
|
||||
Resource::CreateParams params;
|
||||
params.owner_ = &owner;
|
||||
params.gpu_ = static_cast<VirtualGPU*>(owner.getVirtualDevice());
|
||||
|
||||
// Create memory object
|
||||
result = gpuMemory->create(type, &params);
|
||||
// Create memory object
|
||||
result = gpuMemory->create(type, &params);
|
||||
|
||||
// If allocation was successful
|
||||
if (result) {
|
||||
// Initialize if the memory is a pipe object
|
||||
if (owner.getType() == CL_MEM_OBJECT_PIPE) {
|
||||
// Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure.
|
||||
// Init with 3 DWORDS for 32bit addressing and 6 DWORDS for 64bit
|
||||
size_t pipeInit[3] = {0 , 0, owner.asPipe()->getMaxNumPackets()};
|
||||
gpuMemory->writeRawData(*xferQueue_, sizeof(pipeInit), pipeInit, true);
|
||||
// If allocation was successful
|
||||
if (result) {
|
||||
// Initialize if the memory is a pipe object
|
||||
if (owner.getType() == CL_MEM_OBJECT_PIPE) {
|
||||
// Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure.
|
||||
// Init with 3 DWORDS for 32bit addressing and 6 DWORDS for 64bit
|
||||
size_t pipeInit[3] = {0 , 0, owner.asPipe()->getMaxNumPackets()};
|
||||
gpuMemory->writeRawData(*xferQueue_, sizeof(pipeInit), pipeInit, true);
|
||||
}
|
||||
// If memory has direct access from host, then get CPU address
|
||||
if (gpuMemory->isHostMemDirectAccess() &&
|
||||
(type != Resource::ExternalPhysical)) {
|
||||
void* address = gpuMemory->map(NULL);
|
||||
if (address != NULL) {
|
||||
// Copy saved memory
|
||||
if (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) {
|
||||
memcpy(address, owner.getHostMem(), owner.getSize());
|
||||
}
|
||||
// It should be safe to change the host memory pointer,
|
||||
// because it's lock protected from the upper caller
|
||||
owner.setHostMem(address);
|
||||
}
|
||||
// If memory has direct access from host, then get CPU address
|
||||
if (gpuMemory->isHostMemDirectAccess() &&
|
||||
(type != Resource::ExternalPhysical)) {
|
||||
void* address = gpuMemory->map(NULL);
|
||||
if (address != NULL) {
|
||||
// Copy saved memory
|
||||
if (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) {
|
||||
memcpy(address, owner.getHostMem(), owner.getSize());
|
||||
}
|
||||
// It should be safe to change the host memory pointer,
|
||||
// because it's lock protected from the upper caller
|
||||
owner.setHostMem(address);
|
||||
}
|
||||
else {
|
||||
result = false;
|
||||
}
|
||||
}
|
||||
// An optimization for CHP. Copy memory and destroy sysmem allocation
|
||||
else if ((gpuMemory->memoryType() != Resource::Pinned) &&
|
||||
(owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) &&
|
||||
(owner.getContext().devices().size() == 1)) {
|
||||
amd::Coord3D origin(0, 0, 0);
|
||||
amd::Coord3D region(owner.getSize());
|
||||
static const bool Entire = true;
|
||||
if (xferMgr().writeBuffer(owner.getHostMem(),
|
||||
*gpuMemory, origin, region, Entire)) {
|
||||
// Clear CHP memory
|
||||
owner.setHostMem(NULL);
|
||||
}
|
||||
else {
|
||||
result = false;
|
||||
}
|
||||
}
|
||||
// An optimization for CHP. Copy memory and destroy sysmem allocation
|
||||
else if ((gpuMemory->memoryType() != Resource::Pinned) &&
|
||||
(owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) &&
|
||||
(owner.getContext().devices().size() == 1)) {
|
||||
amd::Coord3D origin(0, 0, 0);
|
||||
amd::Coord3D region(owner.getSize());
|
||||
static const bool Entire = true;
|
||||
if (xferMgr().writeBuffer(owner.getHostMem(),
|
||||
*gpuMemory, origin, region, Entire)) {
|
||||
// Clear CHP memory
|
||||
owner.setHostMem(NULL);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!result && !forceHeapAlloc) {
|
||||
delete gpuMemory;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (!result) {
|
||||
assert(!heap()->isVirtual() && "Can't have static heap allocation with VM");
|
||||
gpuMemory = createBufferFromHeap(owner);
|
||||
delete gpuMemory;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return gpuMemory;
|
||||
@@ -1703,10 +1449,10 @@ Device::createImage(amd::Memory& owner, bool directAccess) const
|
||||
}
|
||||
// Create a view on the specified device
|
||||
gpuImage = (gpu::Memory*)createView(owner, *devParent);
|
||||
if (heap()->isVirtual() && (NULL != gpuImage) && (gpuImage->owner() != NULL)) {
|
||||
if ((NULL != gpuImage) && (gpuImage->owner() != NULL)) {
|
||||
gpuImage->owner()->setHostMem((address)(owner.parent()->getHostMem()) + gpuImage->owner()->getOrigin());
|
||||
}
|
||||
return gpuImage ;
|
||||
return gpuImage;
|
||||
}
|
||||
|
||||
gpuImage = new gpu::Image(*this, owner,
|
||||
@@ -1778,11 +1524,11 @@ Device::createImage(amd::Memory& owner, bool directAccess) const
|
||||
(owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) &&
|
||||
(owner.getContext().devices().size() == 1)) {
|
||||
// Ignore copy for image1D_buffer, since it was already done for buffer
|
||||
if (heap()->isVirtual() && imageBuffer) {
|
||||
if (imageBuffer) {
|
||||
// Clear CHP memory
|
||||
owner.setHostMem(NULL);
|
||||
}
|
||||
else if (!imageBuffer) {
|
||||
else {
|
||||
amd::Coord3D origin(0, 0, 0);
|
||||
static const bool Entire = true;
|
||||
if (xferMgr().writeImage(owner.getHostMem(),
|
||||
@@ -1809,25 +1555,12 @@ Device::createMemory(
|
||||
amd::Memory& owner) const
|
||||
{
|
||||
bool directAccess = false;
|
||||
bool bufferAlloc = false;
|
||||
gpu::Memory* memory = NULL;
|
||||
|
||||
if (heap()->isVirtual()) {
|
||||
bufferAlloc = true;
|
||||
}
|
||||
//!@todo Remove this code when VM is always on.
|
||||
// Use zero-copy transfers for sysmem allocations or persistent memory
|
||||
else {
|
||||
if (owner.getMemFlags() & (CL_MEM_ALLOC_HOST_PTR |
|
||||
CL_MEM_USE_HOST_PTR)) {
|
||||
bufferAlloc = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (owner.asBuffer()) {
|
||||
directAccess = (settings().hostMemDirectAccess_ & Settings::HostMemBuffer)
|
||||
? true : false;
|
||||
memory = createBuffer(owner, directAccess, bufferAlloc);
|
||||
memory = createBuffer(owner, directAccess);
|
||||
}
|
||||
else if (owner.asImage()) {
|
||||
directAccess = (settings().hostMemDirectAccess_ & Settings::HostMemImage)
|
||||
@@ -1878,7 +1611,6 @@ bool
|
||||
Device::reallocMemory(amd::Memory& owner) const
|
||||
{
|
||||
bool directAccess = false;
|
||||
bool bufferAlloc = heap()->isVirtual();
|
||||
|
||||
// For now we have to serialize reallocation code
|
||||
amd::ScopedLock lk(*lockAsyncOps_);
|
||||
@@ -1889,35 +1621,18 @@ Device::reallocMemory(amd::Memory& owner) const
|
||||
if (gpuMemory == NULL) {
|
||||
return false;
|
||||
}
|
||||
if (gpuMemory->hb() != NULL) {
|
||||
|
||||
if (gpuMemory->pinOffset() == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (bufferAlloc) {
|
||||
if (gpuMemory->pinOffset() == 0) {
|
||||
return true;
|
||||
}
|
||||
else if (NULL != owner.parent()) {
|
||||
if (!reallocMemory(*owner.parent())) {
|
||||
return false;
|
||||
}
|
||||
else if (NULL != owner.parent()) {
|
||||
if (!reallocMemory(*owner.parent())) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (owner.asBuffer()) {
|
||||
// Disable remote allocation if no VM
|
||||
if ((gpuMemory != NULL) &&
|
||||
((gpuMemory->memoryType() == Resource::Remote) ||
|
||||
(gpuMemory->memoryType() == Resource::RemoteUSWC)) && !bufferAlloc) {
|
||||
// Make sure we don't have a stale memory in VA cache before reallocation
|
||||
// of system memory.
|
||||
// \note: the app must unmap() memory before kernel launch
|
||||
removeVACache(gpuMemory);
|
||||
static const bool forceAllocHostMem = true;
|
||||
static const bool forceCopy = true;
|
||||
owner.allocHostMemory(owner.getHostMem(), forceAllocHostMem, forceCopy);
|
||||
}
|
||||
gpuMemory = createBuffer(owner, directAccess, bufferAlloc);
|
||||
gpuMemory = createBuffer(owner, directAccess);
|
||||
}
|
||||
else if (owner.asImage()) {
|
||||
return true;
|
||||
@@ -2113,24 +1828,18 @@ Device::globalFreeMemory(size_t* freeMemory) const
|
||||
if (!(const_cast<Device*>(this)->initializeHeapResources())) {
|
||||
return false;
|
||||
}
|
||||
if (heap()->isVirtual()) {
|
||||
gslMemInfo memInfo = {0};
|
||||
gslCtx()->getMemInfo(&memInfo, GSL_MEMINFO_BASIC);
|
||||
|
||||
// Fill free memory info
|
||||
freeMemory[TotalFreeMemory] = (memInfo.cardMemAvailableBytes +
|
||||
memInfo.cardExtMemAvailableBytes) / Ki;
|
||||
freeMemory[LargestFreeBlock] = std::max(memInfo.cardLargestFreeBlockBytes,
|
||||
memInfo.cardExtLargestFreeBlockBytes) / Ki;
|
||||
if (settings().apuSystem_) {
|
||||
freeMemory[TotalFreeMemory] += memInfo.agpMemAvailableBytes / Ki;
|
||||
freeMemory[LargestFreeBlock] += memInfo.agpLargestFreeBlockBytes / Ki;
|
||||
}
|
||||
}
|
||||
else {
|
||||
freeMemory[TotalFreeMemory] = static_cast<size_t>((info().globalMemSize_ -
|
||||
static_cast<cl_ulong>(heapSize_) + heap()->freeSpace()) / Ki);
|
||||
freeMemory[LargestFreeBlock] = freeMemory[TotalFreeMemory];
|
||||
gslMemInfo memInfo = {0};
|
||||
gslCtx()->getMemInfo(&memInfo, GSL_MEMINFO_BASIC);
|
||||
|
||||
// Fill free memory info
|
||||
freeMemory[TotalFreeMemory] = (memInfo.cardMemAvailableBytes +
|
||||
memInfo.cardExtMemAvailableBytes) / Ki;
|
||||
freeMemory[LargestFreeBlock] = std::max(memInfo.cardLargestFreeBlockBytes,
|
||||
memInfo.cardExtLargestFreeBlockBytes) / Ki;
|
||||
if (settings().apuSystem_) {
|
||||
freeMemory[TotalFreeMemory] += memInfo.agpMemAvailableBytes / Ki;
|
||||
freeMemory[LargestFreeBlock] += memInfo.agpLargestFreeBlockBytes / Ki;
|
||||
}
|
||||
|
||||
return true;
|
||||
|
||||
@@ -125,8 +125,7 @@ protected:
|
||||
const CALdeviceattribs& calAttr, //!< CAL device attributes info
|
||||
const gslMemInfo& memInfo, //!< GSL mem info
|
||||
size_t maxTextureSize, //!< Maximum texture size supported in HW
|
||||
uint numComputeRings, //!< Number of compute rings
|
||||
bool isVirtualMode //!< Device is in virtual mode
|
||||
uint numComputeRings //!< Number of compute rings
|
||||
);
|
||||
};
|
||||
|
||||
@@ -184,6 +183,32 @@ private:
|
||||
class Device : public NullDevice, public CALGSLDevice
|
||||
{
|
||||
public:
|
||||
class Heap : public amd::EmbeddedObject
|
||||
{
|
||||
public:
|
||||
//! The size of a heap element in bytes
|
||||
static const size_t ElementSize = 4;
|
||||
|
||||
//! The type of a heap element in bytes
|
||||
static const cmSurfFmt ElementType = CM_SURF_FMT_R32I;
|
||||
|
||||
Heap(): resource_(NULL), baseAddress_(0) {}
|
||||
|
||||
bool create(
|
||||
Device& device //!< GPU device object
|
||||
);
|
||||
|
||||
//! Gets the GPU resource associated with the global heap
|
||||
const Resource& resource() const { return *resource_; }
|
||||
|
||||
//! Returns the base virtual address of the heap
|
||||
uint64_t baseAddress() const { return baseAddress_; }
|
||||
|
||||
protected:
|
||||
Resource* resource_; //!< GPU resource referencing the heap memory
|
||||
uint64_t baseAddress_; //!< Virtual heap base address
|
||||
};
|
||||
|
||||
//! Locks any access to the virtual GPUs
|
||||
class ScopedLockVgpus : public amd::StackObject {
|
||||
public:
|
||||
@@ -377,12 +402,6 @@ public:
|
||||
//! Destructor for the physical GPU device
|
||||
virtual ~Device();
|
||||
|
||||
//! Reallocates current global heap
|
||||
bool reallocHeap(
|
||||
size_t size, //!< requested size for reallocation
|
||||
bool remoteAlloc //!< allocate the new heap in remote memory
|
||||
);
|
||||
|
||||
//! Instantiate a new virtual device
|
||||
device::VirtualDevice* createVirtualDevice(
|
||||
amd::CommandQueue* queue = NULL
|
||||
@@ -442,15 +461,10 @@ public:
|
||||
) const;
|
||||
|
||||
//! Gets the GPU resource associated with the global heap
|
||||
const Resource& globalMem() const { return heap_->resource(); }
|
||||
const Resource& globalMem() const { return heap_.resource(); }
|
||||
|
||||
//! Gets the global heap object
|
||||
const Heap* heap() const { return heap_; }
|
||||
|
||||
//! Allocates a heap block from the global heap
|
||||
HeapBlock* allocHeapBlock(
|
||||
size_t size //!< The heap block size for allocation
|
||||
) const;
|
||||
const Heap& heap() const { return heap_; }
|
||||
|
||||
//! Gets the memory object for the dummy page
|
||||
amd::Memory* dummyPage() const { return dummyPage_; }
|
||||
@@ -566,16 +580,10 @@ private:
|
||||
//! Sends the stall command to all queues
|
||||
bool stallQueues();
|
||||
|
||||
//! Buffer allocation from static heap (no VM mode only)
|
||||
gpu::Memory* createBufferFromHeap(
|
||||
amd::Memory& owner //!< Abstraction layer memory object
|
||||
) const;
|
||||
|
||||
//! Buffer allocation
|
||||
gpu::Memory* createBuffer(
|
||||
amd::Memory& owner, //!< Abstraction layer memory object
|
||||
bool directAccess, //!< Use direct host memory access
|
||||
bool bufferAlloc //!< If TRUE, then don't use heap
|
||||
bool directAccess //!< Use direct host memory access
|
||||
) const;
|
||||
|
||||
//! Image allocation
|
||||
@@ -591,8 +599,7 @@ private:
|
||||
);
|
||||
|
||||
amd::Context* context_; //!< A dummy context for internal allocations
|
||||
size_t heapSize_; //!< The global heap size
|
||||
Heap* heap_; //!< GPU heap manager
|
||||
Heap heap_; //!< GPU global heap
|
||||
amd::Memory* dummyPage_; //!< A dummy page for NULL pointer
|
||||
|
||||
amd::Monitor* lockAsyncOps_; //!< Lock to serialise all async ops on this device
|
||||
|
||||
@@ -1,536 +0,0 @@
|
||||
//! Implementation of GPU device memory management
|
||||
|
||||
#include "top.hpp"
|
||||
#include "thread/thread.hpp"
|
||||
#include "thread/monitor.hpp"
|
||||
#include "device/device.hpp"
|
||||
#include "device/gpu/gpuheap.hpp"
|
||||
#include "device/gpu/gpudevice.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
|
||||
//! Turn this on to enable sanity checks before and after every heap operation.
|
||||
#if DEBUG
|
||||
#define EXTRA_HEAP_CHECKS 1
|
||||
#endif // DEBUG
|
||||
|
||||
namespace gpu {
|
||||
|
||||
// The GPU heap. Very simple implementation for now.
|
||||
//! Builds an empty, non-virtual heap for \a device.
//! Nothing is allocated here: the resource and the block lists stay NULL
//! until create() is called.
Heap::Heap(Device& device)
    : resource_(NULL)
    , freeList_(NULL)
    , busyList_(NULL)
    , freeSize_(0)
    , device_(device)
    , granularity_(Heap::MinGranularity)
    , lock_("GPU heap lock", true)
    , virtualMode_(false)
    , baseAddress_(0)
{
    // All state is set up by the initializer list
}
|
||||
|
||||
size_t
|
||||
Heap::granularityB() const
|
||||
{
|
||||
return granularity_ * Heap::ElementSize;
|
||||
}
|
||||
|
||||
bool
|
||||
Heap::create(size_t totalSize, bool remoteAlloc)
|
||||
{
|
||||
Resource::MemoryType memType;
|
||||
size_t maxHeight = device_.info().image2DMaxHeight_;
|
||||
size_t sizeInElements;
|
||||
size_t npages;
|
||||
|
||||
freeSize_ = totalSize;
|
||||
|
||||
sizeInElements = (totalSize + Heap::ElementSize - 1) / Heap::ElementSize;
|
||||
|
||||
// Calculate best granularity given the size and device characteristics
|
||||
npages = amd::alignUp(sizeInElements, granularity_) / granularity_;
|
||||
|
||||
// Create a new GPU resource
|
||||
resource_ = new Resource(device_, sizeInElements, Heap::ElementType);
|
||||
|
||||
if (resource_ == NULL) {
|
||||
return false;
|
||||
}
|
||||
|
||||
memType = (remoteAlloc) ? Resource::RemoteUSWC : Resource::Local;
|
||||
|
||||
if (!resource_->create(memType, NULL, true)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Set up initial free list
|
||||
freeList_ = new HeapBlock(this, npages * granularityB(), 0, NULL, NULL);
|
||||
if (freeList_ == NULL) {
|
||||
return false;
|
||||
}
|
||||
|
||||
guarantee(isSane());
|
||||
return true;
|
||||
}
|
||||
|
||||
//! Releases every heap block (busy first, then free) and finally the
//! backing GPU resource. Runs with the heap lock held.
Heap::~Heap()
{
    amd::ScopedLock k(lock_);

    guarantee(isSane());

    // Give back all busy blocks; the successor is read up front because
    // free() unlinks the block from the busy list
    for (HeapBlock* blk = busyList_; blk != NULL; ) {
        HeapBlock* const following = blk->next_;
        blk->free();
        blk = following;
    }

    // Destroy whatever is on the free list
    for (HeapBlock* blk = freeList_; blk != NULL; ) {
        HeapBlock* const following = blk->next_;
        delete blk;
        blk = following;
    }

    // Release resource
    delete resource_;
}
|
||||
|
||||
//! Allocates a block of at least \a size bytes from the free list.
//!
//! The size is rounded up to granularityB(). An exactly matching free block
//! is used as is; otherwise the smallest larger block (best fit) is split
//! and its tail is returned.
//!
//! \return the block, now on the busy list, or NULL if nothing fits
HeapBlock*
Heap::alloc(size_t size)
{
    amd::ScopedLock k(lock_);
    HeapBlock* bestFit = NULL;

    guarantee(isSane());

    // Round size
    size = amd::alignUp(size, granularityB());

    //! @todo:dgladdin: experiment with switching back to first-fit
    for (HeapBlock* cur = freeList_; cur != NULL; cur = cur->next_) {
        if (cur->size_ == size) {
            // Perfect fit: no split needed, just move it to the busy list
            detachBlock(&freeList_, cur);
            cur->inUse_ = true;
            insertBlock(&busyList_, cur);
            guarantee(isSane());
            freeSize_ -= size;
            return cur;
        }

        // Remember the smallest block that is still large enough
        if ((cur->size_ > size) &&
            ((bestFit == NULL) || (cur->size_ < bestFit->size_))) {
            bestFit = cur;
        }
    }

    if (bestFit == NULL) {
        // No free block available
        guarantee(isSane());
        return NULL;
    }

    // The head of the chosen block stays on the free list,
    // the tail of the requested size goes to the busy list
    HeapBlock* const tail = splitBlock(bestFit, size);
    tail->inUse_ = true;
    insertBlock(&busyList_, tail);
    guarantee(isSane());
    freeSize_ -= size;
    return tail;
}
|
||||
|
||||
bool
|
||||
Heap::copyTo(Heap* heap)
|
||||
{
|
||||
HeapBlock *walk;
|
||||
|
||||
walk = busyList_;
|
||||
while (walk) {
|
||||
if (walk->getMemory() != NULL) {
|
||||
HeapBlock* hb = heap->alloc(walk->size_);
|
||||
if (hb == NULL) {
|
||||
return false;
|
||||
}
|
||||
hb->setMemory(walk->getMemory());
|
||||
|
||||
walk->destroyViewsMemory();
|
||||
if (!walk->getMemory()->reallocate(hb, &(heap->resource()))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!walk->reallocateViews(hb,
|
||||
static_cast<size_t>(hb->offset_ - walk->offset_))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
walk = walk->next_;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
Heap::free(HeapBlock* blk)
|
||||
{
|
||||
amd::ScopedLock k(lock_);
|
||||
guarantee(isSane());
|
||||
detachBlock(&busyList_, blk);
|
||||
blk->inUse_ = false;
|
||||
freeSize_ += blk->size_;
|
||||
mergeBlock(&freeList_, blk);
|
||||
guarantee(isSane());
|
||||
}
|
||||
|
||||
void
|
||||
Heap::detachBlock(HeapBlock** list, HeapBlock* blk)
|
||||
{
|
||||
// Sanity checks
|
||||
guarantee(isSane());
|
||||
|
||||
if (*list == blk) {
|
||||
*list = blk->next_;
|
||||
}
|
||||
|
||||
if (blk->prev_) {
|
||||
blk->prev_->next_ = blk->next_;
|
||||
}
|
||||
if (blk->next_) {
|
||||
blk->next_->prev_ = blk->prev_;
|
||||
}
|
||||
// no heap sanity check as blk is now floating
|
||||
}
|
||||
|
||||
void
|
||||
Heap::insertBlock(HeapBlock** head, HeapBlock* blk)
|
||||
{
|
||||
if (NULL == *head) {
|
||||
*head = blk;
|
||||
blk->prev_ = NULL;
|
||||
blk->next_ = NULL;
|
||||
guarantee(isSane());
|
||||
return;
|
||||
}
|
||||
|
||||
// Find the place to insert it at
|
||||
HeapBlock* walk = *head;
|
||||
while (walk->next_ && walk->next_->offset_ < blk->offset_) {
|
||||
walk = walk->next_;
|
||||
}
|
||||
|
||||
// Insert it
|
||||
if (walk == *head) {
|
||||
if (walk->offset_ >= blk->offset_) {
|
||||
*head = blk;
|
||||
blk->prev_ = NULL;
|
||||
blk->next_ = walk;
|
||||
walk->prev_ = *head;
|
||||
guarantee(isSane());
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
blk->next_ = walk->next_;
|
||||
blk->prev_ = walk;
|
||||
if (walk->next_) {
|
||||
walk->next_->prev_ = blk;
|
||||
}
|
||||
walk->next_ = blk;
|
||||
guarantee(isSane());
|
||||
}
|
||||
|
||||
//! Cuts \a tailsize bytes off the end of the free block \a blk.
//!
//! \a blk keeps the leading part; a new, unlinked block describing the
//! trailing part is returned. \a blk must be strictly larger than
//! \a tailsize and must not be in use.
HeapBlock*
Heap::splitBlock(HeapBlock* blk, size_t tailsize)
{
    // Sanity checks
    guarantee(isSane());
    guarantee(blk->size_ > tailsize && "block too small to split as requested");
    guarantee(!blk->inUse_ && "can't split in-use block");

    // Size that remains with the original block
    const size_t headSize = blk->size_ - tailsize;

    // The new block starts where the shrunken original ends
    HeapBlock* const tail =
        new HeapBlock(blk->owner_, tailsize, blk->offset_ + headSize);

    blk->size_ = headSize;

    // no heap sanity check here as the new block hasn't been plugged in yet
    return tail;
}
|
||||
|
||||
//! Join two blocks, transferring the size of the second into the first and deleting
|
||||
//! the second. Utility fn for mergeBlock()
|
||||
|
||||
static void
|
||||
join2Blocks(HeapBlock* first, HeapBlock* second)
|
||||
{
|
||||
// Sanity checks
|
||||
|
||||
guarantee(first->size_ > 0 && "first block invalid");
|
||||
guarantee(!first->inUse_ && "can't join an in-use block");
|
||||
guarantee(second->size_ > 0 && "second block invalid");
|
||||
guarantee(first->offset_ + first->size_ == second->offset_);
|
||||
|
||||
// Do the join
|
||||
first->size_ = first->size_ + second->size_;
|
||||
first->next_ = second->next_;
|
||||
if (second->next_) {
|
||||
second->next_->prev_ = first;
|
||||
}
|
||||
delete second;
|
||||
}
|
||||
|
||||
//! Insert a block into a list, merging it with adjacent blocks if possible. Must be called
|
||||
//! under a lock, cannot be used on in-use blocks or blocks with an associated resource alias.
|
||||
|
||||
void
|
||||
Heap::mergeBlock(HeapBlock** head, HeapBlock* blk)
|
||||
{
|
||||
insertBlock(head, blk);
|
||||
|
||||
// Merge with successor if possible
|
||||
if ((blk->next_ != NULL) &&
|
||||
(blk->offset_ + blk->size_ == blk->next_->offset_)) {
|
||||
join2Blocks(blk, blk->next_);
|
||||
}
|
||||
|
||||
// Merge with predecessor if possible
|
||||
if ((blk->prev_ != NULL) &&
|
||||
(blk->prev_->offset_ + blk->prev_->size_ == blk->offset_)) {
|
||||
join2Blocks(blk->prev_, blk);
|
||||
}
|
||||
|
||||
guarantee(isSane());
|
||||
}
|
||||
|
||||
//! Sanity check for both types of block (helper function for Heap::isSane())
|
||||
|
||||
static bool
|
||||
isBlockSane(HeapBlock* b)
|
||||
{
|
||||
return (b->owner_ != NULL
|
||||
&& (b->next_ == NULL || b->next_->prev_ == b)
|
||||
&& (b->prev_ == NULL || b->prev_->next_ == b));
|
||||
}
|
||||
|
||||
//! Sanity check for an individual free block (helper function for Heap::isSane())
|
||||
static bool
|
||||
isFreeBlockSane(HeapBlock* b)
|
||||
{
|
||||
if (isBlockSane(b) && !b->inUse_) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
//! Sanity check for an individual busy block (helper function for Heap::isSane())
|
||||
static bool
|
||||
isBusyBlockSane(HeapBlock* b)
|
||||
{
|
||||
if (isBlockSane(b) && b->inUse_) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
//! Sanity check for the heap.
|
||||
|
||||
bool
|
||||
Heap::isSane() const
|
||||
{
|
||||
// If we got this far, everything is (probably) OK
|
||||
#if EXTRA_HEAP_CHECKS
|
||||
HeapBlock* walkFree = freeList_; // Free list position
|
||||
HeapBlock* walkBusy = busyList_; // Busy list position
|
||||
size_t offset = 0; // Current offset
|
||||
|
||||
// We can have zero lists if Heap allocation fails
|
||||
if (walkFree == NULL && walkBusy == NULL) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Walk both lists in parallel
|
||||
while (walkFree != NULL || walkBusy != NULL) {
|
||||
if (walkFree != NULL && walkFree->offset_ == offset) {
|
||||
if (!isFreeBlockSane(walkFree)) {
|
||||
return false;
|
||||
}
|
||||
offset += walkFree->size_;
|
||||
walkFree = walkFree->next_;
|
||||
}
|
||||
else if (walkBusy != NULL && walkBusy->offset_ == offset) {
|
||||
if (!isBusyBlockSane(walkBusy)) {
|
||||
return false;
|
||||
}
|
||||
offset += walkBusy->size_;
|
||||
walkBusy = walkBusy->next_;
|
||||
}
|
||||
else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
#endif // EXTRA_HEAP_CHECKS
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
HeapBlock::destroyViewsMemory()
|
||||
{
|
||||
if ((parent_ != NULL) && (0 == views_.size())) {
|
||||
memory_->free();
|
||||
}
|
||||
else if (views_.size() != 0) {
|
||||
std::list<HeapBlock*>::const_iterator it;
|
||||
for (it = views_.begin(); it != views_.end(); ++it) {
|
||||
(*it)->destroyViewsMemory();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
HeapBlock::reallocateViews(HeapBlock* parent, size_t shift)
|
||||
{
|
||||
if (views_.size() != 0) {
|
||||
std::list<HeapBlock*>::const_iterator it;
|
||||
|
||||
// Loop through all views and reallocate them
|
||||
for (it = views_.begin(); it != views_.end(); ++it) {
|
||||
// Get the view HeapBlock
|
||||
HeapBlock* hb = (*it);
|
||||
|
||||
// Readjust the offset
|
||||
hb->offset_ += shift;
|
||||
// Add to the list if we have a new parent
|
||||
if (parent != this) {
|
||||
parent->addView(hb);
|
||||
}
|
||||
|
||||
// Reallocate memory
|
||||
hb->memory_->reallocate(hb, parent->getMemory());
|
||||
|
||||
// Process a view on view if available
|
||||
if (!hb->reallocateViews(hb, shift)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Destroy old list
|
||||
if (parent != this) {
|
||||
views_.clear();
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
//! Destructor. Frees the block if in use and does some final sanity checks.
|
||||
HeapBlock::~HeapBlock()
|
||||
{
|
||||
if (NULL != owner_) {
|
||||
if (inUse_) {
|
||||
owner_->free(this);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// View destruction
|
||||
if (parent_ != NULL) {
|
||||
assert(((parent_->getMemory() != NULL) && (parent_->getMemory()->owner() != NULL)));
|
||||
amd::ScopedLock lock(parent_->getMemory()->owner()->lockMemoryOps());
|
||||
parent_->removeView(this);
|
||||
}
|
||||
}
|
||||
guarantee(size_ > 0 && "destructor called for zero-size heap block (destructor called twice?)");
|
||||
size_ = 0; // Mark as invalid
|
||||
|
||||
if (views_.size() != 0) {
|
||||
LogError("Can't destroy a resource if we still have views!");
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
HeapBlock::free()
|
||||
{
|
||||
if (NULL != owner_) {
|
||||
owner_->free(this);
|
||||
}
|
||||
else {
|
||||
// It's a view. Destroy the object
|
||||
delete this;
|
||||
}
|
||||
}
|
||||
|
||||
//! Builds a heap object for \a device and flags it as virtual
VirtualHeap::VirtualHeap(Device& device)
    : Heap(device)
{
    // The base class constructor clears the flag; turn it on here
    virtualMode_ = true;
}
|
||||
|
||||
bool
|
||||
VirtualHeap::create(
|
||||
size_t totalSize,
|
||||
bool remoteAlloc)
|
||||
{
|
||||
// Create a new GPU resource
|
||||
resource_ = new Resource(device_, 0, Heap::ElementType);
|
||||
if (resource_ == NULL) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!resource_->create(Resource::Heap)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!device_.settings().hsail_) {
|
||||
baseAddress_ = resource_->gslResource()->getSurfaceAddress();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
//! Nothing to release here; the base class destructor does the cleanup
VirtualHeap::~VirtualHeap()
{
    // Intentionally empty
}
|
||||
|
||||
//! Not expected to be called on a virtual heap: asserts and returns NULL
HeapBlock*
VirtualHeap::alloc(size_t size)
{
    assert(false && "Dead branch!");
    HeapBlock* const none = NULL;
    return none;
}
|
||||
|
||||
void
|
||||
VirtualHeap::free(HeapBlock* blk)
|
||||
{
|
||||
assert(false && "Dead branch!");
|
||||
}
|
||||
|
||||
bool
|
||||
VirtualHeap::copyTo(Heap* heap)
|
||||
{
|
||||
assert(false && "Dead branch!");
|
||||
return false;
|
||||
}
|
||||
|
||||
bool
|
||||
VirtualHeap::isSane(void) const
|
||||
{
|
||||
assert(false && "Dead branch!");
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace gpu
|
||||
@@ -1,225 +0,0 @@
|
||||
//! Declarations for GPU memory management
|
||||
|
||||
#ifndef GPUHEAP_HPP_
|
||||
#define GPUHEAP_HPP_
|
||||
|
||||
#include "top.hpp"
|
||||
#include "thread/atomic.hpp"
|
||||
#include "device/gpu/gpudefs.hpp"
|
||||
|
||||
/*! \addtogroup GPU
|
||||
* @{
|
||||
*/
|
||||
|
||||
//! GPU Device Implementation
|
||||
|
||||
namespace gpu {
|
||||
|
||||
class Device;
|
||||
class Heap;
|
||||
class Resource;
|
||||
class Memory;
|
||||
class VirtualGPU;
|
||||
|
||||
//! @todo:dgladdin: The heap list should be singly-linked
|
||||
|
||||
//! \brief A block on the GPU heap.
|
||||
//!
|
||||
//! Note that no code outside of the gpumemory.hpp/.cpp pair should touch this
|
||||
//! class directly as it is not thread-safe. In general, this class should be
|
||||
//! pretty much a struct and contain as little functionality as possible - just
|
||||
//! a constructor, destructor.
|
||||
//!
|
||||
//! Any other methods - in particular, anything that talks to CAL - should be no
|
||||
//! more than proxies for functionality implemented in Heap, as Heap is aware
|
||||
//! of the lock state.
|
||||
|
||||
class HeapBlock : public amd::HeapObject
|
||||
{
|
||||
public:
|
||||
//! Constructor
|
||||
HeapBlock(
|
||||
Heap* owner = NULL,
|
||||
size_t size = 0,
|
||||
size_t offset = 0,
|
||||
HeapBlock* next=NULL,
|
||||
HeapBlock* prev=NULL)
|
||||
: owner_(owner)
|
||||
, size_(size)
|
||||
, offset_(offset)
|
||||
, next_(next)
|
||||
, prev_(prev)
|
||||
, inUse_(false)
|
||||
, parent_(NULL)
|
||||
, memory_(NULL)
|
||||
{}
|
||||
|
||||
//! Destructor does some sanity checks.
|
||||
~HeapBlock();
|
||||
|
||||
//! Frees a heap block, returning its memory to the owning heap (proxy)
|
||||
void free();
|
||||
|
||||
//! Sets the GPU memory object associated with the heap block
|
||||
void setMemory(Memory* memory) { memory_ = memory; }
|
||||
|
||||
//! Gets the GPU memory object associated with the heap block
|
||||
Memory* getMemory() const { return memory_; }
|
||||
|
||||
//! Adds a heapblock view to the list of views
|
||||
void addView(HeapBlock* hb)
|
||||
{ views_.push_back(hb); hb->parent_ = this; }
|
||||
|
||||
//! Removes a heapblock view from the list of views
|
||||
void removeView(HeapBlock* hb) { views_.remove(hb); }
|
||||
|
||||
//! Destroys all views
|
||||
void destroyViewsMemory();
|
||||
|
||||
//! Creates all new views
|
||||
bool reallocateViews(
|
||||
HeapBlock* parent, //!< Parent heap block
|
||||
size_t shift //!< The new HeapBlock shift
|
||||
);
|
||||
|
||||
//! Gets the offset
|
||||
size_t offset() const { return offset_; }
|
||||
|
||||
Heap* owner_; //!< Heap that owns this block
|
||||
size_t size_; //!< Size of the block in bytes
|
||||
size_t offset_; //!< Offset of this block in the heap
|
||||
HeapBlock* next_; //!< Next block on the list, or NULL
|
||||
HeapBlock* prev_; //!< Previous block on the list, or NULL
|
||||
bool inUse_; //!< true if the block is in use
|
||||
HeapBlock* parent_; //!< The parent heap block for a view
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
HeapBlock(const HeapBlock&);
|
||||
|
||||
//! Disable assignment
|
||||
HeapBlock& operator=(const HeapBlock&);
|
||||
|
||||
Memory* memory_; //!< Memory object associated with the heap block
|
||||
std::list<HeapBlock*> views_; //!< The list of all allocated views
|
||||
};
|
||||
|
||||
class Heap : public amd::HeapObject
|
||||
{
|
||||
public:
|
||||
//! Minimal supported CAL granularity = 256 bytes / ElementSize
|
||||
static const size_t MinGranularity = 64;
|
||||
|
||||
//! The size of a heap element in bytes
|
||||
static const size_t ElementSize = 4;
|
||||
|
||||
//! The type of a heap element in bytes
|
||||
static const cmSurfFmt ElementType = CM_SURF_FMT_R32I;
|
||||
|
||||
Heap(
|
||||
Device& device //!< GPU device object
|
||||
);
|
||||
|
||||
virtual bool create(
|
||||
size_t totalSize, //!< total size of the allocated heap (bytes)
|
||||
bool remoteAlloc //!< allocate the heap in remote memory
|
||||
);
|
||||
|
||||
//! Heap destructor
|
||||
virtual ~Heap();
|
||||
|
||||
/*!
|
||||
* \brief Allocates memory from a heap (best-fit).
|
||||
* We round up to 4k granularity for alignment.
|
||||
*
|
||||
* \return A pointer to allocated heap block object.
|
||||
*/
|
||||
virtual HeapBlock* alloc(
|
||||
size_t size //! The allocation size
|
||||
);
|
||||
|
||||
//! Release memory back to a heap.
|
||||
virtual void free(HeapBlock* blk);
|
||||
|
||||
//! Copies this heap to another
|
||||
virtual bool copyTo(Heap* heap);
|
||||
|
||||
//! Gets the GPU resource associated with the global heap
|
||||
const Resource& resource() const { return *resource_; }
|
||||
|
||||
//! Read the page size (bytes)
|
||||
size_t granularityB() const;
|
||||
|
||||
//! Read the total free space (bytes)
|
||||
size_t freeSpace() const { return freeSize_; }
|
||||
|
||||
virtual bool isSane(void) const; //!< Checks heap sanity
|
||||
|
||||
//! Returns true if we have a virtual heap
|
||||
bool isVirtual() const { return virtualMode_; }
|
||||
|
||||
//! Returns the base virtual address of the heap
|
||||
uint64_t baseAddress() const { return baseAddress_; }
|
||||
|
||||
private:
|
||||
//! Insert a block into a list. Must be called under a lock.
|
||||
void insertBlock(HeapBlock** list, HeapBlock* node);
|
||||
|
||||
//! Merge a block into a list. Must be called under a lock.
|
||||
void mergeBlock(HeapBlock** list, HeapBlock* node);
|
||||
|
||||
//! Remove a block from a list. Must be called under a lock.
|
||||
void detachBlock(HeapBlock** list, HeapBlock* node);
|
||||
|
||||
//! Split a block into two pieces
|
||||
HeapBlock* splitBlock(HeapBlock* node, size_t size);
|
||||
|
||||
protected:
|
||||
Resource* resource_; //!< GPU resource referencing the heap memory
|
||||
HeapBlock* freeList_; //!< Head block for free list
|
||||
HeapBlock* busyList_; //!< Head block for busy list
|
||||
size_t freeSize_; //!< total free size of the heap
|
||||
Device& device_; //!< Device that owns this heap
|
||||
size_t granularity_; //!< Size of an allocation page
|
||||
amd::Monitor lock_; //!< Lock to serialise heap accesses
|
||||
bool virtualMode_; //!< Virtual mode
|
||||
uint64_t baseAddress_; //!< Virtual heap base address
|
||||
};
|
||||
|
||||
class VirtualHeap : public Heap
|
||||
{
|
||||
public:
|
||||
VirtualHeap(
|
||||
Device& device //!< GPU device object
|
||||
);
|
||||
|
||||
virtual bool create(
|
||||
size_t totalSize, //!< total size of the allocated heap (bytes)
|
||||
bool remoteAlloc //!< allocate the heap in remote memory
|
||||
);
|
||||
|
||||
//! Heap destructor
|
||||
virtual ~VirtualHeap();
|
||||
|
||||
/*!
|
||||
* \brief Allocates memory from a heap (best-fit).
|
||||
* We round up to 4k granularity for alignment.
|
||||
*
|
||||
* \return A pointer to allocated heap block object.
|
||||
*/
|
||||
virtual HeapBlock* alloc(
|
||||
size_t size //! The allocation size
|
||||
);
|
||||
|
||||
//! Release memory back to a heap.
|
||||
virtual void free(HeapBlock* blk);
|
||||
|
||||
//! Copies this heap to another
|
||||
virtual bool copyTo(Heap* heap);
|
||||
|
||||
virtual bool isSane(void) const; //!< Checks heap sanity
|
||||
};
|
||||
|
||||
} // namespace gpu
|
||||
|
||||
#endif // GPUHEAP_HPP_
|
||||
@@ -824,17 +824,6 @@ Kernel::create(
|
||||
// Initialize the kernel parameters
|
||||
bool result = initParameters();
|
||||
|
||||
if (!dev().heap()->isVirtual()) {
|
||||
amd::option::Options *options = nullProg().getCompilerOptions();
|
||||
// @todo Remove this. This is a hack for no VM mode
|
||||
if (!options->oVariables->EnableDumpKernel) {
|
||||
if (!name().compare(BlitName[KernelBlitManager::BlitCopyImageToBuffer]) ||
|
||||
!name().compare(BlitName[KernelBlitManager::BlitCopyBufferToImage])) {
|
||||
blitKernelHack_ = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Wave limiter needs to be initialized after kernel metadata is parsed
|
||||
// Since it depends on it.
|
||||
waveLimiter_.enable();
|
||||
@@ -855,7 +844,6 @@ Kernel::Kernel(
|
||||
const Program& prog,
|
||||
const InitData* initData)
|
||||
: NullKernel(name, gpuDev, prog)
|
||||
, blitKernelHack_(false)
|
||||
, waveLimiter_(this)
|
||||
{
|
||||
hwPrivateSize_ = 0;
|
||||
@@ -1603,10 +1591,6 @@ Kernel::debug(VirtualGPU& gpu) const
|
||||
{
|
||||
std::fstream stubWrite;
|
||||
address src = NULL;
|
||||
if (!dev().heap()->isVirtual()) {
|
||||
src = reinterpret_cast<address>
|
||||
(const_cast<Resource&>(dev().globalMem()).map(&gpu));
|
||||
}
|
||||
|
||||
std::cerr << "--- " << name_ << " ---" << std::endl;
|
||||
for (uint i = 0; i < arguments_.size(); ++i) {
|
||||
@@ -1689,9 +1673,6 @@ Kernel::debug(VirtualGPU& gpu) const
|
||||
stubWrite.close();
|
||||
}
|
||||
}
|
||||
if (!dev().heap()->isVirtual()) {
|
||||
const_cast<Resource&>(dev().globalMem()).unmap(&gpu);
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
@@ -1824,18 +1805,10 @@ Kernel::setArgument(
|
||||
type = ArgumentBuffer;
|
||||
}
|
||||
else {
|
||||
if (blitKernelHack_) {
|
||||
// Bind global buffer to UAV this buffer is bound to
|
||||
if (!bindResource(gpu, *gpuMem, 0, GlobalBuffer, uavRaw_)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Bind global buffer to UAV this buffer is bound to
|
||||
if (!bindResource(gpu, dev().globalMem(), 0,
|
||||
GlobalBuffer, uavRaw_)) {
|
||||
return false;
|
||||
}
|
||||
// Bind global buffer to UAV this buffer is bound to
|
||||
if (!bindResource(gpu, dev().globalMem(), 0,
|
||||
GlobalBuffer, uavRaw_)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1848,11 +1821,9 @@ Kernel::setArgument(
|
||||
|
||||
// Update offset only if we bind HeapBuffer or
|
||||
// it's global address space in UAV setup on SI+
|
||||
if (!blitKernelHack_) {
|
||||
offset += gpuMem->hbOffset();
|
||||
if (!forceZeroOffset) {
|
||||
assert((offset != 0) && "Offset 0 with a real allocation!");
|
||||
}
|
||||
offset += gpuMem->hbOffset();
|
||||
if (!forceZeroOffset) {
|
||||
assert((offset != 0) && "Offset 0 with a real allocation!");
|
||||
}
|
||||
gpu.addVmMemory(gpuMem);
|
||||
}
|
||||
@@ -2253,10 +2224,9 @@ Kernel::bindResource(
|
||||
|
||||
gslMemObject gslMem = NULL;
|
||||
// Use global address space on SI+ for UAV setup
|
||||
if (((type == ArgumentBuffer) || (type == ArgumentCbID) ||
|
||||
(type == ArgumentUavID) || (type == ArgumentPrintfID)) &&
|
||||
!blitKernelHack_) {
|
||||
gslMem = dev().heap()->resource().gslResource();
|
||||
if ((type == ArgumentBuffer) || (type == ArgumentCbID) ||
|
||||
(type == ArgumentUavID) || (type == ArgumentPrintfID)) {
|
||||
gslMem = dev().heap().resource().gslResource();
|
||||
}
|
||||
else {
|
||||
gslMem = resource.gslResource();
|
||||
@@ -2803,7 +2773,7 @@ NullKernel::parseArguments(const std::string& metaData, uint* uavRefCount)
|
||||
case KernelArg::PointerPrivate:
|
||||
// Check if can't use a dedicated UAV,
|
||||
// so realloc memory in the heap
|
||||
arg->memory_.realloc_ = isRealloc();
|
||||
arg->memory_.realloc_ = false;
|
||||
arg->memory_.uavBuf_ = true;
|
||||
break;
|
||||
case KernelArg::PointerHwConst:
|
||||
|
||||
@@ -450,9 +450,6 @@ public:
|
||||
uint instructionCnt() const { return instructionCnt_; }
|
||||
|
||||
protected:
|
||||
//! Returns TRUE if memory should be reallocated, returns FALSE always for NullDevice
|
||||
virtual bool isRealloc() const { return false; }
|
||||
|
||||
/*! \brief Parses the metadata structure for the kernel,
|
||||
* provided by the OpenCL compiler
|
||||
*
|
||||
@@ -673,9 +670,6 @@ protected:
|
||||
*/
|
||||
bool initConstBuffers();
|
||||
|
||||
//! Returns TRUE if memory should be reallocated, returns FALSE always for NullDevice
|
||||
virtual bool isRealloc() const { return !dev().heap()->isVirtual(); }
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
Kernel(const Kernel&);
|
||||
@@ -771,9 +765,6 @@ private:
|
||||
uint hwPrivateSize_; //!< initial HW private size
|
||||
uint hwLocalSize_; //!< initial HW local size
|
||||
|
||||
//! @todo remove the blit kernel hack
|
||||
bool blitKernelHack_; //!< No VM hack for kernel blit
|
||||
|
||||
WaveLimiterManager waveLimiter_; //!< adaptively control number of waves
|
||||
};
|
||||
|
||||
|
||||
@@ -30,39 +30,24 @@ namespace gpu {
|
||||
Memory::Memory(
|
||||
const Device& gpuDev,
|
||||
amd::Memory& owner,
|
||||
HeapBlock* hb,
|
||||
size_t size)
|
||||
: device::Memory(owner)
|
||||
, Resource(gpuDev, ((hb) ? hb->size_ : size) / Heap::ElementSize, Heap::ElementType)
|
||||
, hb_(hb)
|
||||
, Resource(gpuDev, size / Device::Heap::ElementSize, Device::Heap::ElementType)
|
||||
{
|
||||
init();
|
||||
|
||||
if (NULL != hb_) hb_->setMemory(this);
|
||||
|
||||
if (owner.parent() != NULL) {
|
||||
flags_ |= SubMemoryObject;
|
||||
}
|
||||
}
|
||||
|
||||
Memory::Memory(
|
||||
const Device& gpuDev,
|
||||
HeapBlock& hb)
|
||||
: device::Memory(hb.size_)
|
||||
, Resource(gpuDev, hb.size_ / Heap::ElementSize, Heap::ElementType)
|
||||
, hb_(&hb)
|
||||
{
|
||||
init();
|
||||
hb.setMemory(this);
|
||||
}
|
||||
|
||||
Memory::Memory(
|
||||
const Device& gpuDev,
|
||||
size_t size)
|
||||
: device::Memory(size)
|
||||
, Resource(gpuDev,
|
||||
amd::alignUp(size, Heap::ElementSize) / Heap::ElementSize, Heap::ElementType)
|
||||
, hb_(NULL)
|
||||
amd::alignUp(size, Device::Heap::ElementSize) /
|
||||
Device::Heap::ElementSize, Device::Heap::ElementType)
|
||||
{
|
||||
init();
|
||||
}
|
||||
@@ -75,7 +60,6 @@ Memory::Memory(
|
||||
)
|
||||
: device::Memory(owner)
|
||||
, Resource(gpuDev, width, format)
|
||||
, hb_(NULL)
|
||||
{
|
||||
init();
|
||||
|
||||
@@ -92,7 +76,6 @@ Memory::Memory(
|
||||
)
|
||||
: device::Memory(size)
|
||||
, Resource(gpuDev, width, format)
|
||||
, hb_(NULL)
|
||||
{
|
||||
init();
|
||||
}
|
||||
@@ -110,7 +93,6 @@ Memory::Memory(
|
||||
)
|
||||
: device::Memory(owner)
|
||||
, Resource(gpuDev, width, height, depth, format, chOrder, imageType, mipLevels)
|
||||
, hb_(NULL)
|
||||
{
|
||||
init();
|
||||
|
||||
@@ -132,7 +114,6 @@ Memory::Memory(
|
||||
)
|
||||
: device::Memory(size)
|
||||
, Resource(gpuDev, width, height, depth, format, chOrder, imageType, mipLevels)
|
||||
, hb_(NULL)
|
||||
{
|
||||
init();
|
||||
}
|
||||
@@ -197,14 +178,9 @@ Memory::create(
|
||||
break;
|
||||
case Resource::Remote:
|
||||
case Resource::RemoteUSWC:
|
||||
// @todo Enable unconditional optimization for remote memory
|
||||
if ((owner() != NULL &&
|
||||
owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) ||
|
||||
(hb() == NULL)) {
|
||||
if (!cal()->tiled_) {
|
||||
// Marks memory object for direct GPU access to the host memory
|
||||
flags_ |= HostMemoryDirectAccess;
|
||||
}
|
||||
if (!cal()->tiled_) {
|
||||
// Marks memory object for direct GPU access to the host memory
|
||||
flags_ |= HostMemoryDirectAccess;
|
||||
}
|
||||
break;
|
||||
case Resource::View: {
|
||||
@@ -481,8 +457,8 @@ Memory::createInterop(InteropType type)
|
||||
else {
|
||||
// Allocate Resource object for interop as buffer
|
||||
interopMemory_ = new Memory(dev(), size(),
|
||||
amd::alignUp(size(), Heap::ElementSize) / Heap::ElementSize,
|
||||
Heap::ElementType);
|
||||
amd::alignUp(size(), Device::Heap::ElementSize) / Device::Heap::ElementSize,
|
||||
Device::Heap::ElementType);
|
||||
|
||||
// Create the interop object in CAL
|
||||
if (NULL == interopMemory_ || !interopMemory_->create(memType, createParams)) {
|
||||
@@ -502,14 +478,6 @@ Memory::~Memory()
|
||||
// Clean VA cache
|
||||
dev().removeVACache(this);
|
||||
|
||||
// Release associated heap block, if any
|
||||
if (hb_) {
|
||||
// Protect heap block from simultaneous release with realloc
|
||||
amd::ScopedLock k(dev().lockAsyncOps());
|
||||
hb_->setMemory(NULL);
|
||||
hb_->free();
|
||||
}
|
||||
|
||||
delete interopMemory_;
|
||||
|
||||
// Release associated map target, if any
|
||||
@@ -531,35 +499,6 @@ Memory::~Memory()
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
Memory::reallocate(HeapBlock* hb, const Resource* parent)
|
||||
{
|
||||
Resource::ViewParams params;
|
||||
params.size_ = hb->size_;
|
||||
params.resource_ = parent;
|
||||
params.memory_ = NULL;
|
||||
|
||||
// Check if it's a view reallocation
|
||||
if (NULL != hb->parent_) {
|
||||
// The offset inside the view is unchanged
|
||||
params.offset_ = Resource::offset();
|
||||
|
||||
// Create a new view
|
||||
if (Resource::create(Resource::View, ¶ms)) {
|
||||
hb_ = hb;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
else {
|
||||
params.offset_ = hb->offset_;
|
||||
if (Resource::reallocate(¶ms)) {
|
||||
hb_ = hb;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void
|
||||
Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags)
|
||||
{
|
||||
@@ -814,33 +753,13 @@ Memory::createBufferView(amd::Memory& subBufferOwner)
|
||||
{
|
||||
gpu::Memory* viewMemory;
|
||||
Resource::ViewParams params;
|
||||
HeapBlock* hb = NULL;
|
||||
|
||||
size_t offset = subBufferOwner.getOrigin();
|
||||
size_t size = subBufferOwner.getSize();
|
||||
|
||||
if (!dev().heap()->isVirtual()) {
|
||||
if (NULL == hb_) {
|
||||
LogError("HeapBlock must be initialized!");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
hb = new HeapBlock(NULL, size, offset + hb_->offset());
|
||||
if (hb == NULL) {
|
||||
LogError("We don't have enough video memory!");
|
||||
return NULL;
|
||||
}
|
||||
amd::ScopedLock lock(owner()->lockMemoryOps());
|
||||
hb_->addView(hb);
|
||||
}
|
||||
|
||||
// Create a memory object
|
||||
viewMemory = new gpu::Memory(dev(), subBufferOwner, hb, size);
|
||||
viewMemory = new gpu::Memory(dev(), subBufferOwner, size);
|
||||
if (NULL == viewMemory) {
|
||||
if (hb != NULL) {
|
||||
hb->setMemory(NULL);
|
||||
hb->free();
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
#include "top.hpp"
|
||||
#include "thread/atomic.hpp"
|
||||
#include "device/gpu/gpuresource.hpp"
|
||||
#include "device/gpu/gpuheap.hpp"
|
||||
#include "device/gpu/gpudevice.hpp"
|
||||
#include <map>
|
||||
|
||||
@@ -27,7 +26,6 @@ class Heap;
|
||||
class Resource;
|
||||
class Memory;
|
||||
class VirtualGPU;
|
||||
class HeapBlock;
|
||||
|
||||
//! GPU memory object.
|
||||
// Wrapper that can contain a heap block or an interop buffer/image.
|
||||
@@ -44,14 +42,8 @@ public:
|
||||
Memory(
|
||||
const Device& gpuDev,
|
||||
amd::Memory& owner,
|
||||
HeapBlock* hb,
|
||||
size_t size = 0);
|
||||
|
||||
//! Constructor (nonfat version for local scratch mem use)
|
||||
Memory(
|
||||
const Device& gpuDev,
|
||||
HeapBlock& hb);
|
||||
|
||||
//! Constructor (nonfat version for local scratch mem use without heap block)
|
||||
Memory(
|
||||
const Device& gpuDev,
|
||||
@@ -102,12 +94,6 @@ public:
|
||||
//! Default destructor
|
||||
~Memory();
|
||||
|
||||
//! Reallocates the memory object in the new heap block
|
||||
bool reallocate(
|
||||
HeapBlock* hb, //! The new heap block for this memory object
|
||||
const Resource* parent //! Parent resource for view reallocaiton
|
||||
);
|
||||
|
||||
//! Creates the interop memory
|
||||
bool createInterop(
|
||||
InteropType type //!< The interop type
|
||||
@@ -189,9 +175,6 @@ public:
|
||||
//! Sets interop type for this memory object
|
||||
void setInteropType(InteropType type) { interopType_ = type; }
|
||||
|
||||
//! Returns the HeapBlock pointer
|
||||
const HeapBlock* hb() const { return hb_; }
|
||||
|
||||
//! Set the owner
|
||||
void setOwner(amd::Memory* owner) { owner_ = owner; }
|
||||
|
||||
@@ -229,7 +212,6 @@ private:
|
||||
InteropType interopType_; //!< Interop type
|
||||
Memory* interopMemory_; //!< interop memory
|
||||
|
||||
HeapBlock* hb_; //!< Heap Block, or NULL if not in-heap memory
|
||||
Memory* pinnedMemory_; //!< Memory used as pinned system memory
|
||||
const Memory* parent_; //!< Parent memory object
|
||||
};
|
||||
|
||||
@@ -322,7 +322,7 @@ static uint32_t GetHSAILImageOrderType(gslChannelOrder chOrder, cmSurfFmt format
|
||||
}
|
||||
|
||||
bool
|
||||
Resource::create(MemoryType memType, CreateParams* params, bool heap)
|
||||
Resource::create(MemoryType memType, CreateParams* params)
|
||||
{
|
||||
bool calRes = false;
|
||||
gslMemObject gslResource = 0;
|
||||
@@ -382,7 +382,7 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
|
||||
}
|
||||
|
||||
// Force remote allocation if it was requested in the settings
|
||||
if (dev().settings().remoteAlloc_ && !heap &&
|
||||
if (dev().settings().remoteAlloc_ &&
|
||||
((memoryType() == Local) ||
|
||||
(memoryType() == Persistent))) {
|
||||
if (dev().settings().apuSystem_ && dev().settings().viPlus_) {
|
||||
@@ -515,7 +515,7 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
|
||||
if (memoryType() == Local) {
|
||||
cal_.type_ = Persistent;
|
||||
}
|
||||
else if (!heap && (memoryType() == Persistent)) {
|
||||
else if (memoryType() == Persistent) {
|
||||
cal_.type_ = RemoteUSWC;
|
||||
}
|
||||
// Remote cacheable to uncacheable
|
||||
@@ -553,11 +553,6 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
|
||||
reinterpret_cast<const char*>(address_) - tmpHost);
|
||||
|
||||
pinOffset_ = hostMemOffset & 0xff;
|
||||
//!@note GSL has a problem with the defines for flags and
|
||||
//! view creation, so check the restriction here
|
||||
if (!dev().heap()->isVirtual() && (pinOffset_ != 0)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
pinAddress = tmpHost;
|
||||
// Align width to avoid GSL useless assert with a view
|
||||
@@ -629,20 +624,6 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
|
||||
calRes = true;
|
||||
}
|
||||
|
||||
// Check if it's a heap allocation
|
||||
if (!dev().heap()->isVirtual()) {
|
||||
if (viewOwner_ == &dev().globalMem()) {
|
||||
// Allocation directly from the heap
|
||||
hbOffset_ = static_cast<uint64_t>(view->offset_);
|
||||
}
|
||||
else {
|
||||
// Allocation from another memory object
|
||||
hbOffset_ = static_cast<uint64_t>(view->offset_) +
|
||||
viewOwner_->hbOffset();
|
||||
}
|
||||
hbSize_ = view->size_;
|
||||
}
|
||||
|
||||
if (viewOwner_->isMemoryType(Pinned)) {
|
||||
address_ = viewOwner_->data() + offset();
|
||||
}
|
||||
@@ -952,11 +933,9 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
|
||||
cal_.tiled_ = (GSL_MOA_TILING_LINEAR != tiling) &&
|
||||
(GSL_MOA_TILING_LINEAR_GENERAL != tiling);
|
||||
|
||||
// Get the heap block offset if it's a virtual heap
|
||||
if (dev().heap()->isVirtual()) {
|
||||
hbOffset_ = gslResource->getSurfaceAddress() -
|
||||
dev().heap()->baseAddress();
|
||||
}
|
||||
// Get the heap block offset
|
||||
hbOffset_ = gslResource->getSurfaceAddress() -
|
||||
dev().heap().baseAddress();
|
||||
hbSize_ = static_cast<uint64_t>(gslResource->getSurfaceSize());
|
||||
|
||||
if (!dev().settings().use64BitPtr_ &&
|
||||
@@ -1036,32 +1015,6 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
Resource::reallocate(CreateParams* params)
|
||||
{
|
||||
GslResourceReference* old;
|
||||
GslResourceReference* active;
|
||||
|
||||
old = gslRef_;
|
||||
if (!create(memoryType(), params)) {
|
||||
gslRef_ = old;
|
||||
return false;
|
||||
}
|
||||
// Get the new active resource
|
||||
active = gslRef_;
|
||||
gslRef_ = old;
|
||||
|
||||
dev().resCopy(old->gslResource(),
|
||||
active->gslResource(), CAL_MEMCOPY_SYNC);
|
||||
|
||||
// Free all old resources
|
||||
assert(renames_.size() == 0);
|
||||
free();
|
||||
|
||||
gslRef_ = active;
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
Resource::free()
|
||||
{
|
||||
@@ -1813,10 +1766,8 @@ Resource::setActiveRename(VirtualGPU& gpu, GslResourceReference* rename)
|
||||
gslRef_ = rename;
|
||||
address_ = rename->cpuAddress_;
|
||||
|
||||
if (dev().heap()->isVirtual()) {
|
||||
hbOffset_ = rename->gslResource()->getSurfaceAddress() -
|
||||
dev().heap()->baseAddress();
|
||||
}
|
||||
hbOffset_ = rename->gslResource()->getSurfaceAddress() -
|
||||
dev().heap().baseAddress();
|
||||
}
|
||||
|
||||
bool
|
||||
|
||||
@@ -209,15 +209,6 @@ public:
|
||||
*/
|
||||
virtual bool create(
|
||||
MemoryType memType, //!< memory type
|
||||
CreateParams* params = 0, //!< special parameters for resource allocation
|
||||
bool heap = false //!< Global heap allocation for not VM mode
|
||||
);
|
||||
|
||||
/*! \brief Reallocates a CAL object, associated with the resource
|
||||
*
|
||||
* \return True if we succesfully reallocated a CAL resource
|
||||
*/
|
||||
bool reallocate(
|
||||
CreateParams* params = 0 //!< special parameters for resource allocation
|
||||
);
|
||||
|
||||
|
||||
@@ -50,10 +50,6 @@ Settings::Settings()
|
||||
maxRenames_ = 16;
|
||||
maxRenameSize_ = 4 * Mi;
|
||||
|
||||
// The global heap settings
|
||||
heapSize_ = GPU_INITIAL_HEAP_SIZE * Mi;
|
||||
heapSizeGrowth_ = GPU_HEAP_GROWTH_INCREMENT * Mi;
|
||||
|
||||
imageSupport_ = false;
|
||||
hwLDSSize_ = 0;
|
||||
|
||||
|
||||
@@ -82,8 +82,6 @@ public:
|
||||
size_t stagedXferSize_; //!< Staged buffer size
|
||||
uint maxRenames_; //!< Maximum number of possible renames
|
||||
uint maxRenameSize_; //!< Maximum size for all renames
|
||||
size_t heapSize_; //!< The global heap size
|
||||
size_t heapSizeGrowth_; //!< The global heap size growth
|
||||
uint hwLDSSize_; //!< HW local data store size
|
||||
uint maxWorkGroupSize_; //!< Requested workgroup size for this device
|
||||
uint hostMemDirectAccess_; //!< Enables direct access to the host memory
|
||||
|
||||
@@ -517,10 +517,6 @@ VirtualGPU::create(
|
||||
// Fall through ...
|
||||
case Settings::BlitEngineCAL:
|
||||
case Settings::BlitEngineKernel:
|
||||
if (!dev().heap()->isVirtual()) {
|
||||
blitSetup.disableReadBufferRect_ = true;
|
||||
blitSetup.disableWriteBufferRect_ = true;
|
||||
}
|
||||
// use host blit for HW debug
|
||||
if (dev().settings().enableHwDebug_) {
|
||||
blitSetup.disableCopyImageToBuffer_ = true;
|
||||
@@ -3166,23 +3162,21 @@ VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* waitingE
|
||||
bool
|
||||
VirtualGPU::addVmMemory(const Resource* resource)
|
||||
{
|
||||
if (dev().heap()->isVirtual()) {
|
||||
uint* cnt = &cal_.memCount_;
|
||||
(*cnt)++;
|
||||
// Reallocate array if kernel uses more memory objects
|
||||
if (numVmMems_ < *cnt) {
|
||||
gslMemObject* tmp;
|
||||
tmp = new gslMemObject [*cnt];
|
||||
if (tmp == NULL) {
|
||||
return false;
|
||||
}
|
||||
memcpy(tmp, vmMems_, sizeof(gslMemObject) * numVmMems_);
|
||||
delete [] vmMems_;
|
||||
vmMems_ = tmp;
|
||||
numVmMems_ = *cnt;
|
||||
uint* cnt = &cal_.memCount_;
|
||||
(*cnt)++;
|
||||
// Reallocate array if kernel uses more memory objects
|
||||
if (numVmMems_ < *cnt) {
|
||||
gslMemObject* tmp;
|
||||
tmp = new gslMemObject [*cnt];
|
||||
if (tmp == NULL) {
|
||||
return false;
|
||||
}
|
||||
vmMems_[*cnt - 1] = resource->gslResource();
|
||||
memcpy(tmp, vmMems_, sizeof(gslMemObject) * numVmMems_);
|
||||
delete [] vmMems_;
|
||||
vmMems_ = tmp;
|
||||
numVmMems_ = *cnt;
|
||||
}
|
||||
vmMems_[*cnt - 1] = resource->gslResource();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -496,7 +496,7 @@ CALGSLDevice::SetupContext(int32 &asic_id)
|
||||
getAttribs_int(temp_cs);
|
||||
temp_cs->getMemInfo(&m_memInfo, GSL_MEMINFO_BASIC);
|
||||
|
||||
m_vmMode = temp_cs->getVMMode();
|
||||
assert(temp_cs->getVMMode());
|
||||
|
||||
m_adp->deleteContext(temp_cs);
|
||||
|
||||
@@ -1313,38 +1313,6 @@ CALGSLDevice::PerformDMACopy(gslMemObject srcMem, gslMemObject destMem, cmSurfFm
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
CALGSLDevice::resCopy(gslMemObject srcRes, gslMemObject dstRes, uint32 flags) const
|
||||
{
|
||||
assert(m_cs != 0);
|
||||
assert(srcRes != 0);
|
||||
assert(dstRes != 0);
|
||||
|
||||
//! @note: GSL device isn't thread safe
|
||||
amd::ScopedLock k(gslDeviceOps());
|
||||
|
||||
uint64 surfaceSize;
|
||||
|
||||
CopyType type = GetCopyType(srcRes, dstRes, 0, 0, m_allowDMA, 0, surfaceSize, 0, 0);
|
||||
|
||||
if (type == USE_DRMDMA)
|
||||
{
|
||||
m_cs->DMACopy(srcRes, 0, dstRes, 0, surfaceSize, GSL_SYNCUPLOAD_SYNC_WAIT, NULL);
|
||||
m_cs->Flush();
|
||||
Wait(m_cs, GSL_DRMDMA_SYNC_ATI, m_mapDMAQuery);
|
||||
}
|
||||
else if (type == USE_CPDMA)
|
||||
{
|
||||
m_cs->syncUploadRaw(srcRes, 0, dstRes, 0, surfaceSize, 0);
|
||||
m_cs->Flush();
|
||||
Wait(m_cs, GSL_SYNC_ATI, m_mapQuery);
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(0 && "No copy engine is being used");
|
||||
}
|
||||
}
|
||||
|
||||
#define CPDMA_THRESHOLD 131072
|
||||
|
||||
CopyType
|
||||
|
||||
@@ -97,14 +97,10 @@ public:
|
||||
const CALdeviceattribs& getAttribs() const { return m_attribs; }
|
||||
const gslMemInfo& getMemInfo() const { return m_memInfo; }
|
||||
|
||||
bool isVmMode() const { return m_vmMode; };
|
||||
|
||||
uint32 getVPUMask() const { return m_vpuMask; }
|
||||
bool canDMA() const { return m_canDMA; }
|
||||
gslMemObject m_srcDRMDMAMem, m_dstDRMDMAMem; // memory object of flush buffer, used for DRMDMA flush
|
||||
|
||||
void resCopy(gslMemObject srcRes, gslMemObject dstRes, uint32 flags) const;
|
||||
|
||||
void PerformAdapterInitialization() const;
|
||||
void PerformFullInitialization() const;
|
||||
|
||||
@@ -211,7 +207,6 @@ private:
|
||||
uint m_computeRing : 1;
|
||||
uint m_usePerVPUAdapterModel : 1;
|
||||
uint m_PerformLazyDeviceInit : 1;
|
||||
uint m_vmMode : 1;
|
||||
uint m_isComputeRingIDForced : 1;
|
||||
};
|
||||
};
|
||||
|
||||
@@ -34,7 +34,6 @@ Settings::Settings()
|
||||
bool
|
||||
Settings::create(bool doublePrecision)
|
||||
{
|
||||
largeHostMemAlloc_ = true;
|
||||
customHostAllocator_ = true;
|
||||
|
||||
// Enable extensions
|
||||
|
||||
@@ -52,12 +52,8 @@ release(cstring, GPU_DEVICE_ORDINAL, "", \
|
||||
"Select the device ordinal (comma seperated list of available devices)") \
|
||||
release(bool, REMOTE_ALLOC, false, \
|
||||
"Use remote memory for the global heap allocation") \
|
||||
release(int, GPU_INITIAL_HEAP_SIZE, 16, \
|
||||
"Initial size of the GPU heap in MiB") \
|
||||
release(uint, GPU_MAX_HEAP_SIZE, 100, \
|
||||
"Set maximum size of the GPU heap to % of board memory") \
|
||||
release(int, GPU_HEAP_GROWTH_INCREMENT, 8, \
|
||||
"Amount to grow the GPU heap by in MiB") \
|
||||
release(uint, GPU_STAGING_BUFFER_SIZE, 512, \
|
||||
"Size of the GPU staging buffer in KiB") \
|
||||
release(bool, GPU_DUMP_BLIT_KERNELS, false, \
|
||||
|
||||
新增問題並參考
封鎖使用者