diff --git a/rocclr/runtime/device/cpu/cpusettings.cpp b/rocclr/runtime/device/cpu/cpusettings.cpp index 14d43cdbe6..b7dfdf57de 100644 --- a/rocclr/runtime/device/cpu/cpusettings.cpp +++ b/rocclr/runtime/device/cpu/cpusettings.cpp @@ -10,8 +10,6 @@ namespace cpu { bool Settings::create() { - largeHostMemAlloc_ = true; - // This code is temporary until cl_khr_fp64 is unconditional if (flagIsDefault(CL_KHR_FP64) || CL_KHR_FP64) { enableExtension(ClKhrFp64); diff --git a/rocclr/runtime/device/device.cpp b/rocclr/runtime/device/device.cpp index 4e1ef8a2e8..d9c7a1a2b2 100644 --- a/rocclr/runtime/device/device.cpp +++ b/rocclr/runtime/device/device.cpp @@ -517,7 +517,6 @@ Settings::Settings() extensions_ = 0; partialDispatch_ = false; supportRA_ = true; - largeHostMemAlloc_ = false; customHostAllocator_ = false; waitCommand_ = AMD_OCL_WAIT_COMMAND; supportDepthsRGB_ = false; diff --git a/rocclr/runtime/device/device.hpp b/rocclr/runtime/device/device.hpp index bb9d16b368..d2a313b6be 100644 --- a/rocclr/runtime/device/device.hpp +++ b/rocclr/runtime/device/device.hpp @@ -577,13 +577,12 @@ public: struct { uint partialDispatch_: 1; //!< Enables partial dispatch uint supportRA_: 1; //!< Support RA channel order format - uint largeHostMemAlloc_: 1; //!< Allow large host mem allocations (> maxSingleAlloc) uint waitCommand_: 1; //!< Enables a wait for every submitted command uint customHostAllocator_: 1;//!< True if device has custom host allocator // that replaces generic OS allocation routines uint supportDepthsRGB_: 1; //!< Support DEPTH and sRGB channel order format uint enableHwDebug_: 1; //!< Enable HW debug support - uint reserved_: 25; + uint reserved_: 26; }; uint value_; }; diff --git a/rocclr/runtime/device/gpu/gpublit.cpp b/rocclr/runtime/device/gpu/gpublit.cpp index e201985e09..ab85396b93 100644 --- a/rocclr/runtime/device/gpu/gpublit.cpp +++ b/rocclr/runtime/device/gpu/gpublit.cpp @@ -1955,20 +1955,9 @@ KernelBlitManager::copyBufferRect( // Fall into the CAL path for rejected transfers if (setup_.disableCopyBufferRect_ || - (gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) || - (!dev().heap()->isVirtual() && - ((gpuMem(dstMemory).hb() == NULL) || (gpuMem(srcMemory).hb() == NULL)))) { - // Copy data with CAL (no VM mode only) - if (gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) { - result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory, - srcRectIn, dstRectIn, sizeIn, entire); - } - - if ((!dev().heap()->isVirtual() && ((gpuMem(dstMemory).hb() == NULL) || (gpuMem(srcMemory).hb() == NULL))) - && !result) { - result = HostBlitManager::copyBufferRect(srcMemory, dstMemory, - srcRectIn, dstRectIn, sizeIn, entire); - } + gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) { + result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory, + srcRectIn, dstRectIn, sizeIn, entire); if (result) { synchronize(); @@ -2395,11 +2384,9 @@ KernelBlitManager::copyBuffer( { amd::ScopedLock k(lockXferOps_); bool result = false; - bool forceCal = !dev().heap()->isVirtual() && - ((gpuMem(srcMemory).hb() == NULL) || (gpuMem(dstMemory).hb() == NULL)); - if ((!forceCal && !gpuMem(srcMemory).isHostMemDirectAccess() && - !gpuMem(dstMemory).isHostMemDirectAccess())) { + if (!gpuMem(srcMemory).isHostMemDirectAccess() && + !gpuMem(dstMemory).isHostMemDirectAccess()) { uint blitType = BlitCopyBuffer; size_t dim = 1; size_t globalWorkOffset[3] = { 0, 0, 0 }; @@ -2489,7 +2476,6 @@ KernelBlitManager::copyBuffer( result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); } else { - // Copy data with CAL (no VM mode only) result = DmaBlitManager::copyBuffer( srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire); } diff --git a/rocclr/runtime/device/gpu/gpudevice.cpp b/rocclr/runtime/device/gpu/gpudevice.cpp index 32bd622b01..e8775b6db7 100644 --- a/rocclr/runtime/device/gpu/gpudevice.cpp +++ b/rocclr/runtime/device/gpu/gpudevice.cpp @@ -173,7 +173,7 @@ NullDevice::create(CALtarget target) calAttr.localRAM = 512; // Fill the device info structure - fillDeviceInfo(calAttr, memInfo, 4096, 1, true); + fillDeviceInfo(calAttr, memInfo, 4096, 1); if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) { // Runtime doesn't know what local size could be on the real board @@ -225,9 +225,7 @@ void NullDevice::fillDeviceInfo( const CALdeviceattribs& calAttr, const gslMemInfo& memInfo, size_t maxTextureSize, - uint numComputeRings, - bool isVirtualMode - ) + uint numComputeRings) { info_.type_ = CL_DEVICE_TYPE_GPU; info_.vendorId_ = 0x1002; @@ -276,56 +274,45 @@ void NullDevice::fillDeviceInfo( info_.globalMemCacheType_ = CL_NONE; } - if (isVirtualMode) { #if defined(ATI_OS_LINUX) - info_.globalMemSize_ = - (static_cast(std::min(GPU_MAX_HEAP_SIZE, 100u)) * - // globalMemSize is the actual available size for app on Linux - // Because Linux base driver doesn't support paging - static_cast(memInfo.cardMemAvailableBytes + memInfo.cardExtMemAvailableBytes) / 100u); + info_.globalMemSize_ = + (static_cast(std::min(GPU_MAX_HEAP_SIZE, 100u)) * + // globalMemSize is the actual available size for app on Linux + // Because Linux base driver doesn't support paging + static_cast(memInfo.cardMemAvailableBytes + memInfo.cardExtMemAvailableBytes) / 100u); #else - info_.globalMemSize_ = - (static_cast(std::min(GPU_MAX_HEAP_SIZE, 100u)) * - static_cast(calAttr.localRAM) / 100u) * Mi; + info_.globalMemSize_ = + (static_cast(std::min(GPU_MAX_HEAP_SIZE, 100u)) * + static_cast(calAttr.localRAM) / 100u) * Mi; #endif - if (settings().apuSystem_) { - info_.globalMemSize_ += - (static_cast(calAttr.uncachedRemoteRAM) * Mi * 75)/100; - } + if (settings().apuSystem_) { + info_.globalMemSize_ += + (static_cast(calAttr.uncachedRemoteRAM) * Mi * 75)/100; + } - // We try to calculate the largest available memory size from - // the largest available block in either heap. In theory this - // should be the size we can actually allocate at application - // start. Note that it may not be a guarantee still as the - // application progresses. - info_.maxMemAllocSize_ = std::max( - cl_ulong(memInfo.cardLargestFreeBlockBytes), - cl_ulong(memInfo.cardExtLargestFreeBlockBytes)); + // We try to calculate the largest available memory size from + // the largest available block in either heap. In theory this + // should be the size we can actually allocate at application + // start. Note that it may not be a guarantee still as the + // application progresses. + info_.maxMemAllocSize_ = std::max( + cl_ulong(memInfo.cardLargestFreeBlockBytes), + cl_ulong(memInfo.cardExtLargestFreeBlockBytes)); #if defined(ATI_OS_WIN) - if (settings().apuSystem_) { - info_.maxMemAllocSize_ = std::max( - (static_cast(calAttr.uncachedRemoteRAM) * Mi * 75)/100, - info_.maxMemAllocSize_); - } + if (settings().apuSystem_) { + info_.maxMemAllocSize_ = std::max( + (static_cast(calAttr.uncachedRemoteRAM) * Mi * 75)/100, + info_.maxMemAllocSize_); + } #endif - info_.maxMemAllocSize_ = cl_ulong(info_.maxMemAllocSize_ * - std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u); + info_.maxMemAllocSize_ = cl_ulong(info_.maxMemAllocSize_ * + std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u); - //! \note Force max single allocation size. - //! 4GB limit for the blit kernels and 64 bit optimizations. - info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_, - static_cast(settings().maxAllocSize_)); - } - else { - uint maxHeapSize = flagIsDefault(GPU_MAX_HEAP_SIZE) ? 50 : GPU_MAX_HEAP_SIZE; - info_.globalMemSize_ = (std::min(maxHeapSize, 100u) - * calAttr.localRAM / 100u) * Mi; - - uint maxAllocSize = flagIsDefault(GPU_SINGLE_ALLOC_PERCENT) ? 25 : GPU_SINGLE_ALLOC_PERCENT; - info_.maxMemAllocSize_ = cl_ulong(info_.globalMemSize_ * - std::min(maxAllocSize, 100u) / 100u); - } + //! \note Force max single allocation size. + //! 4GB limit for the blit kernels and 64 bit optimizations. + info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_, + static_cast(settings().maxAllocSize_)); if (info_.maxMemAllocSize_ < cl_ulong(128 * Mi)) { LogError("We are unable to get a heap large enough to support the OpenCL minimum "\ @@ -377,7 +364,7 @@ void NullDevice::fillDeviceInfo( info_.imagePitchAlignment_ = 256; // XXX: 256 pixel pitch alignment for now info_.imageBaseAddressAlignment_ = 256; // XXX: 256 byte base address alignment for now - info_.bufferFromImageSupport_ = (isVirtualMode) ? CL_TRUE : CL_FALSE; + info_.bufferFromImageSupport_ = CL_TRUE; } info_.errorCorrectionSupport_ = CL_FALSE; @@ -404,7 +391,7 @@ void NullDevice::fillDeviceInfo( ::strcpy(info_.name_, hwInfo()->targetName_); ::strcpy(info_.vendor_, "Advanced Micro Devices, Inc."); ::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1, - AMD_BUILD_STRING "%s", (isVirtualMode) ? " (VM)": ""); + AMD_BUILD_STRING "%s", " (VM)"); info_.profile_ = "FULL_PROFILE"; if (settings().oclVersion_ == OpenCL20) { @@ -508,6 +495,25 @@ void NullDevice::fillDeviceInfo( } } +bool +Device::Heap::create(Device& device) +{ + // Create a new GPU resource + resource_ = new Resource(device, 0, CM_SURF_FMT_R32I); + if (resource_ == NULL) { + return false; + } + + if (!resource_->create(Resource::Heap)) { + return false; + } + + if (!device.settings().hsail_) { + baseAddress_ = resource_->gslResource()->getSurfaceAddress(); + } + return true; +} + void Device::Engines::create(uint num, gslEngineDescriptor* desc, uint maxNumComputeRings) { @@ -670,7 +676,7 @@ Device::Device() , CALGSLDevice() , numOfVgpus_(0) , context_(NULL) - , heap_(NULL) + , heap_() , dummyPage_(NULL) , lockAsyncOps_(NULL) , lockAsyncOpsForInitHeap_(NULL) @@ -731,11 +737,6 @@ Device::~Device() dummyPage_->release(); } - // Destroy global heap - if (heap_ != NULL) { - delete heap_; - } - // Destroy resource cache delete resourceCache_; @@ -837,26 +838,6 @@ Device::create(CALuint ordinal, CALuint numOfDevices) size_t resourceCacheSize = settings().resourceCacheSize_; - // Allocate heap - heapSize_ = settings().heapSize_; - - // Check if BE supports virtual addressing mode - if (isVmMode()) { - heap_ = new VirtualHeap(*this); - gpuSettings->largeHostMemAlloc_ = (NULL != heap_) ? true : false; - } - - // If virtual heap allocation failed, then try static allocation - if (heap_ == NULL) { - heap_ = new Heap(*this); - // Disable resource cache if VM is disable - resourceCacheSize = 0; - if (NULL == heap_) { - return false; - } - } - - #ifdef DEBUG std::stringstream message; if (settings().remoteAlloc_) { @@ -865,10 +846,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices) else { message << "Using *Local* memory"; } - if (!heap()->isVirtual()) { - message << ": " << settings().heapSize_ / Mi << "MB, growth: " << \ - settings().heapSizeGrowth_ / Mi << "MB"; - } + message << std::endl; LogInfo(message.str().c_str()); #endif // DEBUG @@ -883,8 +861,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices) // Fill the device info structure fillDeviceInfo(getAttribs(), getMemInfo(), static_cast(getMaxTextureSize()), - engines().numComputeRings(), heap()->isVirtual() - ); + engines().numComputeRings()); if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) { if (NULL == hsaCompiler_) { @@ -955,7 +932,7 @@ Device::initializeHeapResources() } // Complete initialization of the heap and other buffers - if ((heap_ == NULL) || !heap_->create(heapSize_, settings().remoteAlloc_)) { + if (!heap_.create(*this)) { LogError("Failed GPU heap creation"); return false; } @@ -987,7 +964,7 @@ Device::initializeHeapResources() type = Resource::RemoteUSWC; } xferWrite_ = new XferBuffers(*this, type, - amd::alignUp(settings().stagedXferSize_, heap()->granularityB())); + amd::alignUp(settings().stagedXferSize_, 4 * Ki)); if ((xferWrite_ == NULL) || !xferWrite_->create()) { LogError("Couldn't allocate transfer buffer objects for read"); return false; @@ -997,7 +974,7 @@ Device::initializeHeapResources() // Initialize staged read buffers if (settings().stagedXferRead_) { xferRead_ = new XferBuffers(*this, Resource::Remote, - amd::alignUp(settings().stagedXferSize_, heap()->granularityB())); + amd::alignUp(settings().stagedXferSize_, 4 * Ki)); if ((xferRead_ == NULL) || !xferRead_->create()) { LogError("Couldn't allocate transfer buffer objects for write"); return false; @@ -1086,52 +1063,6 @@ Device::createVirtualDevice( } } -bool -Device::reallocHeap(size_t size, bool remoteAlloc) -{ - size_t heapSize = heapSize_ + ((size != 0) ? - amd::alignUp(size, settings().heapSizeGrowth_) : 0); - Heap* oldHeap = heap_; - // Maximum heap limit size = reported size + internal memory - size_t maxHeapLimit = static_cast(info().globalMemSize_) + - // an extra 10MB for the alignments of allocations, - // since the conformance test doesn't expect any - 10 * Mi; - - if ((settings().heapSizeGrowth_ == 0) || - // Allow the heap growth up to the global memory limit - (heapSize_ + size > maxHeapLimit)) { - return false; - } - heapSize = std::min(maxHeapLimit, heapSize); - - heap_ = new Heap(*this); - - // Make sure we have allocated a new global heap - if (NULL == heap_) { - heap_ = oldHeap; - return false; - } - - if (!heap_->create(heapSize, remoteAlloc)) { - delete heap_; - heap_ = oldHeap; - return false; - } - - // Copy the old heap to the new one - if (!oldHeap->copyTo(heap_)) { - delete heap_; - heap_ = oldHeap; - return false; - } - - delete oldHeap; - heapSize_ = heapSize; - - return true; -} - device::Program* Device::createProgram(int oclVer) { @@ -1288,65 +1219,6 @@ Device::tearDown() } } -//! @note This funciton must be lock protected from a caller -HeapBlock* -Device::allocHeapBlock(size_t size) const -{ - HeapBlock* hb = NULL; - - // Allocate the underlying heap block - hb = heap_->alloc(size); - - // Virtual heap should never fail allocation - if ((hb == NULL) && (!heap_->isVirtual())) { - // Queues can't process commands, - // while the global heap reallocation occurs. - // So stall all queues and then reallocate the global heap - ScopedLockVgpus lock(*this); - - // Wait for idle - for (uint idx = 0; idx < vgpus().size(); ++idx) { - vgpus()[idx]->waitAllEngines(); - } - - // Acount memory alignment for the new allocation - size_t extraSpace = heap_->granularityB(); - if (size >= heap_->freeSpace()) { - // Required extra space = requested size - free space - extraSpace += size - heap_->freeSpace(); - } - - //! @note the const cast here looks bad, but the device object - // is a lock protected above. The rest of the code - // doesn't change the device object. - // So the const methods can be safly used everywhere else. - // In general we should avoid changing the device object after initialization - - // Try to reallocate the heap with the same memory type - if (const_cast(this)->reallocHeap(extraSpace, settings().remoteAlloc_)) { - hb = heap_->alloc(size); - } - - if (hb == NULL) { - // Use reversed memory type as a temporary storage - bool remoteAlloc = settings().remoteAlloc_ ^ true; - - // Try to reallocate the heap - if (const_cast(this)->reallocHeap(extraSpace, remoteAlloc)) { - // Back to the default location of the global heap - remoteAlloc ^= true; - if (!const_cast(this)->reallocHeap(0, remoteAlloc)) { - LogWarning("New memory type for the \ - global heap after reallocation!"); - } - hb = heap_->alloc(size); - } - } - } - - return hb; -} - gpu::Memory* Device::getGpuMemory(amd::Memory* mem) const { @@ -1392,99 +1264,20 @@ Device::createScratchBuffer(size_t size) const { Memory* gpuMemory = NULL; - // Use virtual heap allocation - if (heap()->isVirtual()) { - // Create a memory object - gpuMemory = new gpu::Memory(*this, size); - if (NULL == gpuMemory || !gpuMemory->create(Resource::Local)) { - delete gpuMemory; - gpuMemory = NULL; - } - } - else { - // We have to lock the heap block allocation, - // so possible reallocation won't occur twice or - // another thread could destroy a heap block, - // while we didn't finish allocation - amd::ScopedLock k(lockAsyncOps()); - - HeapBlock* hb = allocHeapBlock(size); - if (hb != NULL) { - // wrap it - gpuMemory = new gpu::Memory(*this, *hb); - - // Create resource - if (NULL != gpuMemory) { - Resource::ViewParams params; - params.offset_ = hb->offset_; - params.size_ = hb->size_; - params.resource_ = &(globalMem()); - params.memory_ = NULL; - if (!gpuMemory->create(Resource::View, ¶ms)) { - delete gpuMemory; - gpuMemory = NULL; - } - } - } - } - - return gpuMemory; -} - -gpu::Memory* -Device::createBufferFromHeap(amd::Memory& owner) const -{ - size_t size = owner.getSize(); - gpu::Memory* gpuMemory; - - // We have to lock the heap block allocation, - // so possible reallocation won't occur twice or - // another thread could destroy a heap block, - // while we didn't finish allocation - amd::ScopedLock k(lockAsyncOps()); - - HeapBlock* hb = allocHeapBlock(size); - if (hb == NULL) { - LogError("We don't have enough video memory!"); - return NULL; - } - // Create a memory object - gpuMemory = new gpu::Memory(*this, owner, hb); - if (NULL == gpuMemory) { - hb->setMemory(NULL); - hb->free(); - return NULL; - } - - Resource::ViewParams params; - params.owner_ = &owner; - params.offset_ = hb->offset_; - params.size_ = hb->size_; - params.resource_ = &(globalMem()); - params.memory_ = NULL; - - if (!gpuMemory->create(Resource::View, ¶ms)) { + gpuMemory = new gpu::Memory(*this, size); + if (NULL == gpuMemory || !gpuMemory->create(Resource::Local)) { delete gpuMemory; - return NULL; + gpuMemory = NULL; } - // Check if owner is interop memory - if (owner.isInterop()) { - if (!gpuMemory->createInterop(Memory::InteropHwEmulation)) { - LogError("HW interop creation failed!"); - delete gpuMemory; - return NULL; - } - } return gpuMemory; } gpu::Memory* Device::createBuffer( amd::Memory& owner, - bool directAccess, - bool bufferAlloc) const + bool directAccess) const { size_t size = owner.getSize(); gpu::Memory* gpuMemory; @@ -1504,39 +1297,7 @@ Device::createBuffer( return NULL; } - if (!heap()->isVirtual()) { - bool uhpAlloc = - (owner.parent()->getMemFlags() & CL_MEM_USE_HOST_PTR) ? true : false; - - if (owner.parent()->getType() != CL_MEM_OBJECT_IMAGE1D_BUFFER) { - //! \note This extra line is necessary to make sure that subbuffer - //! allocation is a synch operation, - //! due to a possible realloc of heap(no VM) or parent(UHP) - amd::ScopedLock k(lockAsyncOps()); - - //! @note: For now make sure the parent is allocated in the global heap - //! or if it's the UHP optimization for prepinned memory - if (((gpuParent->hb() == NULL) || uhpAlloc) && - !owner.parent()->reallocedDeviceMemory(this)) { - if (reallocMemory(*owner.parent())) { - gpuParent = getGpuMemory(owner.parent()); - } - else { - LogError("Can't reallocate the owner object for subbuffer allocation"); - return NULL; - } - } - - return gpuParent->createBufferView(owner); - } - else { - gpuParent = getGpuMemory(owner.parent()->parent()); - return gpuParent->createBufferView(*owner.parent()->parent()); - } - } - else { - return gpuParent->createBufferView(owner); - } + return gpuParent->createBufferView(owner); } Resource::MemoryType type = (owner.forceSysMemAlloc() || (owner.getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) ? @@ -1550,138 +1311,123 @@ Device::createBuffer( } // Use direct access if it's possible - if (bufferAlloc || (type == Resource::Remote)) { - bool forceHeapAlloc = false; - bool remoteAlloc = false; - // Internal means VirtualDevice!=NULL - bool internalAlloc = ((owner.getMemFlags() & CL_MEM_USE_HOST_PTR) && - (owner.getVirtualDevice() != NULL)) ? true : false; + bool remoteAlloc = false; + // Internal means VirtualDevice!=NULL + bool internalAlloc = ((owner.getMemFlags() & CL_MEM_USE_HOST_PTR) && + (owner.getVirtualDevice() != NULL)) ? true : false; - // Create a memory object - gpuMemory = new gpu::Buffer(*this, owner, owner.getSize()); - if (NULL == gpuMemory) { - return NULL; - } + // Create a memory object + gpuMemory = new gpu::Buffer(*this, owner, owner.getSize()); + if (NULL == gpuMemory) { + return NULL; + } - // Check if owner is interop memory - if (owner.isInterop()) { - result = gpuMemory->createInterop(Memory::InteropDirectAccess); - } - else if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { - // Attempt to allocate from persistent heap - result = gpuMemory->create(Resource::Persistent); - } - else if (directAccess || (type == Resource::Remote)) { - // Check for system memory allocations - if ((owner.getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_HOST_PTR)) - || (settings().remoteAlloc_)) { - // Allocate remote memory if AHP allocation and context has just 1 device - if ((owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR) && - (owner.getContext().devices().size() == 1)) { - if (owner.getMemFlags() & (CL_MEM_READ_ONLY | - CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) { - // GPU will be reading from this host memory buffer, - // so assume Host write into it - type = Resource::RemoteUSWC; - remoteAlloc = true; - } + // Check if owner is interop memory + if (owner.isInterop()) { + result = gpuMemory->createInterop(Memory::InteropDirectAccess); + } + else if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { + // Attempt to allocate from persistent heap + result = gpuMemory->create(Resource::Persistent); + } + else if (directAccess || (type == Resource::Remote)) { + // Check for system memory allocations + if ((owner.getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_HOST_PTR)) + || (settings().remoteAlloc_)) { + // Allocate remote memory if AHP allocation and context has just 1 device + if ((owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR) && + (owner.getContext().devices().size() == 1)) { + if (owner.getMemFlags() & (CL_MEM_READ_ONLY | + CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) { + // GPU will be reading from this host memory buffer, + // so assume Host write into it + type = Resource::RemoteUSWC; + remoteAlloc = true; } - // Make sure owner has a valid hostmem pointer and it's not COPY - if (!remoteAlloc && (owner.getHostMem() != NULL)) { - Resource::PinnedParams params; - params.owner_ = &owner; - params.gpu_ = - reinterpret_cast(owner.getVirtualDevice()); + } + // Make sure owner has a valid hostmem pointer and it's not COPY + if (!remoteAlloc && (owner.getHostMem() != NULL)) { + Resource::PinnedParams params; + params.owner_ = &owner; + params.gpu_ = + reinterpret_cast(owner.getVirtualDevice()); - params.hostMemRef_ = owner.getHostMemRef(); - params.size_ = owner.getHostMemRef()->size(); - if (0 == params.size_) { - params.size_ = owner.getSize(); - } - // Create memory object - result = gpuMemory->create(Resource::Pinned, ¶ms); + params.hostMemRef_ = owner.getHostMemRef(); + params.size_ = owner.getHostMemRef()->size(); + if (0 == params.size_) { + params.size_ = owner.getSize(); + } + // Create memory object + result = gpuMemory->create(Resource::Pinned, ¶ms); - // If direct access failed - if (!result) { - // and VM off, then force a heap allocation - if (!heap()->isVirtual()) { - // Internal pinning doesn't need a heap allocation - if (!internalAlloc) { - forceHeapAlloc = true; - } - } - // Don't use cached allocation - // if size is biger than max single alloc - if (owner.getSize() > info().maxMemAllocSize_) { - delete gpuMemory; - return NULL; - } + // If direct access failed + if (!result) { + // Don't use cached allocation + // if size is biger than max single alloc + if (owner.getSize() > info().maxMemAllocSize_) { + delete gpuMemory; + return NULL; } } } } + } - if (!result && !forceHeapAlloc && - // Make sure it's not internal alloc - !internalAlloc) { - Resource::CreateParams params; - params.owner_ = &owner; - params.gpu_ = static_cast(owner.getVirtualDevice()); + if (!result && + // Make sure it's not internal alloc + !internalAlloc) { + Resource::CreateParams params; + params.owner_ = &owner; + params.gpu_ = static_cast(owner.getVirtualDevice()); - // Create memory object - result = gpuMemory->create(type, ¶ms); + // Create memory object + result = gpuMemory->create(type, ¶ms); - // If allocation was successful - if (result) { - // Initialize if the memory is a pipe object - if (owner.getType() == CL_MEM_OBJECT_PIPE) { - // Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure. - // Init with 3 DWORDS for 32bit addressing and 6 DWORDS for 64bit - size_t pipeInit[3] = {0 , 0, owner.asPipe()->getMaxNumPackets()}; - gpuMemory->writeRawData(*xferQueue_, sizeof(pipeInit), pipeInit, true); + // If allocation was successful + if (result) { + // Initialize if the memory is a pipe object + if (owner.getType() == CL_MEM_OBJECT_PIPE) { + // Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure. + // Init with 3 DWORDS for 32bit addressing and 6 DWORDS for 64bit + size_t pipeInit[3] = {0 , 0, owner.asPipe()->getMaxNumPackets()}; + gpuMemory->writeRawData(*xferQueue_, sizeof(pipeInit), pipeInit, true); + } + // If memory has direct access from host, then get CPU address + if (gpuMemory->isHostMemDirectAccess() && + (type != Resource::ExternalPhysical)) { + void* address = gpuMemory->map(NULL); + if (address != NULL) { + // Copy saved memory + if (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) { + memcpy(address, owner.getHostMem(), owner.getSize()); + } + // It should be safe to change the host memory pointer, + // because it's lock protected from the upper caller + owner.setHostMem(address); } - // If memory has direct access from host, then get CPU address - if (gpuMemory->isHostMemDirectAccess() && - (type != Resource::ExternalPhysical)) { - void* address = gpuMemory->map(NULL); - if (address != NULL) { - // Copy saved memory - if (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) { - memcpy(address, owner.getHostMem(), owner.getSize()); - } - // It should be safe to change the host memory pointer, - // because it's lock protected from the upper caller - owner.setHostMem(address); - } - else { - result = false; - } - } - // An optimization for CHP. Copy memory and destroy sysmem allocation - else if ((gpuMemory->memoryType() != Resource::Pinned) && - (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) && - (owner.getContext().devices().size() == 1)) { - amd::Coord3D origin(0, 0, 0); - amd::Coord3D region(owner.getSize()); - static const bool Entire = true; - if (xferMgr().writeBuffer(owner.getHostMem(), - *gpuMemory, origin, region, Entire)) { - // Clear CHP memory - owner.setHostMem(NULL); - } + else { + result = false; + } + } + // An optimization for CHP. Copy memory and destroy sysmem allocation + else if ((gpuMemory->memoryType() != Resource::Pinned) && + (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) && + (owner.getContext().devices().size() == 1)) { + amd::Coord3D origin(0, 0, 0); + amd::Coord3D region(owner.getSize()); + static const bool Entire = true; + if (xferMgr().writeBuffer(owner.getHostMem(), + *gpuMemory, origin, region, Entire)) { + // Clear CHP memory + owner.setHostMem(NULL); } } - } - - if (!result && !forceHeapAlloc) { - delete gpuMemory; - return NULL; } } if (!result) { - assert(!heap()->isVirtual() && "Can't have static heap allocation with VM"); - gpuMemory = createBufferFromHeap(owner); + delete gpuMemory; + return NULL; } return gpuMemory; @@ -1703,10 +1449,10 @@ Device::createImage(amd::Memory& owner, bool directAccess) const } // Create a view on the specified device gpuImage = (gpu::Memory*)createView(owner, *devParent); - if (heap()->isVirtual() && (NULL != gpuImage) && (gpuImage->owner() != NULL)) { + if ((NULL != gpuImage) && (gpuImage->owner() != NULL)) { gpuImage->owner()->setHostMem((address)(owner.parent()->getHostMem()) + gpuImage->owner()->getOrigin()); } - return gpuImage ; + return gpuImage; } gpuImage = new gpu::Image(*this, owner, @@ -1778,11 +1524,11 @@ Device::createImage(amd::Memory& owner, bool directAccess) const (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) && (owner.getContext().devices().size() == 1)) { // Ignore copy for image1D_buffer, since it was already done for buffer - if (heap()->isVirtual() && imageBuffer) { + if (imageBuffer) { // Clear CHP memory owner.setHostMem(NULL); } - else if (!imageBuffer) { + else { amd::Coord3D origin(0, 0, 0); static const bool Entire = true; if (xferMgr().writeImage(owner.getHostMem(), @@ -1809,25 +1555,12 @@ Device::createMemory( amd::Memory& owner) const { bool directAccess = false; - bool bufferAlloc = false; gpu::Memory* memory = NULL; - if (heap()->isVirtual()) { - bufferAlloc = true; - } - //!@todo Remove this code when VM is always on. - // Use zero-copy transfers for sysmem allocations or persistent memory - else { - if (owner.getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | - CL_MEM_USE_HOST_PTR)) { - bufferAlloc = true; - } - } - if (owner.asBuffer()) { directAccess = (settings().hostMemDirectAccess_ & Settings::HostMemBuffer) ? true : false; - memory = createBuffer(owner, directAccess, bufferAlloc); + memory = createBuffer(owner, directAccess); } else if (owner.asImage()) { directAccess = (settings().hostMemDirectAccess_ & Settings::HostMemImage) @@ -1878,7 +1611,6 @@ bool Device::reallocMemory(amd::Memory& owner) const { bool directAccess = false; - bool bufferAlloc = heap()->isVirtual(); // For now we have to serialize reallocation code amd::ScopedLock lk(*lockAsyncOps_); @@ -1889,35 +1621,18 @@ Device::reallocMemory(amd::Memory& owner) const if (gpuMemory == NULL) { return false; } - if (gpuMemory->hb() != NULL) { + + if (gpuMemory->pinOffset() == 0) { return true; } - - if (bufferAlloc) { - if (gpuMemory->pinOffset() == 0) { - return true; - } - else if (NULL != owner.parent()) { - if (!reallocMemory(*owner.parent())) { - return false; - } + else if (NULL != owner.parent()) { + if (!reallocMemory(*owner.parent())) { + return false; } } if (owner.asBuffer()) { - // Disable remote allocation if no VM - if ((gpuMemory != NULL) && - ((gpuMemory->memoryType() == Resource::Remote) || - (gpuMemory->memoryType() == Resource::RemoteUSWC)) && !bufferAlloc) { - // Make sure we don't have a stale memory in VA cache before reallocation - // of system memory. - // \note: the app must unmap() memory before kernel launch - removeVACache(gpuMemory); - static const bool forceAllocHostMem = true; - static const bool forceCopy = true; - owner.allocHostMemory(owner.getHostMem(), forceAllocHostMem, forceCopy); - } - gpuMemory = createBuffer(owner, directAccess, bufferAlloc); + gpuMemory = createBuffer(owner, directAccess); } else if (owner.asImage()) { return true; @@ -2113,24 +1828,18 @@ Device::globalFreeMemory(size_t* freeMemory) const if (!(const_cast(this)->initializeHeapResources())) { return false; } - if (heap()->isVirtual()) { - gslMemInfo memInfo = {0}; - gslCtx()->getMemInfo(&memInfo, GSL_MEMINFO_BASIC); - // Fill free memory info - freeMemory[TotalFreeMemory] = (memInfo.cardMemAvailableBytes + - memInfo.cardExtMemAvailableBytes) / Ki; - freeMemory[LargestFreeBlock] = std::max(memInfo.cardLargestFreeBlockBytes, - memInfo.cardExtLargestFreeBlockBytes) / Ki; - if (settings().apuSystem_) { - freeMemory[TotalFreeMemory] += memInfo.agpMemAvailableBytes / Ki; - freeMemory[LargestFreeBlock] += memInfo.agpLargestFreeBlockBytes / Ki; - } - } - else { - freeMemory[TotalFreeMemory] = static_cast((info().globalMemSize_ - - static_cast(heapSize_) + heap()->freeSpace()) / Ki); - freeMemory[LargestFreeBlock] = freeMemory[TotalFreeMemory]; + gslMemInfo memInfo = {0}; + gslCtx()->getMemInfo(&memInfo, GSL_MEMINFO_BASIC); + + // Fill free memory info + freeMemory[TotalFreeMemory] = (memInfo.cardMemAvailableBytes + + memInfo.cardExtMemAvailableBytes) / Ki; + freeMemory[LargestFreeBlock] = std::max(memInfo.cardLargestFreeBlockBytes, + memInfo.cardExtLargestFreeBlockBytes) / Ki; + if (settings().apuSystem_) { + freeMemory[TotalFreeMemory] += memInfo.agpMemAvailableBytes / Ki; + freeMemory[LargestFreeBlock] += memInfo.agpLargestFreeBlockBytes / Ki; } return true; diff --git a/rocclr/runtime/device/gpu/gpudevice.hpp b/rocclr/runtime/device/gpu/gpudevice.hpp index 4072ed9670..045ee2b40c 100644 --- a/rocclr/runtime/device/gpu/gpudevice.hpp +++ b/rocclr/runtime/device/gpu/gpudevice.hpp @@ -125,8 +125,7 @@ protected: const CALdeviceattribs& calAttr, //!< CAL device attributes info const gslMemInfo& memInfo, //!< GSL mem info size_t maxTextureSize, //!< Maximum texture size supported in HW - uint numComputeRings, //!< Number of compute rings - bool isVirtualMode //!< Device is in virtual mode + uint numComputeRings //!< Number of compute rings ); }; @@ -184,6 +183,32 @@ private: class Device : public NullDevice, public CALGSLDevice { public: + class Heap : public amd::EmbeddedObject + { + public: + //! The size of a heap element in bytes + static const size_t ElementSize = 4; + + //! The type of a heap element in bytes + static const cmSurfFmt ElementType = CM_SURF_FMT_R32I; + + Heap(): resource_(NULL), baseAddress_(0) {} + + bool create( + Device& device //!< GPU device object + ); + + //! Gets the GPU resource associated with the global heap + const Resource& resource() const { return *resource_; } + + //! Returns the base virtual address of the heap + uint64_t baseAddress() const { return baseAddress_; } + + protected: + Resource* resource_; //!< GPU resource referencing the heap memory + uint64_t baseAddress_; //!< Virtual heap base address + }; + //! Locks any access to the virtual GPUs class ScopedLockVgpus : public amd::StackObject { public: @@ -377,12 +402,6 @@ public: //! Destructor for the physical GPU device virtual ~Device(); - //! Reallocates current global heap - bool reallocHeap( - size_t size, //!< requested size for reallocation - bool remoteAlloc //!< allocate the new heap in remote memory - ); - //! Instantiate a new virtual device device::VirtualDevice* createVirtualDevice( amd::CommandQueue* queue = NULL @@ -442,15 +461,10 @@ public: ) const; //! Gets the GPU resource associated with the global heap - const Resource& globalMem() const { return heap_->resource(); } + const Resource& globalMem() const { return heap_.resource(); } //! Gets the global heap object - const Heap* heap() const { return heap_; } - - //! Allocates a heap block from the global heap - HeapBlock* allocHeapBlock( - size_t size //!< The heap block size for allocation - ) const; + const Heap& heap() const { return heap_; } //! Gets the memory object for the dummy page amd::Memory* dummyPage() const { return dummyPage_; } @@ -566,16 +580,10 @@ private: //! Sends the stall command to all queues bool stallQueues(); - //! Buffer allocation from static heap (no VM mode only) - gpu::Memory* createBufferFromHeap( - amd::Memory& owner //!< Abstraction layer memory object - ) const; - //! Buffer allocation gpu::Memory* createBuffer( amd::Memory& owner, //!< Abstraction layer memory object - bool directAccess, //!< Use direct host memory access - bool bufferAlloc //!< If TRUE, then don't use heap + bool directAccess //!< Use direct host memory access ) const; //! Image allocation @@ -591,8 +599,7 @@ private: ); amd::Context* context_; //!< A dummy context for internal allocations - size_t heapSize_; //!< The global heap size - Heap* heap_; //!< GPU heap manager + Heap heap_; //!< GPU global heap amd::Memory* dummyPage_; //!< A dummy page for NULL pointer amd::Monitor* lockAsyncOps_; //!< Lock to serialise all async ops on this device diff --git a/rocclr/runtime/device/gpu/gpuheap.cpp b/rocclr/runtime/device/gpu/gpuheap.cpp deleted file mode 100644 index 28cc32ed29..0000000000 --- a/rocclr/runtime/device/gpu/gpuheap.cpp +++ /dev/null @@ -1,536 +0,0 @@ -//! Implementation of GPU device memory management - -#include "top.hpp" -#include "thread/thread.hpp" -#include "thread/monitor.hpp" -#include "device/device.hpp" -#include "device/gpu/gpuheap.hpp" -#include "device/gpu/gpudevice.hpp" - -#include -#include -#include -#include - -//! Turn this on to enable sanity checks before and after every heap operation. -#if DEBUG -#define EXTRA_HEAP_CHECKS 1 -#endif // DEBUG - -namespace gpu { - -// The GPU heap. Very simple implementation for now. -Heap::Heap( - Device& device) - : resource_(NULL) - , freeList_(NULL) - , busyList_(NULL) - , freeSize_(0) - , device_(device) - , granularity_(Heap::MinGranularity) - , lock_("GPU heap lock", true) - , virtualMode_(false) - , baseAddress_(0) -{ -} - -size_t -Heap::granularityB() const -{ - return granularity_ * Heap::ElementSize; -} - -bool -Heap::create(size_t totalSize, bool remoteAlloc) -{ - Resource::MemoryType memType; - size_t maxHeight = device_.info().image2DMaxHeight_; - size_t sizeInElements; - size_t npages; - - freeSize_ = totalSize; - - sizeInElements = (totalSize + Heap::ElementSize - 1) / Heap::ElementSize; - - // Calculate best granularity given the size and device characteristics - npages = amd::alignUp(sizeInElements, granularity_) / granularity_; - - // Create a new GPU resource - resource_ = new Resource(device_, sizeInElements, Heap::ElementType); - - if (resource_ == NULL) { - return false; - } - - memType = (remoteAlloc) ? Resource::RemoteUSWC : Resource::Local; - - if (!resource_->create(memType, NULL, true)) { - return false; - } - - // Set up initial free list - freeList_ = new HeapBlock(this, npages * granularityB(), 0, NULL, NULL); - if (freeList_ == NULL) { - return false; - } - - guarantee(isSane()); - return true; -} - -Heap::~Heap() -{ - amd::ScopedLock k(lock_); - - guarantee(isSane()); - - // Release all heap blocks - HeapBlock *walk, *next; - walk = busyList_; - while (walk) { - next = walk->next_; - walk->free(); - walk = next; - } - - walk = freeList_; - while (walk) { - next = walk->next_; - delete walk; - walk = next; - } - - // Release resource - delete resource_; -} - -HeapBlock* -Heap::alloc(size_t size) -{ - amd::ScopedLock k(lock_); - HeapBlock* walk = freeList_; - HeapBlock* best = NULL; - - guarantee(isSane()); - - // Round size - size = amd::alignUp(size, granularityB()); - - // Walk the free list looking for a suitable block (currently best-fit) - //! @todo:dgladdin: experiment with switching back to first-fit - - while (walk) { - if ((walk->size_ > size) && - (best == NULL || walk->size_ < best->size_)) { - best = walk; - } - else if (walk->size_ == size) { - // No need to split, just move to busy list - detachBlock(&freeList_, walk); - walk->inUse_ = true; - insertBlock(&busyList_, walk); - guarantee(isSane()); - freeSize_ -= size; - return walk; - } - walk = walk->next_; - } - - if (best != NULL) { - // Got one, but need to split it. Keep first part in free list, - // put second part into busy list. - HeapBlock *newblock = splitBlock(best, size); - newblock->inUse_ = true; - insertBlock(&busyList_, newblock); - guarantee(isSane()); - freeSize_ -= size; - return newblock; - } - - // No free block available - guarantee(isSane()); - return NULL; -} - -bool -Heap::copyTo(Heap* heap) -{ - HeapBlock *walk; - - walk = busyList_; - while (walk) { - if (walk->getMemory() != NULL) { - HeapBlock* hb = heap->alloc(walk->size_); - if (hb == NULL) { - return false; - } - hb->setMemory(walk->getMemory()); - - walk->destroyViewsMemory(); - if (!walk->getMemory()->reallocate(hb, &(heap->resource()))) { - return false; - } - - if (!walk->reallocateViews(hb, - static_cast(hb->offset_ - walk->offset_))) { - return false; - } - } - walk = walk->next_; - } - - return true; -} - -void -Heap::free(HeapBlock* blk) -{ - amd::ScopedLock k(lock_); - guarantee(isSane()); - detachBlock(&busyList_, blk); - blk->inUse_ = false; - freeSize_ += blk->size_; - mergeBlock(&freeList_, blk); - guarantee(isSane()); -} - -void -Heap::detachBlock(HeapBlock** list, HeapBlock* blk) -{ - // Sanity checks - guarantee(isSane()); - - if (*list == blk) { - *list = blk->next_; - } - - if (blk->prev_) { - blk->prev_->next_ = blk->next_; - } - if (blk->next_) { - blk->next_->prev_ = blk->prev_; - } - // no heap sanity check as blk is now floating -} - -void -Heap::insertBlock(HeapBlock** head, HeapBlock* blk) -{ - if (NULL == *head) { - *head = blk; - blk->prev_ = NULL; - blk->next_ = NULL; - guarantee(isSane()); - return; - } - - // Find the place to insert it at - HeapBlock* walk = *head; - while (walk->next_ && walk->next_->offset_ < blk->offset_) { - walk = walk->next_; - } - - // Insert it - if (walk == *head) { - if (walk->offset_ >= blk->offset_) { - *head = blk; - blk->prev_ = NULL; - blk->next_ = walk; - walk->prev_ = *head; - guarantee(isSane()); - return; - } - } - - blk->next_ = walk->next_; - blk->prev_ = walk; - if (walk->next_) { - walk->next_->prev_ = blk; - } - walk->next_ = blk; - guarantee(isSane()); -} - -HeapBlock* -Heap::splitBlock(HeapBlock* blk, size_t tailsize) -{ - // Sanity checks - - guarantee(isSane()); - guarantee(blk->size_ > tailsize && "block too small to split as requested"); - guarantee(!blk->inUse_ && "can't split in-use block"); - - // Create a new block - - HeapBlock* nb = new HeapBlock(blk->owner_, tailsize, - blk->offset_ + blk->size_ - tailsize); - - // Resize the old block - - blk->size_ = blk->size_ - tailsize; - return nb; // no heap sanity check here as the new block hasn't been plugged in yet -} - -//! Join two blocks, transferring the size of the second into the first and deleting -//! the second. Utility fn for mergeBlock() - -static void -join2Blocks(HeapBlock* first, HeapBlock* second) -{ - // Sanity checks - - guarantee(first->size_ > 0 && "first block invalid"); - guarantee(!first->inUse_ && "can't join an in-use block"); - guarantee(second->size_ > 0 && "second block invalid"); - guarantee(first->offset_ + first->size_ == second->offset_); - - // Do the join - first->size_ = first->size_ + second->size_; - first->next_ = second->next_; - if (second->next_) { - second->next_->prev_ = first; - } - delete second; -} - -//! Insert a block into a list, merging it with adjacent blocks if possible. Must be called -//! under a lock, cannot be used on in-use blocks or blocks with an associated resource alias. - -void -Heap::mergeBlock(HeapBlock** head, HeapBlock* blk) -{ - insertBlock(head, blk); - - // Merge with successor if possible - if ((blk->next_ != NULL) && - (blk->offset_ + blk->size_ == blk->next_->offset_)) { - join2Blocks(blk, blk->next_); - } - - // Merge with predecessor if possible - if ((blk->prev_ != NULL) && - (blk->prev_->offset_ + blk->prev_->size_ == blk->offset_)) { - join2Blocks(blk->prev_, blk); - } - - guarantee(isSane()); -} - -//! Sanity check for both types of block (helper function for Heap::isSane()) - -static bool -isBlockSane(HeapBlock* b) -{ - return (b->owner_ != NULL - && (b->next_ == NULL || b->next_->prev_ == b) - && (b->prev_ == NULL || b->prev_->next_ == b)); -} - -//! Sanity check for an individual free block (helper function for Heap::isSane()) -static bool -isFreeBlockSane(HeapBlock* b) -{ - if (isBlockSane(b) && !b->inUse_) { - return true; - } else { - return false; - } -} - -//! Sanity check for an individual busy block (helper function for Heap::isSane()) -static bool -isBusyBlockSane(HeapBlock* b) -{ - if (isBlockSane(b) && b->inUse_) { - return true; - } else { - return false; - } -} - -//! Sanity check for the heap. - -bool -Heap::isSane() const -{ - // If we got this far, everything is (probably) OK -#if EXTRA_HEAP_CHECKS - HeapBlock* walkFree = freeList_; // Free list position - HeapBlock* walkBusy = busyList_; // Busy list position - size_t offset = 0; // Current offset - - // We can have zero lists if Heap allocation fails - if (walkFree == NULL && walkBusy == NULL) { - return true; - } - - // Walk both lists in parallel - while (walkFree != NULL || walkBusy != NULL) { - if (walkFree != NULL && walkFree->offset_ == offset) { - if (!isFreeBlockSane(walkFree)) { - return false; - } - offset += walkFree->size_; - walkFree = walkFree->next_; - } - else if (walkBusy != NULL && walkBusy->offset_ == offset) { - if (!isBusyBlockSane(walkBusy)) { - return false; - } - offset += walkBusy->size_; - walkBusy = walkBusy->next_; - } - else { - return false; - } - } - -#endif // EXTRA_HEAP_CHECKS - return true; -} - -void -HeapBlock::destroyViewsMemory() -{ - if ((parent_ != NULL) && (0 == views_.size())) { - memory_->free(); - } - else if (views_.size() != 0) { - std::list::const_iterator it; - for (it = views_.begin(); it != views_.end(); ++it) { - (*it)->destroyViewsMemory(); - } - } -} - -bool -HeapBlock::reallocateViews(HeapBlock* parent, size_t shift) -{ - if (views_.size() != 0) { - std::list::const_iterator it; - - // Loop through all views and reallocate them - for (it = views_.begin(); it != views_.end(); ++it) { - // Get the view HeapBlock - HeapBlock* hb = (*it); - - // Readjust the offset - hb->offset_ += shift; - // Add to the list if we have a new parent - if (parent != this) { - parent->addView(hb); - } - - // Reallocate memory - hb->memory_->reallocate(hb, parent->getMemory()); - - // Process a view on view if available - if (!hb->reallocateViews(hb, shift)) { - return false; - } - } - - // Destroy old list - if (parent != this) { - views_.clear(); - } - } - return true; -} - -//! Destructor. Frees the block if in use and does some final sanity checks. -HeapBlock::~HeapBlock() -{ - if (NULL != owner_) { - if (inUse_) { - owner_->free(this); - } - } - else { - // View destruction - if (parent_ != NULL) { - assert(((parent_->getMemory() != NULL) && (parent_->getMemory()->owner() != NULL))); - amd::ScopedLock lock(parent_->getMemory()->owner()->lockMemoryOps()); - parent_->removeView(this); - } - } - guarantee(size_ > 0 && "destructor called for zero-size heap block (destructor called twice?)"); - size_ = 0; // Mark as invalid - - if (views_.size() != 0) { - LogError("Can't destroy a resource if we still have views!"); - } -} - -void -HeapBlock::free() -{ - if (NULL != owner_) { - owner_->free(this); - } - else { - // It's a view. Destroy the object - delete this; - } -} - -VirtualHeap::VirtualHeap( - Device& device) - : Heap(device) -{ - virtualMode_ = true; -} - -bool -VirtualHeap::create( - size_t totalSize, - bool remoteAlloc) -{ - // Create a new GPU resource - resource_ = new Resource(device_, 0, Heap::ElementType); - if (resource_ == NULL) { - return false; - } - - if (!resource_->create(Resource::Heap)) { - return false; - } - - if (!device_.settings().hsail_) { - baseAddress_ = resource_->gslResource()->getSurfaceAddress(); - } - return true; -} - -VirtualHeap::~VirtualHeap() -{ -} - -HeapBlock* -VirtualHeap::alloc(size_t size) -{ - assert(false && "Dead branch!"); - return NULL; -} - -void -VirtualHeap::free(HeapBlock* blk) -{ - assert(false && "Dead branch!"); -} - -bool -VirtualHeap::copyTo(Heap* heap) -{ - assert(false && "Dead branch!"); - return false; -} - -bool -VirtualHeap::isSane(void) const -{ - assert(false && "Dead branch!"); - return true; -} - -} // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpuheap.hpp b/rocclr/runtime/device/gpu/gpuheap.hpp deleted file mode 100644 index b38f316446..0000000000 --- a/rocclr/runtime/device/gpu/gpuheap.hpp +++ /dev/null @@ -1,225 +0,0 @@ -//! Declarations for GPU memory management - -#ifndef GPUHEAP_HPP_ -#define GPUHEAP_HPP_ - -#include "top.hpp" -#include "thread/atomic.hpp" -#include "device/gpu/gpudefs.hpp" - -/*! \addtogroup GPU - * @{ - */ - -//! GPU Device Implementation - -namespace gpu { - -class Device; -class Heap; -class Resource; -class Memory; -class VirtualGPU; - -//! @todo:dgladdin: The heap list should be singly-linked - -//! \brief A block on the GPU heap. -//! -//! Note that no code outside of the gpumemory.hpp/.cpp pair should touch this -//! class directly as it is not thread-safe. In general, this class should be -//! pretty much a struct and contain as little functionality as possible - just -//! a constructor, destructor. -//! -//! Any other methods - in particular, anything that talks to CAL - should be no -//! more than proxies for functionality implemented in Heap, as Heap is aware -//! of the lock state. - -class HeapBlock : public amd::HeapObject -{ -public: - //! Constructor - HeapBlock( - Heap* owner = NULL, - size_t size = 0, - size_t offset = 0, - HeapBlock* next=NULL, - HeapBlock* prev=NULL) - : owner_(owner) - , size_(size) - , offset_(offset) - , next_(next) - , prev_(prev) - , inUse_(false) - , parent_(NULL) - , memory_(NULL) - {} - - //! Destructor does some sanity checks. - ~HeapBlock(); - - //! Frees a heap block, returning its memory to the owning heap (proxy) - void free(); - - //! Sets the GPU memory object associated with the heap block - void setMemory(Memory* memory) { memory_ = memory; } - - //! Gets the GPU memory object associated with the heap block - Memory* getMemory() const { return memory_; } - - //! Adds a heapblock view to the list of views - void addView(HeapBlock* hb) - { views_.push_back(hb); hb->parent_ = this; } - - //! Removes a heapblock view from the list of views - void removeView(HeapBlock* hb) { views_.remove(hb); } - - //! Destroys all views - void destroyViewsMemory(); - - //! Creates all new views - bool reallocateViews( - HeapBlock* parent, //!< Parent heap block - size_t shift //!< The new HeapBlock shift - ); - - //! Gets the offset - size_t offset() const { return offset_; } - - Heap* owner_; //!< Heap that owns this block - size_t size_; //!< Size of the block in bytes - size_t offset_; //!< Offset of this block in the heap - HeapBlock* next_; //!< Next block on the list, or NULL - HeapBlock* prev_; //!< Previous block on the list, or NULL - bool inUse_; //!< true if the block is in use - HeapBlock* parent_; //!< The parent heap block for a view - -private: - //! Disable copy constructor - HeapBlock(const HeapBlock&); - - //! Disable assignment - HeapBlock& operator=(const HeapBlock&); - - Memory* memory_; //!< Memory object associated with the heap block - std::list views_; //!< The list of all allocated views -}; - -class Heap : public amd::HeapObject -{ -public: - //! Minimal supported CAL granularity = 256 bytes / ElementSize - static const size_t MinGranularity = 64; - - //! The size of a heap element in bytes - static const size_t ElementSize = 4; - - //! The type of a heap element in bytes - static const cmSurfFmt ElementType = CM_SURF_FMT_R32I; - - Heap( - Device& device //!< GPU device object - ); - - virtual bool create( - size_t totalSize, //!< total size of the allocated heap (bytes) - bool remoteAlloc //!< allocate the heap in remote memory - ); - - //! Heap destructor - virtual ~Heap(); - - /*! - * \brief Allocates memory from a heap (best-fit). - * We round up to 4k granularity for alignment. - * - * \return A pointer to allocated heap block object. - */ - virtual HeapBlock* alloc( - size_t size //! The allocation size - ); - - //! Release memory back to a heap. - virtual void free(HeapBlock* blk); - - //! Copies this heap to another - virtual bool copyTo(Heap* heap); - - //! Gets the GPU resource associated with the global heap - const Resource& resource() const { return *resource_; } - - //! Read the page size (bytes) - size_t granularityB() const; - - //! Read the total free space (bytes) - size_t freeSpace() const { return freeSize_; } - - virtual bool isSane(void) const; //!< Checks heap sanity - - //! Returns true if we have a virtual heap - bool isVirtual() const { return virtualMode_; } - - //! Returns the base virtual address of the heap - uint64_t baseAddress() const { return baseAddress_; } - -private: - //! Insert a block into a list. Must be called under a lock. - void insertBlock(HeapBlock** list, HeapBlock* node); - - //! Merge a block into a list. Must be called under a lock. - void mergeBlock(HeapBlock** list, HeapBlock* node); - - //! Remove a block from a list. Must be called under a lock. - void detachBlock(HeapBlock** list, HeapBlock* node); - - //! Split a block into two pieces - HeapBlock* splitBlock(HeapBlock* node, size_t size); - -protected: - Resource* resource_; //!< GPU resource referencing the heap memory - HeapBlock* freeList_; //!< Head block for free list - HeapBlock* busyList_; //!< Head block for busy list - size_t freeSize_; //!< total free size of the heap - Device& device_; //!< Device that owns this heap - size_t granularity_; //!< Size of an allocation page - amd::Monitor lock_; //!< Lock to serialise heap accesses - bool virtualMode_; //!< Virtual mode - uint64_t baseAddress_; //!< Virtual heap base address -}; - -class VirtualHeap : public Heap -{ -public: - VirtualHeap( - Device& device //!< GPU device object - ); - - virtual bool create( - size_t totalSize, //!< total size of the allocated heap (bytes) - bool remoteAlloc //!< allocate the heap in remote memory - ); - - //! Heap destructor - virtual ~VirtualHeap(); - - /*! - * \brief Allocates memory from a heap (best-fit). - * We round up to 4k granularity for alignment. - * - * \return A pointer to allocated heap block object. - */ - virtual HeapBlock* alloc( - size_t size //! The allocation size - ); - - //! Release memory back to a heap. - virtual void free(HeapBlock* blk); - - //! Copies this heap to another - virtual bool copyTo(Heap* heap); - - virtual bool isSane(void) const; //!< Checks heap sanity -}; - -} // namespace gpu - -#endif // GPUHEAP_HPP_ diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp index 0ffabf5468..8f511311e7 100644 --- a/rocclr/runtime/device/gpu/gpukernel.cpp +++ b/rocclr/runtime/device/gpu/gpukernel.cpp @@ -824,17 +824,6 @@ Kernel::create( // Initialize the kernel parameters bool result = initParameters(); - if (!dev().heap()->isVirtual()) { - amd::option::Options *options = nullProg().getCompilerOptions(); - // @todo Remove this. This is a hack for no VM mode - if (!options->oVariables->EnableDumpKernel) { - if (!name().compare(BlitName[KernelBlitManager::BlitCopyImageToBuffer]) || - !name().compare(BlitName[KernelBlitManager::BlitCopyBufferToImage])) { - blitKernelHack_ = true; - } - } - } - // Wave limiter needs to be initialized after kernel metadata is parsed // Since it depends on it. waveLimiter_.enable(); @@ -855,7 +844,6 @@ Kernel::Kernel( const Program& prog, const InitData* initData) : NullKernel(name, gpuDev, prog) - , blitKernelHack_(false) , waveLimiter_(this) { hwPrivateSize_ = 0; @@ -1603,10 +1591,6 @@ Kernel::debug(VirtualGPU& gpu) const { std::fstream stubWrite; address src = NULL; - if (!dev().heap()->isVirtual()) { - src = reinterpret_cast
- (const_cast(dev().globalMem()).map(&gpu)); - } std::cerr << "--- " << name_ << " ---" << std::endl; for (uint i = 0; i < arguments_.size(); ++i) { @@ -1689,9 +1673,6 @@ Kernel::debug(VirtualGPU& gpu) const stubWrite.close(); } } - if (!dev().heap()->isVirtual()) { - const_cast(dev().globalMem()).unmap(&gpu); - } } bool @@ -1824,18 +1805,10 @@ Kernel::setArgument( type = ArgumentBuffer; } else { - if (blitKernelHack_) { - // Bind global buffer to UAV this buffer is bound to - if (!bindResource(gpu, *gpuMem, 0, GlobalBuffer, uavRaw_)) { - return false; - } - } - else { - // Bind global buffer to UAV this buffer is bound to - if (!bindResource(gpu, dev().globalMem(), 0, - GlobalBuffer, uavRaw_)) { - return false; - } + // Bind global buffer to UAV this buffer is bound to + if (!bindResource(gpu, dev().globalMem(), 0, + GlobalBuffer, uavRaw_)) { + return false; } } @@ -1848,11 +1821,9 @@ Kernel::setArgument( // Update offset only if we bind HeapBuffer or // it's global address space in UAV setup on SI+ - if (!blitKernelHack_) { - offset += gpuMem->hbOffset(); - if (!forceZeroOffset) { - assert((offset != 0) && "Offset 0 with a real allocation!"); - } + offset += gpuMem->hbOffset(); + if (!forceZeroOffset) { + assert((offset != 0) && "Offset 0 with a real allocation!"); } gpu.addVmMemory(gpuMem); } @@ -2253,10 +2224,9 @@ Kernel::bindResource( gslMemObject gslMem = NULL; // Use global address space on SI+ for UAV setup - if (((type == ArgumentBuffer) || (type == ArgumentCbID) || - (type == ArgumentUavID) || (type == ArgumentPrintfID)) && - !blitKernelHack_) { - gslMem = dev().heap()->resource().gslResource(); + if ((type == ArgumentBuffer) || (type == ArgumentCbID) || + (type == ArgumentUavID) || (type == ArgumentPrintfID)) { + gslMem = dev().heap().resource().gslResource(); } else { gslMem = resource.gslResource(); @@ -2803,7 +2773,7 @@ NullKernel::parseArguments(const std::string& metaData, uint* uavRefCount) case KernelArg::PointerPrivate: // Check if can't use a dedicated UAV, // so realloc memory in the heap - arg->memory_.realloc_ = isRealloc(); + arg->memory_.realloc_ = false; arg->memory_.uavBuf_ = true; break; case KernelArg::PointerHwConst: diff --git a/rocclr/runtime/device/gpu/gpukernel.hpp b/rocclr/runtime/device/gpu/gpukernel.hpp index b46242ec2d..c89b9e1589 100644 --- a/rocclr/runtime/device/gpu/gpukernel.hpp +++ b/rocclr/runtime/device/gpu/gpukernel.hpp @@ -450,9 +450,6 @@ public: uint instructionCnt() const { return instructionCnt_; } protected: - //! Returns TRUE if memory should be reallocated, returns FALSE always for NullDevice - virtual bool isRealloc() const { return false; } - /*! \brief Parses the metadata structure for the kernel, * provided by the OpenCL compiler * @@ -673,9 +670,6 @@ protected: */ bool initConstBuffers(); - //! Returns TRUE if memory should be reallocated, returns FALSE always for NullDevice - virtual bool isRealloc() const { return !dev().heap()->isVirtual(); } - private: //! Disable copy constructor Kernel(const Kernel&); @@ -771,9 +765,6 @@ private: uint hwPrivateSize_; //!< initial HW private size uint hwLocalSize_; //!< initial HW local size - //! @todo remove the blit kernel hack - bool blitKernelHack_; //!< No VM hack for kernel blit - WaveLimiterManager waveLimiter_; //!< adaptively control number of waves }; diff --git a/rocclr/runtime/device/gpu/gpumemory.cpp b/rocclr/runtime/device/gpu/gpumemory.cpp index 3dacc145fc..0534ffe201 100644 --- a/rocclr/runtime/device/gpu/gpumemory.cpp +++ b/rocclr/runtime/device/gpu/gpumemory.cpp @@ -30,39 +30,24 @@ namespace gpu { Memory::Memory( const Device& gpuDev, amd::Memory& owner, - HeapBlock* hb, size_t size) : device::Memory(owner) - , Resource(gpuDev, ((hb) ? hb->size_ : size) / Heap::ElementSize, Heap::ElementType) - , hb_(hb) + , Resource(gpuDev, size / Device::Heap::ElementSize, Device::Heap::ElementType) { init(); - if (NULL != hb_) hb_->setMemory(this); - if (owner.parent() != NULL) { flags_ |= SubMemoryObject; } } -Memory::Memory( - const Device& gpuDev, - HeapBlock& hb) - : device::Memory(hb.size_) - , Resource(gpuDev, hb.size_ / Heap::ElementSize, Heap::ElementType) - , hb_(&hb) -{ - init(); - hb.setMemory(this); -} - Memory::Memory( const Device& gpuDev, size_t size) : device::Memory(size) , Resource(gpuDev, - amd::alignUp(size, Heap::ElementSize) / Heap::ElementSize, Heap::ElementType) - , hb_(NULL) + amd::alignUp(size, Device::Heap::ElementSize) / + Device::Heap::ElementSize, Device::Heap::ElementType) { init(); } @@ -75,7 +60,6 @@ Memory::Memory( ) : device::Memory(owner) , Resource(gpuDev, width, format) - , hb_(NULL) { init(); @@ -92,7 +76,6 @@ Memory::Memory( ) : device::Memory(size) , Resource(gpuDev, width, format) - , hb_(NULL) { init(); } @@ -110,7 +93,6 @@ Memory::Memory( ) : device::Memory(owner) , Resource(gpuDev, width, height, depth, format, chOrder, imageType, mipLevels) - , hb_(NULL) { init(); @@ -132,7 +114,6 @@ Memory::Memory( ) : device::Memory(size) , Resource(gpuDev, width, height, depth, format, chOrder, imageType, mipLevels) - , hb_(NULL) { init(); } @@ -197,14 +178,9 @@ Memory::create( break; case Resource::Remote: case Resource::RemoteUSWC: - // @todo Enable unconditional optimization for remote memory - if ((owner() != NULL && - owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) || - (hb() == NULL)) { - if (!cal()->tiled_) { - // Marks memory object for direct GPU access to the host memory - flags_ |= HostMemoryDirectAccess; - } + if (!cal()->tiled_) { + // Marks memory object for direct GPU access to the host memory + flags_ |= HostMemoryDirectAccess; } break; case Resource::View: { @@ -481,8 +457,8 @@ Memory::createInterop(InteropType type) else { // Allocate Resource object for interop as buffer interopMemory_ = new Memory(dev(), size(), - amd::alignUp(size(), Heap::ElementSize) / Heap::ElementSize, - Heap::ElementType); + amd::alignUp(size(), Device::Heap::ElementSize) / Device::Heap::ElementSize, + Device::Heap::ElementType); // Create the interop object in CAL if (NULL == interopMemory_ || !interopMemory_->create(memType, createParams)) { @@ -502,14 +478,6 @@ Memory::~Memory() // Clean VA cache dev().removeVACache(this); - // Release associated heap block, if any - if (hb_) { - // Protect heap block from simultaneous release with realloc - amd::ScopedLock k(dev().lockAsyncOps()); - hb_->setMemory(NULL); - hb_->free(); - } - delete interopMemory_; // Release associated map target, if any @@ -531,35 +499,6 @@ Memory::~Memory() } } -bool -Memory::reallocate(HeapBlock* hb, const Resource* parent) -{ - Resource::ViewParams params; - params.size_ = hb->size_; - params.resource_ = parent; - params.memory_ = NULL; - - // Check if it's a view reallocation - if (NULL != hb->parent_) { - // The offset inside the view is unchanged - params.offset_ = Resource::offset(); - - // Create a new view - if (Resource::create(Resource::View, ¶ms)) { - hb_ = hb; - return true; - } - } - else { - params.offset_ = hb->offset_; - if (Resource::reallocate(¶ms)) { - hb_ = hb; - return true; - } - } - return false; -} - void Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags) { @@ -814,33 +753,13 @@ Memory::createBufferView(amd::Memory& subBufferOwner) { gpu::Memory* viewMemory; Resource::ViewParams params; - HeapBlock* hb = NULL; size_t offset = subBufferOwner.getOrigin(); size_t size = subBufferOwner.getSize(); - if (!dev().heap()->isVirtual()) { - if (NULL == hb_) { - LogError("HeapBlock must be initialized!"); - return NULL; - } - - hb = new HeapBlock(NULL, size, offset + hb_->offset()); - if (hb == NULL) { - LogError("We don't have enough video memory!"); - return NULL; - } - amd::ScopedLock lock(owner()->lockMemoryOps()); - hb_->addView(hb); - } - // Create a memory object - viewMemory = new gpu::Memory(dev(), subBufferOwner, hb, size); + viewMemory = new gpu::Memory(dev(), subBufferOwner, size); if (NULL == viewMemory) { - if (hb != NULL) { - hb->setMemory(NULL); - hb->free(); - } return NULL; } diff --git a/rocclr/runtime/device/gpu/gpumemory.hpp b/rocclr/runtime/device/gpu/gpumemory.hpp index c6ccb4b23e..503ca42a34 100644 --- a/rocclr/runtime/device/gpu/gpumemory.hpp +++ b/rocclr/runtime/device/gpu/gpumemory.hpp @@ -8,7 +8,6 @@ #include "top.hpp" #include "thread/atomic.hpp" #include "device/gpu/gpuresource.hpp" -#include "device/gpu/gpuheap.hpp" #include "device/gpu/gpudevice.hpp" #include @@ -27,7 +26,6 @@ class Heap; class Resource; class Memory; class VirtualGPU; -class HeapBlock; //! GPU memory object. // Wrapper that can contain a heap block or an interop buffer/image. @@ -44,14 +42,8 @@ public: Memory( const Device& gpuDev, amd::Memory& owner, - HeapBlock* hb, size_t size = 0); - //! Constructor (nonfat version for local scratch mem use) - Memory( - const Device& gpuDev, - HeapBlock& hb); - //! Constructor (nonfat version for local scratch mem use without heap block) Memory( const Device& gpuDev, @@ -102,12 +94,6 @@ public: //! Default destructor ~Memory(); - //! Reallocates the memory object in the new heap block - bool reallocate( - HeapBlock* hb, //! The new heap block for this memory object - const Resource* parent //! Parent resource for view reallocaiton - ); - //! Creates the interop memory bool createInterop( InteropType type //!< The interop type @@ -189,9 +175,6 @@ public: //! Sets interop type for this memory object void setInteropType(InteropType type) { interopType_ = type; } - //! Returns the HeapBlock pointer - const HeapBlock* hb() const { return hb_; } - //! Set the owner void setOwner(amd::Memory* owner) { owner_ = owner; } @@ -229,7 +212,6 @@ private: InteropType interopType_; //!< Interop type Memory* interopMemory_; //!< interop memory - HeapBlock* hb_; //!< Heap Block, or NULL if not in-heap memory Memory* pinnedMemory_; //!< Memory used as pinned system memory const Memory* parent_; //!< Parent memory object }; diff --git a/rocclr/runtime/device/gpu/gpuresource.cpp b/rocclr/runtime/device/gpu/gpuresource.cpp index 85feb33c5e..248812b8c1 100644 --- a/rocclr/runtime/device/gpu/gpuresource.cpp +++ b/rocclr/runtime/device/gpu/gpuresource.cpp @@ -322,7 +322,7 @@ static uint32_t GetHSAILImageOrderType(gslChannelOrder chOrder, cmSurfFmt format } bool -Resource::create(MemoryType memType, CreateParams* params, bool heap) +Resource::create(MemoryType memType, CreateParams* params) { bool calRes = false; gslMemObject gslResource = 0; @@ -382,7 +382,7 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap) } // Force remote allocation if it was requested in the settings - if (dev().settings().remoteAlloc_ && !heap && + if (dev().settings().remoteAlloc_ && ((memoryType() == Local) || (memoryType() == Persistent))) { if (dev().settings().apuSystem_ && dev().settings().viPlus_) { @@ -515,7 +515,7 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap) if (memoryType() == Local) { cal_.type_ = Persistent; } - else if (!heap && (memoryType() == Persistent)) { + else if (memoryType() == Persistent) { cal_.type_ = RemoteUSWC; } // Remote cacheable to uncacheable @@ -553,11 +553,6 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap) reinterpret_cast(address_) - tmpHost); pinOffset_ = hostMemOffset & 0xff; - //!@note GSL has a problem with the defines for flags and - //! view creation, so check the restriction here - if (!dev().heap()->isVirtual() && (pinOffset_ != 0)) { - return false; - } pinAddress = tmpHost; // Align width to avoid GSL useless assert with a view @@ -629,20 +624,6 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap) calRes = true; } - // Check if it's a heap allocation - if (!dev().heap()->isVirtual()) { - if (viewOwner_ == &dev().globalMem()) { - // Allocation directly from the heap - hbOffset_ = static_cast(view->offset_); - } - else { - // Allocation from another memory object - hbOffset_ = static_cast(view->offset_) + - viewOwner_->hbOffset(); - } - hbSize_ = view->size_; - } - if (viewOwner_->isMemoryType(Pinned)) { address_ = viewOwner_->data() + offset(); } @@ -952,11 +933,9 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap) cal_.tiled_ = (GSL_MOA_TILING_LINEAR != tiling) && (GSL_MOA_TILING_LINEAR_GENERAL != tiling); - // Get the heap block offset if it's a virtual heap - if (dev().heap()->isVirtual()) { - hbOffset_ = gslResource->getSurfaceAddress() - - dev().heap()->baseAddress(); - } + // Get the heap block offset + hbOffset_ = gslResource->getSurfaceAddress() - + dev().heap().baseAddress(); hbSize_ = static_cast(gslResource->getSurfaceSize()); if (!dev().settings().use64BitPtr_ && @@ -1036,32 +1015,6 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap) return true; } -bool -Resource::reallocate(CreateParams* params) -{ - GslResourceReference* old; - GslResourceReference* active; - - old = gslRef_; - if (!create(memoryType(), params)) { - gslRef_ = old; - return false; - } - // Get the new active resource - active = gslRef_; - gslRef_ = old; - - dev().resCopy(old->gslResource(), - active->gslResource(), CAL_MEMCOPY_SYNC); - - // Free all old resources - assert(renames_.size() == 0); - free(); - - gslRef_ = active; - return true; -} - void Resource::free() { @@ -1813,10 +1766,8 @@ Resource::setActiveRename(VirtualGPU& gpu, GslResourceReference* rename) gslRef_ = rename; address_ = rename->cpuAddress_; - if (dev().heap()->isVirtual()) { - hbOffset_ = rename->gslResource()->getSurfaceAddress() - - dev().heap()->baseAddress(); - } + hbOffset_ = rename->gslResource()->getSurfaceAddress() - + dev().heap().baseAddress(); } bool diff --git a/rocclr/runtime/device/gpu/gpuresource.hpp b/rocclr/runtime/device/gpu/gpuresource.hpp index 6430467760..fe4215327f 100644 --- a/rocclr/runtime/device/gpu/gpuresource.hpp +++ b/rocclr/runtime/device/gpu/gpuresource.hpp @@ -209,15 +209,6 @@ public: */ virtual bool create( MemoryType memType, //!< memory type - CreateParams* params = 0, //!< special parameters for resource allocation - bool heap = false //!< Global heap allocation for not VM mode - ); - - /*! \brief Reallocates a CAL object, associated with the resource - * - * \return True if we succesfully reallocated a CAL resource - */ - bool reallocate( CreateParams* params = 0 //!< special parameters for resource allocation ); diff --git a/rocclr/runtime/device/gpu/gpusettings.cpp b/rocclr/runtime/device/gpu/gpusettings.cpp index 8718ad21e4..3f64bdfe8a 100644 --- a/rocclr/runtime/device/gpu/gpusettings.cpp +++ b/rocclr/runtime/device/gpu/gpusettings.cpp @@ -50,10 +50,6 @@ Settings::Settings() maxRenames_ = 16; maxRenameSize_ = 4 * Mi; - // The global heap settings - heapSize_ = GPU_INITIAL_HEAP_SIZE * Mi; - heapSizeGrowth_ = GPU_HEAP_GROWTH_INCREMENT * Mi; - imageSupport_ = false; hwLDSSize_ = 0; diff --git a/rocclr/runtime/device/gpu/gpusettings.hpp b/rocclr/runtime/device/gpu/gpusettings.hpp index 6fe4e974bc..fca4a2c6c4 100644 --- a/rocclr/runtime/device/gpu/gpusettings.hpp +++ b/rocclr/runtime/device/gpu/gpusettings.hpp @@ -82,8 +82,6 @@ public: size_t stagedXferSize_; //!< Staged buffer size uint maxRenames_; //!< Maximum number of possible renames uint maxRenameSize_; //!< Maximum size for all renames - size_t heapSize_; //!< The global heap size - size_t heapSizeGrowth_; //!< The global heap size growth uint hwLDSSize_; //!< HW local data store size uint maxWorkGroupSize_; //!< Requested workgroup size for this device uint hostMemDirectAccess_; //!< Enables direct access to the host memory diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp index e78f63db50..6cc3eae985 100644 --- a/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -517,10 +517,6 @@ VirtualGPU::create( // Fall through ... case Settings::BlitEngineCAL: case Settings::BlitEngineKernel: - if (!dev().heap()->isVirtual()) { - blitSetup.disableReadBufferRect_ = true; - blitSetup.disableWriteBufferRect_ = true; - } // use host blit for HW debug if (dev().settings().enableHwDebug_) { blitSetup.disableCopyImageToBuffer_ = true; @@ -3166,23 +3162,21 @@ VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* waitingE bool VirtualGPU::addVmMemory(const Resource* resource) { - if (dev().heap()->isVirtual()) { - uint* cnt = &cal_.memCount_; - (*cnt)++; - // Reallocate array if kernel uses more memory objects - if (numVmMems_ < *cnt) { - gslMemObject* tmp; - tmp = new gslMemObject [*cnt]; - if (tmp == NULL) { - return false; - } - memcpy(tmp, vmMems_, sizeof(gslMemObject) * numVmMems_); - delete [] vmMems_; - vmMems_ = tmp; - numVmMems_ = *cnt; + uint* cnt = &cal_.memCount_; + (*cnt)++; + // Reallocate array if kernel uses more memory objects + if (numVmMems_ < *cnt) { + gslMemObject* tmp; + tmp = new gslMemObject [*cnt]; + if (tmp == NULL) { + return false; } - vmMems_[*cnt - 1] = resource->gslResource(); + memcpy(tmp, vmMems_, sizeof(gslMemObject) * numVmMems_); + delete [] vmMems_; + vmMems_ = tmp; + numVmMems_ = *cnt; } + vmMems_[*cnt - 1] = resource->gslResource(); return true; } diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp index 32a3ed2c80..86ea275ac5 100644 --- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp +++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp @@ -496,7 +496,7 @@ CALGSLDevice::SetupContext(int32 &asic_id) getAttribs_int(temp_cs); temp_cs->getMemInfo(&m_memInfo, GSL_MEMINFO_BASIC); - m_vmMode = temp_cs->getVMMode(); + assert(temp_cs->getVMMode()); m_adp->deleteContext(temp_cs); @@ -1313,38 +1313,6 @@ CALGSLDevice::PerformDMACopy(gslMemObject srcMem, gslMemObject destMem, cmSurfFm return true; } -void -CALGSLDevice::resCopy(gslMemObject srcRes, gslMemObject dstRes, uint32 flags) const -{ - assert(m_cs != 0); - assert(srcRes != 0); - assert(dstRes != 0); - - //! @note: GSL device isn't thread safe - amd::ScopedLock k(gslDeviceOps()); - - uint64 surfaceSize; - - CopyType type = GetCopyType(srcRes, dstRes, 0, 0, m_allowDMA, 0, surfaceSize, 0, 0); - - if (type == USE_DRMDMA) - { - m_cs->DMACopy(srcRes, 0, dstRes, 0, surfaceSize, GSL_SYNCUPLOAD_SYNC_WAIT, NULL); - m_cs->Flush(); - Wait(m_cs, GSL_DRMDMA_SYNC_ATI, m_mapDMAQuery); - } - else if (type == USE_CPDMA) - { - m_cs->syncUploadRaw(srcRes, 0, dstRes, 0, surfaceSize, 0); - m_cs->Flush(); - Wait(m_cs, GSL_SYNC_ATI, m_mapQuery); - } - else - { - assert(0 && "No copy engine is being used"); - } -} - #define CPDMA_THRESHOLD 131072 CopyType diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.h b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.h index adeecc2006..41eda23143 100644 --- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.h +++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.h @@ -97,14 +97,10 @@ public: const CALdeviceattribs& getAttribs() const { return m_attribs; } const gslMemInfo& getMemInfo() const { return m_memInfo; } - bool isVmMode() const { return m_vmMode; }; - uint32 getVPUMask() const { return m_vpuMask; } bool canDMA() const { return m_canDMA; } gslMemObject m_srcDRMDMAMem, m_dstDRMDMAMem; // memory object of flush buffer, used for DRMDMA flush - void resCopy(gslMemObject srcRes, gslMemObject dstRes, uint32 flags) const; - void PerformAdapterInitialization() const; void PerformFullInitialization() const; @@ -211,7 +207,6 @@ private: uint m_computeRing : 1; uint m_usePerVPUAdapterModel : 1; uint m_PerformLazyDeviceInit : 1; - uint m_vmMode : 1; uint m_isComputeRingIDForced : 1; }; }; diff --git a/rocclr/runtime/device/hsa/hsasettings.cpp b/rocclr/runtime/device/hsa/hsasettings.cpp index bef9ca26ae..5c5a2c2df1 100644 --- a/rocclr/runtime/device/hsa/hsasettings.cpp +++ b/rocclr/runtime/device/hsa/hsasettings.cpp @@ -34,7 +34,6 @@ Settings::Settings() bool Settings::create(bool doublePrecision) { - largeHostMemAlloc_ = true; customHostAllocator_ = true; // Enable extensions diff --git a/rocclr/runtime/utils/flags.hpp b/rocclr/runtime/utils/flags.hpp index fb803782d9..3e4db8c522 100644 --- a/rocclr/runtime/utils/flags.hpp +++ b/rocclr/runtime/utils/flags.hpp @@ -52,12 +52,8 @@ release(cstring, GPU_DEVICE_ORDINAL, "", \ "Select the device ordinal (comma seperated list of available devices)") \ release(bool, REMOTE_ALLOC, false, \ "Use remote memory for the global heap allocation") \ -release(int, GPU_INITIAL_HEAP_SIZE, 16, \ - "Initial size of the GPU heap in MiB") \ release(uint, GPU_MAX_HEAP_SIZE, 100, \ "Set maximum size of the GPU heap to % of board memory") \ -release(int, GPU_HEAP_GROWTH_INCREMENT, 8, \ - "Amount to grow the GPU heap by in MiB") \ release(uint, GPU_STAGING_BUFFER_SIZE, 512, \ "Size of the GPU staging buffer in KiB") \ release(bool, GPU_DUMP_BLIT_KERNELS, false, \