Files
rocm-systems/rocclr/runtime/device/pal/palresource.cpp
T
foreman 58e4bca449 P4 to Git Change 2031605 by gandryey@gera-win10 on 2019/11/15 16:34:08
SWDEV-79445 - OCL generic changes and code clean-up
	- Make sure transfer doesn't exceed CP dma limit

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#81 edit
2019-11-15 16:38:27 -05:00

2213 строки
80 KiB
C++

// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "platform/program.hpp"
#include "platform/kernel.hpp"
#include "os/os.hpp"
#include "device/device.hpp"
#include "utils/flags.hpp"
#include "thread/monitor.hpp"
#include "device/pal/palresource.hpp"
#include "device/pal/paldevice.hpp"
#include "device/pal/palblit.hpp"
#include "device/pal/paltimestamp.hpp"
#include "thread/atomic.hpp"
#include "hsa_ext_image.h"
#ifdef _WIN32
#include <d3d10_1.h>
#include "CL/cl_d3d10.h"
#include "CL/cl_d3d11.h"
#endif // _WIN32
#include <GL/gl.h>
#include "GL/glATIInternal.h"
#include <string>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
namespace pal {
// ================================================================================================
// Registers the wrapped PAL memory object with the device through AddGpuMemoryReferences()
// (flag Pal::GpuMemoryRefCantTrim) when the device setting alwaysResident_ is enabled.
//
// Returns Pal::Result::Success without calling PAL when the setting is off, otherwise the
// result reported by AddGpuMemoryReferences().
Pal::Result GpuMemoryReference::MakeResident() const {
  if (!device_.settings().alwaysResident_) {
    return Pal::Result::Success;
  }
  Pal::GpuMemoryRef residencyRef = {};
  residencyRef.pGpuMemory = gpuMem_;
  return device_.iDev()->AddGpuMemoryReferences(1, &residencyRef, nullptr,
                                                Pal::GpuMemoryRefCantTrim);
}
// ================================================================================================
// Creates a reference that wraps a new PAL GPU memory object described by createInfo.
//
// The reference is allocated through the class's sized operator new (gpuMemSize is the size PAL
// reports for its object) and &memRef[1] is handed to PAL as the placement storage. If the first
// PAL allocation fails, the device resource cache is freed and the allocation is retried once.
//
// Returns the new reference, or nullptr if the size query, the host allocation, the PAL
// allocation or MakeResident() fails. On success the device memory counters are updated, unless
// createInfo.flags.sdiExternal is set.
GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
                                               const Pal::GpuMemoryCreateInfo& createInfo) {
  Pal::Result result;
  size_t gpuMemSize = dev.iDev()->GetGpuMemorySize(createInfo, &result);
  if (result != Pal::Result::Success) {
    return nullptr;
  }
  GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(dev);
  if (memRef == nullptr) {
    // Nothing was allocated: the device memory counters must stay untouched
    return nullptr;
  }
  result = dev.iDev()->CreateGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_);
  if ((result != Pal::Result::Success) &&
      // Free cache if PAL failed allocation
      dev.resourceCache().free()) {
    // If cache was freed, then try to allocate again
    result = dev.iDev()->CreateGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_);
  }
  if (result == Pal::Result::Success) {
    result = memRef->MakeResident();
  }
  if (result != Pal::Result::Success) {
    memRef->release();
    return nullptr;
  }
  if (!createInfo.flags.sdiExternal) {
    // Update free memory size counters
    dev.updateAllocedMemory(createInfo.heaps[0], createInfo.size, false);
  }
  return memRef;
}
// ================================================================================================
// Creates a reference that wraps a PAL pinned (system) memory object described by createInfo.
//
// &memRef[1] is handed to PAL as the placement storage for the pinned memory object.
//
// Returns the new reference, or nullptr if the size query, the host allocation, the PAL call or
// MakeResident() fails. On success the allocation is accounted against
// Pal::GpuHeap::GpuHeapGartCacheable in the device memory counters.
GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
                                               const Pal::PinnedGpuMemoryCreateInfo& createInfo) {
  Pal::Result result;
  size_t gpuMemSize = dev.iDev()->GetPinnedGpuMemorySize(createInfo, &result);
  if (result != Pal::Result::Success) {
    return nullptr;
  }
  GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(dev);
  if (memRef == nullptr) {
    // Nothing was pinned: the device memory counters must stay untouched
    return nullptr;
  }
  result = dev.iDev()->CreatePinnedGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_);
  if (result == Pal::Result::Success) {
    result = memRef->MakeResident();
  }
  if (result != Pal::Result::Success) {
    memRef->release();
    return nullptr;
  }
  // Update free memory size counters
  dev.updateAllocedMemory(Pal::GpuHeap::GpuHeapGartCacheable, createInfo.size, false);
  return memRef;
}
// ================================================================================================
// Creates a reference that wraps a PAL SVM memory object described by createInfo.
//
// &memRef[1] is handed to PAL as the placement storage for the SVM memory object.
//
// Returns the new reference, or nullptr if the size query, the host allocation, the PAL call or
// MakeResident() fails. On success the allocation is accounted against
// Pal::GpuHeap::GpuHeapGartCacheable in the device memory counters.
GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
                                               const Pal::SvmGpuMemoryCreateInfo& createInfo) {
  Pal::Result result;
  size_t gpuMemSize = dev.iDev()->GetSvmGpuMemorySize(createInfo, &result);
  if (result != Pal::Result::Success) {
    return nullptr;
  }
  GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(dev);
  if (memRef == nullptr) {
    // Nothing was allocated: the device memory counters must stay untouched
    return nullptr;
  }
  result = dev.iDev()->CreateSvmGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_);
  if (result == Pal::Result::Success) {
    result = memRef->MakeResident();
  }
  if (result != Pal::Result::Success) {
    memRef->release();
    return nullptr;
  }
  // Update free memory size counters
  dev.updateAllocedMemory(Pal::GpuHeap::GpuHeapGartCacheable, createInfo.size, false);
  return memRef;
}
// ================================================================================================
// Opens an externally shared GPU memory object (openInfo) and wraps it in a new reference.
//
// Returns the new reference, or nullptr when the size query, the host allocation, the PAL open
// call or MakeResident() fails. The device memory counters are not touched by this overload.
GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
                                               const Pal::ExternalGpuMemoryOpenInfo& openInfo) {
  Pal::Result status;
  const size_t palObjectSize = dev.iDev()->GetExternalSharedGpuMemorySize(&status);
  if (status != Pal::Result::Success) {
    return nullptr;
  }

  GpuMemoryReference* memRef = new (palObjectSize) GpuMemoryReference(dev);
  if (memRef == nullptr) {
    return memRef;
  }

  // PAL reports the creation info of the shared allocation here; it isn't used afterwards
  Pal::GpuMemoryCreateInfo sharedInfo = {};
  status = dev.iDev()->OpenExternalSharedGpuMemory(openInfo, &memRef[1], &sharedInfo,
                                                   &memRef->gpuMem_);
  if (status == Pal::Result::Success) {
    status = memRef->MakeResident();
  }
  if (status == Pal::Result::Success) {
    return memRef;
  }

  memRef->release();
  return nullptr;
}
// ================================================================================================
// Opens an externally shared image (openInfo) together with its GPU memory.
//
// dev           - device that owns the PAL objects
// openInfo      - description of the external image to open
// imgCreateInfo - filled by PAL with the creation info of the shared image
// image         - receives the PAL image object; its storage is a char[] block allocated here
//
// Returns the new memory reference, or nullptr on failure. When PAL fails to open the image the
// image storage is freed here and *image is reset to nullptr. When the image was opened but
// MakeResident() fails, the image stays in *image and its cleanup is left to the caller
// (Resource's destructor destroys image_ and delete[]s its storage).
GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
                                               const Pal::ExternalImageOpenInfo& openInfo,
                                               Pal::ImageCreateInfo* imgCreateInfo,
                                               Pal::IImage** image) {
  Pal::Result result;
  size_t gpuMemSize = 0;
  size_t imageSize = 0;
  if (Pal::Result::Success !=
      dev.iDev()->GetExternalSharedImageSizes(openInfo, &imageSize, &gpuMemSize, imgCreateInfo)) {
    return nullptr;
  }
  Pal::GpuMemoryCreateInfo createInfo = {};
  GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(dev);
  if (memRef == nullptr) {
    // Bail out before the image storage is allocated, so nothing can leak
    return nullptr;
  }
  char* imgMem = new char[imageSize];
  result = dev.iDev()->OpenExternalSharedImage(openInfo, imgMem, &memRef[1], &createInfo, image,
                                               &memRef->gpuMem_);
  if (result != Pal::Result::Success) {
    // No image object was handed over, so this is the only place that can free its storage
    delete[] imgMem;
    if (image != nullptr) {
      // Make sure the caller can't pick up a pointer into the freed block
      *image = nullptr;
    }
    memRef->release();
    return nullptr;
  }
  result = memRef->MakeResident();
  if (result != Pal::Result::Success) {
    // The opened image remains in *image and is owned by the caller from here on
    memRef->release();
    return nullptr;
  }
  return memRef;
}
// ================================================================================================
// Opens a peer GPU memory object (openInfo) on this device and wraps it in a new reference.
//
// Returns the new reference, or nullptr when the size query, the host allocation, the PAL open
// call or MakeResident() fails. The device memory counters are not touched by this overload.
GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
                                               const Pal::PeerGpuMemoryOpenInfo& openInfo) {
  Pal::Result status;
  const size_t palObjectSize = dev.iDev()->GetPeerGpuMemorySize(openInfo, &status);
  if (status != Pal::Result::Success) {
    return nullptr;
  }

  GpuMemoryReference* memRef = new (palObjectSize) GpuMemoryReference(dev);
  if (memRef == nullptr) {
    return memRef;
  }

  status = dev.iDev()->OpenPeerGpuMemory(openInfo, &memRef[1], &memRef->gpuMem_);
  if (status == Pal::Result::Success) {
    status = memRef->MakeResident();
  }
  if (status == Pal::Result::Success) {
    return memRef;
  }

  memRef->release();
  return nullptr;
}
// ================================================================================================
// Constructs an empty reference bound to device dev: no PAL memory object (gpuMem_), no CPU
// mapping (cpuAddress_) and no dedicated queue (gpu_) are attached yet. The Create() factories
// fill gpuMem_ afterwards.
GpuMemoryReference::GpuMemoryReference(const Device& dev)
: gpuMem_(nullptr), cpuAddress_(nullptr), device_(dev), gpu_(nullptr) {}
// ================================================================================================
// Detaches the memory from every queue that may still track it, unmaps it if a CPU address is
// cached, gives the size back to the device memory counters (unless the PAL object is flagged
// shared/external/external-physical) and finally destroys the PAL object.
GpuMemoryReference::~GpuMemoryReference() {
  // A reference without a PAL object has nothing to tear down
  if (iMem() == nullptr) {
    return;
  }

  if (gpu_ != nullptr) {
    // Bound to a single queue: release it there under that queue's execution lock
    amd::ScopedLock l(gpu_->execution());
    gpu_->releaseMemory(this);
  } else {
    // Release all memory objects on all virtual GPUs (index 0 is handled separately below)
    Device::ScopedLockVgpus lock(device_);
    for (uint idx = 1; idx < device_.vgpus().size(); ++idx) {
      device_.vgpus()[idx]->releaseMemory(this);
    }
  }

  if (0 != device_.vgpus().size()) {
    assert(device_.vgpus()[0] == device_.xferQueue() && "Wrong transfer queue!");
    // Lock the transfer queue, since it's not handled by ScopedLockVgpus
    amd::ScopedLock k(device_.xferMgr().lockXfer());
    device_.vgpus()[0]->releaseMemory(this);
  }

  // Drop the CPU mapping that is still cached on this reference
  if (cpuAddress_ != nullptr) {
    iMem()->Unmap();
  }

  const auto& memDesc = iMem()->Desc();
  const bool notOwnedHere =
      memDesc.flags.isShared || memDesc.flags.isExternal || memDesc.flags.isExternPhys;
  if (!notOwnedHere) {
    // Update free memory size counters
    device_.updateAllocedMemory(memDesc.preferredHeap, memDesc.size, true);
  }

  iMem()->Destroy();
  gpuMem_ = nullptr;
}
// ================================================================================================
// Constructs a buffer resource descriptor for `size` bytes on device gpuDev.
//
// No GPU memory is allocated here; the descriptor starts with type Empty. The width is stored in
// X32_Uint elements (size rounded up to a whole element) and the format defaults to
// CL_R / CL_FLOAT. The new resource is registered with the device.
Resource::Resource(const Device& gpuDev, size_t size)
    : elementSize_(0),
      gpuDevice_(gpuDev),
      mapCount_(0),
      address_(nullptr),
      offset_(0),
      memRef_(nullptr),
      subOffset_(0),
      viewOwner_(nullptr),
      image_(nullptr),
      hwSrd_(0),
      events_(gpuDev.numOfVgpus()) {
  const auto elementBytes = Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
  const cl_image_format defaultFormat = {CL_R, CL_FLOAT};

  // state_ is cleared before any individual flag is written; keep it first
  desc_.state_ = 0;
  desc_.type_ = Empty;

  // Buffer extent: one row of X32_Uint elements covering the requested size
  desc_.width_ = amd::alignUp(size, elementBytes) / elementBytes;
  desc_.height_ = 1;
  desc_.depth_ = 1;
  desc_.mipLevels_ = 1;
  desc_.format_ = defaultFormat;
  desc_.flags_ = 0;
  desc_.pitch_ = 0;
  desc_.slice_ = 0;

  // Default state flags of a plain buffer
  desc_.cardMemory_ = true;
  desc_.dimSize_ = 1;
  desc_.buffer_ = true;
  desc_.imageArray_ = false;
  desc_.topology_ = CL_MEM_OBJECT_BUFFER;
  desc_.SVMRes_ = false;
  desc_.scratch_ = false;
  desc_.isAllocExecute_ = false;
  desc_.baseLevel_ = 0;
  desc_.gl2CacheDisabled_ = false;

  gpuDev.addResource(this);
}
// ================================================================================================
// Constructs an image resource descriptor on device gpuDev.
//
// width/height/depth - image extent as provided by the caller
// format             - OpenCL image format
// imageType          - OpenCL memory object type; selects dimSize_ and imageArray_
// mipLevels          - number of mip levels stored in the descriptor
//
// No GPU memory is allocated here; the descriptor starts with type Empty. An unrecognized
// imageType is logged and treated as one-dimensional. The new resource is registered with the
// device.
Resource::Resource(const Device& gpuDev, size_t width, size_t height, size_t depth,
                   cl_image_format format, cl_mem_object_type imageType, uint mipLevels)
    : elementSize_(0),
      gpuDevice_(gpuDev),
      mapCount_(0),
      address_(nullptr),
      offset_(0),
      memRef_(nullptr),
      subOffset_(0),
      viewOwner_(nullptr),
      image_(nullptr),
      hwSrd_(0),
      events_(gpuDev.numOfVgpus()) {
  // Translate the OpenCL object type into a dimension count and an "is array" flag
  uint dimensions = 1;
  bool isArray = false;
  switch (imageType) {
    case CL_MEM_OBJECT_IMAGE1D:
    case CL_MEM_OBJECT_IMAGE1D_BUFFER:
      break;
    case CL_MEM_OBJECT_IMAGE1D_ARRAY:
      dimensions = 2;
      isArray = true;
      break;
    case CL_MEM_OBJECT_IMAGE2D:
      dimensions = 2;
      break;
    case CL_MEM_OBJECT_IMAGE2D_ARRAY:
      dimensions = 3;
      isArray = true;
      break;
    case CL_MEM_OBJECT_IMAGE3D:
      dimensions = 3;
      break;
    default:
      LogError("Unknown image type!");
      break;
  }

  // state_ is cleared before any individual flag is written; keep it first
  desc_.state_ = 0;
  desc_.type_ = Empty;
  desc_.width_ = width;
  desc_.height_ = height;
  desc_.depth_ = depth;
  desc_.mipLevels_ = mipLevels;
  desc_.format_ = format;
  desc_.flags_ = 0;
  desc_.pitch_ = 0;
  desc_.slice_ = 0;
  desc_.cardMemory_ = true;
  desc_.buffer_ = false;
  desc_.imageArray_ = isArray;
  desc_.topology_ = imageType;
  desc_.SVMRes_ = false;
  desc_.scratch_ = false;
  desc_.isAllocExecute_ = false;
  desc_.baseLevel_ = 0;
  desc_.gl2CacheDisabled_ = false;
  desc_.dimSize_ = dimensions;

  gpuDev.addResource(this);
}
// ================================================================================================
// Frees the resource, destroys the PAL image when this resource created it, and removes the
// resource from the device's resource list.
Resource::~Resource() {
  free();

  if (image_ != nullptr) {
    // An ImageView with the same element size as its owner borrows the owner's image.
    //! @todo PAL doesn't allow an SRD view creation with different pixel size
    const bool imageCreatedHere =
        (memoryType() != ImageView) || (elementSize() != viewOwner_->elementSize());
    if (imageCreatedHere) {
      image_->Destroy();
      // The image object was placed in a char[] block, release it the same way
      delete[] reinterpret_cast<char*>(image_);
    }
  }

  // Remove the current resource from the global resource list
  gpuDevice_.removeResource(this);
}
// ================================================================================================
// Maps the OpenCL channel data type of `format` to the matching HSA_EXT_IMAGE_CHANNEL_TYPE_*
// value. The table is indexed by the distance from CL_SNORM_INT8; an out-of-range type is only
// caught by the assert.
static uint32_t GetHSAILImageFormatType(const cl_image_format& format) {
  static const uint32_t ChannelTypeTable[] = {
      HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8,
      HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16,
      HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8,
      HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16,
      HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565,
      HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555,
      HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010,
      HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8,
      HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16,
      HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32,
      HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8,
      HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16,
      HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32,
      HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT,
      HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT,
      HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24,
  };

  // Rebase the OpenCL enum so that CL_SNORM_INT8 lands on slot 0
  uint idx = format.image_channel_data_type - CL_SNORM_INT8;
  assert((idx <= (CL_UNORM_INT24 - CL_SNORM_INT8)) && "Out of range format channel!");
  return ChannelTypeTable[idx];
}
// ================================================================================================
// Maps the OpenCL channel order of `format` to the matching HSA_EXT_IMAGE_CHANNEL_ORDER_* value.
// The table is indexed by the distance from CL_R; an out-of-range order is only caught by the
// assert.
static uint32_t GetHSAILImageOrderType(const cl_image_format& format) {
  static const uint32_t ChannelOrderTable[] = {
      HSA_EXT_IMAGE_CHANNEL_ORDER_R,
      HSA_EXT_IMAGE_CHANNEL_ORDER_A,
      HSA_EXT_IMAGE_CHANNEL_ORDER_RG,
      HSA_EXT_IMAGE_CHANNEL_ORDER_RA,
      HSA_EXT_IMAGE_CHANNEL_ORDER_RGB,
      HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA,
      HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA,
      HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB,
      HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY,
      HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE,
      HSA_EXT_IMAGE_CHANNEL_ORDER_RX,
      HSA_EXT_IMAGE_CHANNEL_ORDER_RGX,
      HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX,
      HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH,
      HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL,
      HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB,
      HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX,
      HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA,
      HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA,
      HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR,
  };

  // Rebase the OpenCL enum so that CL_R lands on slot 0
  uint idx = format.image_channel_order - CL_R;
  assert((idx <= (CL_ABGR - CL_R)) && "Out of range format order!");
  return ChannelOrderTable[idx];
}
// ================================================================================================
// Fills the heap selection of createInfo (heapCount, heaps[], peerWritable/busAddressable flags
// and, in non-mainline builds, mallPolicy) from this resource's memory type. Remote, RemoteUSWC
// and ExternalPhysical resources are also marked as not residing in card memory.
void Resource::memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo) {
  switch (memoryType()) {
    case Persistent:
      createInfo->heapCount = 2;
      createInfo->heaps[0] = Pal::GpuHeapLocal;
      createInfo->heaps[1] = Pal::GpuHeapGartUswc;
      createInfo->flags.peerWritable = dev().P2PAccessAllowed();
#ifdef ATI_OS_LINUX
      // Note: SSG in Linux requires DGMA heap
      if (dev().properties().gpuMemoryProperties.busAddressableMemSize > 0) {
        createInfo->flags.busAddressable = true;
      }
#endif
      break;

    case RemoteUSWC:
      createInfo->heapCount = 1;
      createInfo->heaps[0] = Pal::GpuHeapGartUswc;
      desc_.cardMemory_ = false;
      break;

    case Remote:
      createInfo->heapCount = 1;
      createInfo->heaps[0] = Pal::GpuHeapGartCacheable;
      desc_.cardMemory_ = false;
      break;

    case ExternalPhysical:
      desc_.cardMemory_ = false;
      // Intentionally continues with the Local heap selection below
    case Shader:
    case Local:
      // Heap preference order: invisible, local, then USWC system memory
      createInfo->heapCount = 3;
      createInfo->heaps[0] = Pal::GpuHeapInvisible;
      createInfo->heaps[1] = Pal::GpuHeapLocal;
      createInfo->heaps[2] = Pal::GpuHeapGartUswc;
      createInfo->flags.peerWritable = dev().P2PAccessAllowed();
      break;

    default:
      createInfo->heapCount = 1;
      createInfo->heaps[0] = Pal::GpuHeapLocal;
      break;
  }

#if !IS_MAINLINE
  // Pick the appropriate mall policy based on the mem type
  const auto mallMemType = memoryType();
  if ((mallMemType == Local) || (mallMemType == Scratch)) {
    createInfo->mallPolicy = static_cast<Pal::GpuMemMallPolicy>(dev().settings().mallPolicy_);
  } else {
    createInfo->mallPolicy = Pal::GpuMemMallPolicy::Never;
  }
#endif
}
// ================================================================================================
// Creates the PAL objects backing an image resource and fills the hardware SRD (hwState_).
//
// params      - creation parameters; reinterpreted as ImageBufferParams or ImageViewParams
//               depending on memoryType()
// forceLinear - requests Pal::ImageTiling::Linear instead of Optimal for a newly created image
//
// CL_MEM_OBJECT_IMAGE1D_BUFFER resources get a typed buffer view SRD; every other topology gets
// a PAL image plus an image view SRD. Returns true on success and false when a PAL call, the
// memory allocation or the SRD slot allocation fails.
bool Resource::CreateImage(CreateParams* params, bool forceLinear) {
Pal::Result result;
Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, 0, 0};
Pal::SubresRange ImgSubresRange = {ImgSubresId, 1, 1};
Pal::ChannelMapping channels;
Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels);
// 1D image buffers: no PAL image is created, the SRD is a typed buffer view
if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
if (memoryType() == ImageBuffer) {
// View over an existing resource: share its GPU memory and start from its offset
ImageBufferParams* imageBuffer = reinterpret_cast<ImageBufferParams*>(params);
viewOwner_ = imageBuffer->resource_;
memRef_ = viewOwner_->memRef_;
memRef_->retain();
desc_.cardMemory_ = viewOwner_->desc().cardMemory_;
offset_ += viewOwner_->offset_;
} else {
// Standalone 1D image buffer: take memory from the resource cache or allocate new memory
Pal::GpuMemoryCreateInfo createInfo = {};
createInfo.size = desc().width_ * elementSize();
createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment);
createInfo.alignment = MaxGpuAlignment;
createInfo.vaRange = Pal::VaRange::Default;
createInfo.priority = Pal::GpuMemPriority::Normal;
memTypeToHeap(&createInfo);
// createInfo.priority;
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
nullptr, &subOffset_);
if (nullptr == memRef_) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
if (nullptr == memRef_) {
LogError("Failed PAL memory allocation!");
return false;
}
}
offset_ += static_cast<size_t>(subOffset_);
}
// Check if memory is locked already and restore CPU pointer
if (memRef_->cpuAddress_ != nullptr) {
address_ = memRef_->cpuAddress_;
memRef_->cpuAddress_ = nullptr;
mapCount_++;
}
// The buffer view covers the size of the whole PAL allocation
Pal::BufferViewInfo viewInfo = {};
viewInfo.gpuAddr = vmAddress();
viewInfo.range = memRef_->iMem()->Desc().size;
viewInfo.stride = elementSize();
viewInfo.swizzledFormat.format = format;
viewInfo.swizzledFormat.swizzle = channels;
// viewInfo.channels = channels;
hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast<address*>(&hwState_));
if ((0 == hwSrd_) && (memoryType() != ImageView)) {
return false;
}
dev().iDev()->CreateTypedBufferViewSrds(1, &viewInfo, hwState_);
// hwState_[8..11] carry the HSAIL channel type, channel order and the image width
hwState_[8] = GetHSAILImageFormatType(desc().format_);
hwState_[9] = GetHSAILImageOrderType(desc().format_);
hwState_[10] = static_cast<uint32_t>(desc().width_);
hwState_[11] = 0; // one extra reserved field in the argument
return true;
}
// All remaining topologies are described by a PAL image
Pal::ImageViewInfo viewInfo = {};
Pal::ImageCreateInfo imgCreateInfo = {};
Pal::GpuMemoryRequirements req = {};
// Tex2d is the default; the switch below selects Tex3d/Tex1d from the OpenCL topology
imgCreateInfo.imageType = Pal::ImageType::Tex2d;
viewInfo.viewType = Pal::ImageViewType::Tex2d;
viewInfo.possibleLayouts.engines = Pal::LayoutComputeEngine | Pal::LayoutDmaEngine;
viewInfo.possibleLayouts.usages = Pal::LayoutShaderWrite;
imgCreateInfo.extent.width = desc_.width_;
imgCreateInfo.extent.height = desc_.height_;
imgCreateInfo.extent.depth = desc_.depth_;
imgCreateInfo.arraySize = 1;
switch (desc_.topology_) {
case CL_MEM_OBJECT_IMAGE3D:
imgCreateInfo.imageType = Pal::ImageType::Tex3d;
viewInfo.viewType = Pal::ImageViewType::Tex3d;
break;
case CL_MEM_OBJECT_IMAGE1D:
case CL_MEM_OBJECT_IMAGE1D_ARRAY:
case CL_MEM_OBJECT_IMAGE1D_BUFFER:
imgCreateInfo.imageType = Pal::ImageType::Tex1d;
viewInfo.viewType = Pal::ImageViewType::Tex1d;
break;
}
// For 1D arrays the slice count is taken from height_
if (desc_.topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
ImgSubresRange.numSlices = imgCreateInfo.arraySize = desc_.height_;
imgCreateInfo.extent.depth = desc_.height_;
imgCreateInfo.extent.height = 1;
}
// For 2D arrays the slice count is taken from depth_
if (desc_.topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
ImgSubresRange.numSlices = imgCreateInfo.arraySize = desc_.depth_;
}
// Views start from the owner's image and select the requested mip level / array slice
if (memoryType() == ImageView) {
ImageViewParams* imageView = reinterpret_cast<ImageViewParams*>(params);
ImgSubresRange.startSubres.mipLevel = imageView->level_;
desc_.baseLevel_ = imageView->level_;
ImgSubresRange.startSubres.arraySlice = imageView->layer_;
viewOwner_ = imageView->resource_;
image_ = viewOwner_->image_;
} else if (memoryType() == ImageBuffer) {
ImageBufferParams* imageBuffer = reinterpret_cast<ImageBufferParams*>(params);
viewOwner_ = imageBuffer->resource_;
}
if (nullptr != viewOwner_) {
offset_ = viewOwner_->offset();
}
ImgSubresRange.numMips = desc().mipLevels_;
// A new PAL image is created unless this is an ImageView whose element size matches its owner
if ((memoryType() != ImageView) ||
//! @todo PAL doesn't allow an SRD view creation with different pixel size
(elementSize() != viewOwner_->elementSize())) {
imgCreateInfo.usageFlags.shaderRead = true;
imgCreateInfo.usageFlags.shaderWrite =
(format == Pal::ChNumFormat::X8Y8Z8W8_Srgb) ? false : true;
imgCreateInfo.swizzledFormat.format = format;
imgCreateInfo.swizzledFormat.swizzle = channels;
imgCreateInfo.mipLevels = (desc_.mipLevels_) ? desc_.mipLevels_ : 1;
imgCreateInfo.samples = 1;
imgCreateInfo.fragments = 1;
// Tiling: linear for persistent images (when the setting asks for it) and image buffers,
// the owner's tiling for views, otherwise whatever forceLinear selects
Pal::ImageTiling tiling = forceLinear ? Pal::ImageTiling::Linear : Pal::ImageTiling::Optimal;
uint32_t rowPitch = 0;
if (((memoryType() == Persistent) && dev().settings().linearPersistentImage_) ||
(memoryType() == ImageBuffer)) {
tiling = Pal::ImageTiling::Linear;
} else if (memoryType() == ImageView) {
tiling = viewOwner_->image_->GetImageCreateInfo().tiling;
// Find the new pitch in pixels for the new format
rowPitch = viewOwner_->desc().pitch_ * viewOwner_->elementSize() / elementSize();
}
// Image buffers use the owner image's row pitch when it has one, otherwise the width
if (memoryType() == ImageBuffer) {
if ((params->owner_ != NULL) && params->owner_->asImage() &&
(params->owner_->asImage()->getRowPitch() != 0)) {
rowPitch = params->owner_->asImage()->getRowPitch() / elementSize();
} else {
rowPitch = desc().width_;
}
}
desc_.pitch_ = rowPitch;
// Make sure the row pitch is aligned to pixels
imgCreateInfo.rowPitch =
amd::alignUp(elementSize() * rowPitch, dev().info().imagePitchAlignment_);
imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
imgCreateInfo.tiling = tiling;
size_t imageSize = dev().iDev()->GetImageSize(imgCreateInfo, &result);
if (result != Pal::Result::Success) {
return false;
}
// Storage for the PAL image object; ~Resource() releases it with delete[] after Destroy().
// NOTE(review): a plain new[] normally throws instead of returning nullptr, so the check
// below is presumably never false - confirm no nothrow/replaced operator new is in effect.
char* memImg = new char[imageSize];
if (memImg != nullptr) {
result = dev().iDev()->CreateImage(imgCreateInfo, memImg, &image_);
if (result != Pal::Result::Success) {
delete[] memImg;
return false;
}
}
image_->GetGpuMemoryRequirements(&req);
// createInfo.priority;
}
// Standalone images get memory from the resource cache or a new allocation;
// views and image buffers share the owner's memory
if ((memoryType() != ImageView) && (memoryType() != ImageBuffer)) {
Pal::GpuMemoryCreateInfo createInfo = {};
createInfo.size = amd::alignUp(req.size, MaxGpuAlignment);
createInfo.alignment = std::max(req.alignment, MaxGpuAlignment);
createInfo.vaRange = Pal::VaRange::Default;
createInfo.priority = Pal::GpuMemPriority::Normal;
memTypeToHeap(&createInfo);
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
nullptr, &subOffset_);
if (nullptr == memRef_) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
if (nullptr == memRef_) {
LogError("Failed PAL memory allocation!");
return false;
}
}
offset_ += static_cast<size_t>(subOffset_);
} else {
memRef_ = viewOwner_->memRef_;
memRef_->retain();
desc_.cardMemory_ = viewOwner_->desc().cardMemory_;
if (req.size > viewOwner_->iMem()->Desc().size) {
LogWarning("Image is bigger than the original mem object!");
}
}
// Check if memory is locked already and restore CPU pointer
if (memRef_->cpuAddress_ != nullptr) {
address_ = memRef_->cpuAddress_;
memRef_->cpuAddress_ = nullptr;
mapCount_++;
}
// NOTE(review): for an ImageView that reuses the owner's image_ this binds the owner's image
// again - confirm that PAL accepts the repeated bind.
result = image_->BindGpuMemory(memRef_->gpuMem_, offset_);
if (result != Pal::Result::Success) {
return false;
}
// NOTE(review): a zero hwSrd_ is tolerated for ImageView and hwState_ is still written below -
// confirm allocSrdSlot() leaves hwState_ usable in that case.
hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast<address*>(&hwState_));
if ((0 == hwSrd_) && (memoryType() != ImageView)) {
return false;
}
viewInfo.pImage = image_;
viewInfo.swizzledFormat.format = format;
viewInfo.swizzledFormat.swizzle = channels;
viewInfo.subresRange = ImgSubresRange;
dev().iDev()->CreateImageViewSrds(1, &viewInfo, hwState_);
// hwState_[8..11] carry the HSAIL channel type, channel order and the image width
hwState_[8] = GetHSAILImageFormatType(desc().format_);
hwState_[9] = GetHSAILImageOrderType(desc().format_);
hwState_[10] = static_cast<uint32_t>(desc().width_);
hwState_[11] = 0; // one extra reserved field in the argument
return true;
}
// ================================================================================================
// Creates a resource on top of an object shared by another API.
//
// params - reinterpreted as OGLInteropParams when memoryType() is OGLInterop, otherwise (only
//          when ATI_OS_WIN is defined) as D3DInteropParams
//
// Three paths exist: buffers and "misc" planar/packed video formats open the shared memory and,
// for misc formats, create a linear PAL image over one plane; 1D image buffers open the shared
// memory and build a typed buffer view; everything else opens the shared image through PAL.
// Returns true on success and false when a PAL call, the GL association or the SRD slot
// allocation fails.
bool Resource::CreateInterop(CreateParams* params) {
Pal::Result result;
Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, 0, 0};
Pal::SubresRange ImgSubresRange = {ImgSubresId, 1, 1};
Pal::ChannelMapping channels;
Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels);
Pal::ExternalGpuMemoryOpenInfo gpuMemOpenInfo = {};
Pal::ExternalResourceOpenInfo& openInfo = gpuMemOpenInfo.resourceInfo;
uint misc = 0;
uint layer = 0;
uint mipLevel = 0;
InteropType type = InteropTypeless;
if (memoryType() == OGLInterop) {
OGLInteropParams* oglRes = reinterpret_cast<OGLInteropParams*>(params);
assert(oglRes->glPlatformContext_ && "We don't have OGL context!");
// Select the GL attach type that matches the interop object
switch (oglRes->type_) {
case InteropVertexBuffer:
glType_ = GL_RESOURCE_ATTACH_VERTEXBUFFER_AMD;
break;
case InteropRenderBuffer:
glType_ = GL_RESOURCE_ATTACH_RENDERBUFFER_AMD;
break;
case InteropTexture:
case InteropTextureViewLevel:
case InteropTextureViewCube:
glType_ = GL_RESOURCE_ATTACH_TEXTURE_AMD;
break;
default:
LogError("Unknown OGL interop type!");
return false;
break;
}
glPlatformContext_ = oglRes->glPlatformContext_;
layer = oglRes->layer_;
type = oglRes->type_;
mipLevel = oglRes->mipLevel_;
// The association fills the external handle, the GL mem-object resource, offset_ and format_
if (!dev().resGLAssociate(oglRes->glPlatformContext_, oglRes->handle_, glType_,
&openInfo.hExternalResource, &glInteropMbRes_, &offset_, desc_.format_
#ifdef ATI_OS_WIN
,
openInfo.doppDesktopInfo
#endif
)) {
return false;
}
desc_.isDoppTexture_ = (openInfo.doppDesktopInfo.gpuVirtAddr != 0);
// desc_.format_ was passed to resGLAssociate() above, so the PAL format is queried again
format = dev().getPalFormat(desc().format_, &channels);
}
#ifdef ATI_OS_WIN
else {
D3DInteropParams* d3dRes = reinterpret_cast<D3DInteropParams*>(params);
openInfo.hExternalResource = d3dRes->handle_;
misc = d3dRes->misc;
layer = d3dRes->layer_;
type = d3dRes->type_;
mipLevel = d3dRes->mipLevel_;
}
#endif
//! @todo PAL query for image/buffer object doesn't work properly!
#if 0
bool isImage = false;
if (Pal::Result::Success !=
dev().iDev()->DetermineExternalSharedResourceType(openInfo, &isImage)) {
return false;
}
#endif // 0
// Path 1: plain buffers and "misc" formats are opened as raw shared GPU memory
if (desc().buffer_ || misc) {
memRef_ = GpuMemoryReference::Create(dev(), gpuMemOpenInfo);
if (nullptr == memRef_) {
return false;
}
if (misc) {
Pal::ImageCreateInfo imgCreateInfo = {};
Pal::ExternalImageOpenInfo imgOpenInfo = {};
imgOpenInfo.resourceInfo = openInfo;
imgOpenInfo.swizzledFormat.format = format;
imgOpenInfo.swizzledFormat.swizzle = channels;
imgOpenInfo.usage.shaderRead = true;
imgOpenInfo.usage.shaderWrite = true;
size_t imageSize;
size_t gpuMemSize;
// Query the shared image only to obtain its creation info (imgCreateInfo)
if (Pal::Result::Success !=
dev().iDev()->GetExternalSharedImageSizes(imgOpenInfo, &imageSize, &gpuMemSize,
&imgCreateInfo)) {
return false;
}
// Override the reported info with a linear, single-mip 2D image of this resource's extent
Pal::gpusize viewOffset = 0;
imgCreateInfo.flags.shareable = false;
imgCreateInfo.imageType = Pal::ImageType::Tex2d;
imgCreateInfo.extent.width = desc().width_;
imgCreateInfo.extent.height = desc().height_;
imgCreateInfo.extent.depth = desc().depth_;
imgCreateInfo.arraySize = 1;
imgCreateInfo.usageFlags.shaderRead = true;
imgCreateInfo.usageFlags.shaderWrite = true;
imgCreateInfo.swizzledFormat.format = format;
imgCreateInfo.swizzledFormat.swizzle = channels;
imgCreateInfo.mipLevels = 1;
imgCreateInfo.samples = 1;
imgCreateInfo.fragments = 1;
imgCreateInfo.tiling = Pal::ImageTiling::Linear;
imgCreateInfo.depthPitch = desc().height_ * imgCreateInfo.rowPitch;
// Pick the plane offset and pitches from the format class (misc) and the requested layer.
// NOTE(review): layer is unsigned, so the `case -1` labels rely on conversion of -1 to uint;
// some compilers reject this as a narrowing conversion - confirm on all supported toolchains.
switch (misc) {
case 1: // NV12 or P010 formats
switch (layer) {
case -1:
case 0:
break;
case 1:
// Y - plane size to the offset
// NV12 format. UV is 2 times smaller plane Y
viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
break;
default:
LogError("Unknown Interop View Type");
return false;
}
break;
case 2: // YV12 format
switch (layer) {
case -1:
case 0:
break;
case 1:
// Y - plane size to the offset
// YV12 format. U is 4 times smaller plane than Y
viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
imgCreateInfo.rowPitch >>= 1;
break;
case 2:
// Y + U plane sizes to the offset.
// U plane is 4 times smaller than Y and U == V
viewOffset = 5 * imgCreateInfo.rowPitch * desc().height_ / 2;
imgCreateInfo.rowPitch >>= 1;
break;
default:
LogError("Unknown Interop View Type");
return false;
}
imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
break;
case 3: // YUY2 format
imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
break;
default:
LogError("Unknown Interop View Type");
return false;
}
imageSize = dev().iDev()->GetImageSize(imgCreateInfo, &result);
if (result != Pal::Result::Success) {
return false;
}
// Storage for the PAL image object; ~Resource() releases it with delete[] after Destroy()
char* memImg = new char[imageSize];
if (memImg != nullptr) {
result = dev().iDev()->CreateImage(imgCreateInfo, memImg, &image_);
if (result != Pal::Result::Success) {
delete[] memImg;
return false;
}
}
// Bind the image at the selected plane inside the shared allocation
offset_ += static_cast<size_t>(viewOffset);
result = image_->BindGpuMemory(iMem(), offset_);
if (result != Pal::Result::Success) {
return false;
}
hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast<address*>(&hwState_));
if ((0 == hwSrd_) && (memoryType() != ImageView)) {
return false;
}
Pal::ImageViewInfo viewInfo = {};
viewInfo.viewType = Pal::ImageViewType::Tex2d;
viewInfo.pImage = image_;
viewInfo.swizzledFormat.format = format;
viewInfo.swizzledFormat.swizzle = channels;
viewInfo.subresRange = ImgSubresRange;
viewInfo.possibleLayouts.engines = Pal::LayoutComputeEngine | Pal::LayoutDmaEngine;
viewInfo.possibleLayouts.usages = Pal::LayoutShaderWrite;
dev().iDev()->CreateImageViewSrds(1, &viewInfo, hwState_);
// hwState_[8..11] carry the HSAIL channel type, channel order and the image width
hwState_[8] = GetHSAILImageFormatType(desc().format_);
hwState_[9] = GetHSAILImageOrderType(desc().format_);
hwState_[10] = static_cast<uint32_t>(desc().width_);
hwState_[11] = 0; // one extra reserved field in the argument
}
// Path 2: 1D image buffers are shared memory described by a typed buffer view
} else if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
memRef_ = GpuMemoryReference::Create(dev(), gpuMemOpenInfo);
if (nullptr == memRef_) {
return false;
}
Pal::BufferViewInfo viewInfo = {};
viewInfo.gpuAddr = vmAddress();
viewInfo.range = memRef_->iMem()->Desc().size;
viewInfo.stride = elementSize();
viewInfo.swizzledFormat.format = format;
viewInfo.swizzledFormat.swizzle = channels;
hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast<address*>(&hwState_));
if ((0 == hwSrd_) && (memoryType() != ImageView)) {
return false;
}
dev().iDev()->CreateTypedBufferViewSrds(1, &viewInfo, hwState_);
// hwState_[8..11] carry the HSAIL channel type, channel order and the image width
hwState_[8] = GetHSAILImageFormatType(desc().format_);
hwState_[9] = GetHSAILImageOrderType(desc().format_);
hwState_[10] = static_cast<uint32_t>(desc().width_);
hwState_[11] = 0; // one extra reserved field in the argument
// Path 3: every other image is opened directly as a shared PAL image
} else {
Pal::ExternalImageOpenInfo imgOpenInfo = {};
Pal::ImageCreateInfo imgCreateInfo = {};
imgOpenInfo.resourceInfo = openInfo;
imgOpenInfo.swizzledFormat.format = format;
imgOpenInfo.swizzledFormat.swizzle = channels;
imgOpenInfo.usage.shaderRead = true;
imgOpenInfo.usage.shaderWrite = true;
// Fills image_ and imgCreateInfo in addition to returning the memory reference
memRef_ = GpuMemoryReference::Create(dev(), imgOpenInfo, &imgCreateInfo, &image_);
if (nullptr == memRef_) {
return false;
}
hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast<address*>(&hwState_));
if ((0 == hwSrd_) && (memoryType() != ImageView)) {
return false;
}
Pal::ImageViewInfo viewInfo = {};
viewInfo.possibleLayouts.engines = Pal::LayoutComputeEngine | Pal::LayoutDmaEngine;
viewInfo.possibleLayouts.usages = Pal::LayoutShaderWrite;
// The view type follows the image type PAL reported for the shared image (default Tex2d)
viewInfo.viewType = Pal::ImageViewType::Tex2d;
switch (imgCreateInfo.imageType) {
case Pal::ImageType::Tex3d:
viewInfo.viewType = Pal::ImageViewType::Tex3d;
break;
case Pal::ImageType::Tex1d:
viewInfo.viewType = Pal::ImageViewType::Tex1d;
break;
default:
break;
}
viewInfo.pImage = image_;
viewInfo.swizzledFormat.format = format;
viewInfo.swizzledFormat.swizzle = channels;
// Texture views select a mip level and, for cube views, a single face exposed as Tex2d
if ((type == InteropTextureViewLevel) || (type == InteropTextureViewCube)) {
ImgSubresRange.startSubres.mipLevel = mipLevel;
if (type == InteropTextureViewCube) {
ImgSubresRange.startSubres.arraySlice = layer;
viewInfo.viewType = Pal::ImageViewType::Tex2d;
}
}
if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
ImgSubresRange.numSlices = desc_.height_;
}
if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
ImgSubresRange.numSlices = desc_.depth_;
}
ImgSubresRange.numMips = desc().mipLevels_;
viewInfo.subresRange = ImgSubresRange;
dev().iDev()->CreateImageViewSrds(1, &viewInfo, hwState_);
//! It's a workaround for D24S8 format, since PAL doesn't support this format
//! and GSL decompresses 24bit DEPTH into D24S8 for OGL compatibility
if ((desc().format_.image_channel_order == CL_DEPTH_STENCIL) &&
(desc().format_.image_channel_data_type == CL_UNORM_INT24)) {
// Patch bit fields in SRD word 1; the masks differ between gfx10+ and older hardware
if (dev().settings().gfx10Plus_) {
hwState_[1] = (hwState_[1] & ~0x1ff00000) | 0x08d00000;
} else {
hwState_[1] &= ~0x3c000000;
hwState_[1] = (hwState_[1] & ~0x3f00000) | 0x1400000;
}
}
// hwState_[8..11] carry the HSAIL channel type, channel order and the image width
hwState_[8] = GetHSAILImageFormatType(desc().format_);
hwState_[9] = GetHSAILImageOrderType(desc().format_);
hwState_[10] = static_cast<uint32_t>(desc().width_);
hwState_[11] = 0; // one extra reserved field in the argument
}
return true;
}
// ================================================================================================
// Opens the memory of params->svmBase_ as peer GPU memory on this device.
//
// On success the resource is marked as not residing in card memory and inherits the offset of
// the original resource. Returns false when the peer memory can't be opened.
bool Resource::CreateP2PAccess(CreateParams* params) {
  Pal::PeerGpuMemoryOpenInfo peerInfo = {};
  peerInfo.pOriginalMem = params->svmBase_->iMem();

  memRef_ = GpuMemoryReference::Create(dev(), peerInfo);
  if (memRef_ != nullptr) {
    desc_.cardMemory_ = false;
    offset_ = params->svmBase_->offset();
    return true;
  }
  return false;
}
// ================================================================================================
// Pins caller-provided host memory so the GPU can access it directly.
// params is reinterpreted as PinnedParams (host memory reference + size).
// Only buffers and 2D images are accepted; any other topology returns false.
// Returns false as well if the image width/pitch restrictions are not met, the pin
// address isn't page aligned, or PAL fails to create the pinned allocation.
// Note: address_ is updated with the host pointer even when the function fails.
bool Resource::CreatePinned(CreateParams* params) {
  PinnedParams* pinnedParams = reinterpret_cast<PinnedParams*>(params);
  const amd::HostMemoryReference* hostRef = pinnedParams->hostMemRef_;
  size_t pinSize = pinnedParams->size_;

  // The original host pointer is also the default address to pin
  address_ = hostRef->hostMem();
  void* pinBase = address_;

  const bool isBuffer = (desc().topology_ == CL_MEM_OBJECT_BUFFER);
  const bool isImage2D = (desc().topology_ == CL_MEM_OBJECT_IMAGE2D);
  if (!isBuffer && !isImage2D) {
    //! @todo GSL doesn't support pinning with resAlloc_
    return false;
  }

  if (isBuffer) {
    // Align the start down to the pinning boundary (Vista/Win7 limitation)
    const char* hostPtr = reinterpret_cast<const char*>(address_);
    char* alignedPtr = const_cast<char*>(amd::alignDown(hostPtr, PinnedMemoryAlignment));

    // The unaligned head becomes the resource offset and grows the pinned range
    const uint headBytes = static_cast<uint>(hostPtr - alignedPtr);
    offset_ = headBytes;
    pinBase = alignedPtr;
    pinSize += headBytes;
    pinSize = amd::alignUp(pinSize, PinnedMemoryAlignment);
  } else {
    //! @todo: Width has to be aligned for 3D.
    //! Need to be replaced with a compute copy
    // Width must be a multiple of 8 texels and the pitch a multiple of 64 bytes
    const bool widthMisaligned = ((desc().width_ % 0x8) != 0);
    if (widthMisaligned || (((desc().width_ * elementSize()) % 0x40) != 0)) {
      return false;
    }
  }

  if (dev().settings().svmFineGrainSystem_) {
    desc_.SVMRes_ = true;
  }

  // The address handed to PAL has to be page aligned
  const uint64_t pageMask = amd::Os::pageSize() - 1;
  if ((reinterpret_cast<uint64_t>(pinBase) & pageMask) != 0) {
    return false;
  }

  Pal::PinnedGpuMemoryCreateInfo pinInfo = {};
  pinInfo.pSysMem = pinBase;
  pinInfo.size = pinSize;
  pinInfo.vaRange = Pal::VaRange::Default;

  memRef_ = GpuMemoryReference::Create(dev(), pinInfo);
  if (memRef_ == nullptr) {
    LogError("Failed PAL memory allocation!");
    return false;
  }

  desc_.cardMemory_ = false;
  return true;
}
// ================================================================================================
// Allocates the backing memory for an SVM resource.
//   params - creation parameters; svmBase_ supplies the reserved VA owner when svmPtr != 0,
//            and owner_ (if it already has an SVM pointer) receives the final GPU address.
//   svmPtr - nonzero when the allocation must reuse an already reserved GPU VA.
// Remote/RemoteUSWC memory types are allocated as PAL SVM memory (fine grain path);
// every other type is allocated as regular GPU memory in the Svm VA range.
// Returns false if neither the resource cache nor PAL could provide memory.
bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr) {
const bool isFineGrain = (memoryType() == RemoteUSWC) || (memoryType() == Remote);
// The allocation size is rounded up to the device's memory fragment size
size_t allocSize = amd::alignUp(desc().width_ * elementSize_,
dev().properties().gpuMemoryProperties.fragmentSize);
if (isFineGrain) {
Pal::SvmGpuMemoryCreateInfo createInfo = {};
createInfo.isUsedForKernel = desc_.isAllocExecute_;
createInfo.size = allocSize;
createInfo.alignment = MaxGpuAlignment;
// Reuse the VA reserved by the SVM base allocation when a fixed address was requested
if (svmPtr != 0) {
createInfo.flags.useReservedGpuVa = true;
createInfo.pReservedGpuVaOwner = params->svmBase_->iMem();
} else {
createInfo.flags.useReservedGpuVa = false;
createInfo.pReservedGpuVaOwner = nullptr;
}
// The resource cache is consulted only when fine grain system SVM is disabled
if (!dev().settings().svmFineGrainSystem_) {
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
createInfo.pReservedGpuVaOwner, &subOffset_);
}
// NOTE(review): relies on memRef_ being nullptr on entry when the cache lookup is skipped
if (memRef_ == nullptr) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
}
} else {
Pal::GpuMemoryCreateInfo createInfo = {};
createInfo.size = allocSize;
createInfo.alignment = MaxGpuAlignment;
createInfo.vaRange = Pal::VaRange::Svm;
createInfo.priority = Pal::GpuMemPriority::Normal;
if (svmPtr != 0) {
createInfo.flags.useReservedGpuVa = true;
createInfo.pReservedGpuVaOwner = params->svmBase_->iMem();
}
memTypeToHeap(&createInfo);
// Try the resource cache first (looked up with MaxGpuAlignment)
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
createInfo.pReservedGpuVaOwner, &subOffset_);
if (memRef_ == nullptr) {
// A fresh PAL allocation only requests fragment size alignment
createInfo.alignment = dev().properties().gpuMemoryProperties.fragmentSize;
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
}
}
if (nullptr == memRef_) {
LogError("Failed PAL memory allocation!");
return false;
}
desc_.cardMemory_ = false;
// Publish the final GPU virtual address (including any suballocation offset) to the owner,
// but only if the owner already carries a non-null SVM pointer
if ((nullptr != params) && (nullptr != params->owner_) &&
(nullptr != params->owner_->getSvmPtr())) {
params->owner_->setSvmPtr(
reinterpret_cast<void*>(memRef_->iMem()->Desc().gpuVirtAddr + subOffset_));
offset_ += static_cast<size_t>(subOffset_);
}
return true;
}
// ================================================================================================
// Creates the GPU memory backing this resource.
//   memType     - requested memory type; it may be remapped below (Shader, Scratch,
//                 remoteAlloc_ and disablePersistent_ settings).
//   params      - type specific creation parameters (reinterpreted per memory type).
//   forceLinear - forwarded to CreateImage() for non-buffer resources.
// Interop, P2P, pinned and view resources are handled by dedicated paths; images go to
// CreateImage(); SVM buffers go to CreateSvm(); everything else is a plain buffer that is
// taken from the resource cache or allocated from PAL.
// Returns true on success, false if the allocation failed.
bool Resource::create(MemoryType memType, CreateParams* params, bool forceLinear) {
// NOTE(review): imageCreateView, foundCalRef, viewDefined, viewLayer, viewLevel and viewFlags
// are not referenced anywhere else in this function
bool imageCreateView = false;
bool foundCalRef = false;
bool viewDefined = false;
uint viewLayer = 0;
uint viewLevel = 0;
uint viewFlags = 0;
Pal::ChannelMapping channels;
Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels);
// Set the initial offset value for any resource to 0.
// Note: Runtime can call create() more than once, if the initial memory type failed
offset_ = 0;
// This is a thread safe operation
const_cast<Device&>(dev()).initializeHeapResources();
// Shader memory becomes executable fine grain SVM memory when supported, otherwise local
if (memType == Shader) {
if (dev().settings().svmFineGrainSystem_) {
desc_.isAllocExecute_ = true;
desc_.SVMRes_ = true;
memType = RemoteUSWC;
} else {
memType = Local;
}
// force to use remote memory for HW DEBUG or use
// local memory once we determine if FGS is supported
// memType = (!dev().settings().enableHwDebug_) ? Local : RemoteUSWC;
}
// Get the element size
elementSize_ = Pal::Formats::BytesPerPixel(format);
desc_.type_ = memType;
if (memType == Scratch) {
// use local memory for scratch buffer unless it is using HW DEBUG
desc_.type_ = (!dev().settings().enableHwDebug_) ? Local : RemoteUSWC;
desc_.scratch_ = true;
}
// Force remote allocation if it was requested in the settings
if (dev().settings().remoteAlloc_ && ((memoryType() == Local) || (memoryType() == Persistent))) {
if (dev().settings().apuSystem_ && dev().settings().viPlus_) {
desc_.type_ = Remote;
} else {
desc_.type_ = RemoteUSWC;
}
}
// Persistent memory falls back to RemoteUSWC when it's disabled in the settings
if (dev().settings().disablePersistent_ && (memoryType() == Persistent)) {
desc_.type_ = RemoteUSWC;
}
// Memory types with a dedicated creation path return directly from this switch
switch (memoryType()) {
case OGLInterop:
case D3D9Interop:
case D3D10Interop:
case D3D11Interop:
return CreateInterop(params);
case P2PAccess:
return CreateP2PAccess(params);
case Pinned:
return CreatePinned(params);
case View: {
// Save the offset in the global heap
ViewParams* view = reinterpret_cast<ViewParams*>(params);
offset_ = view->offset_;
// Make sure parent was provided
if (nullptr != view->resource_) {
viewOwner_ = view->resource_;
offset_ += viewOwner_->offset();
// If the parent is currently mapped, the view shares its CPU address
if (viewOwner_->data() != nullptr) {
address_ = viewOwner_->data() + view->offset_;
mapCount_++;
}
// The view shares (and retains) the parent's memory reference
memRef_ = viewOwner_->memRef_;
memRef_->retain();
desc_.cardMemory_ = viewOwner_->desc().cardMemory_;
} else {
// A view without a parent is marked as an empty resource
desc_.type_ = Empty;
}
return true;
}
default:
break;
}
// Everything that isn't a buffer is created as an image
if (!desc_.buffer_) {
return CreateImage(params, forceLinear);
}
Pal::gpusize svmPtr = 0;
if ((nullptr != params) && (nullptr != params->owner_) &&
(nullptr != params->owner_->getSvmPtr())) {
svmPtr = reinterpret_cast<Pal::gpusize>(params->owner_->getSvmPtr());
desc_.SVMRes_ = true;
// An SVM pointer value of 1 is treated as "no fixed address"
svmPtr = (svmPtr == 1) ? 0 : svmPtr;
if (params->owner_->getMemFlags() & CL_MEM_SVM_ATOMICS) {
desc_.gl2CacheDisabled_ = true;
}
}
if (desc_.SVMRes_) {
return CreateSvm(params, svmPtr);
}
// Plain buffer allocation
Pal::GpuMemoryCreateInfo createInfo = {};
createInfo.size = desc().width_ * elementSize_;
createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment);
createInfo.alignment = desc().scratch_ ? 64 * Ki : MaxGpuAlignment;
createInfo.vaRange = Pal::VaRange::Default;
createInfo.priority = Pal::GpuMemPriority::Normal;
if (memoryType() == ExternalPhysical) {
// The bus addresses come from the owning amd::Buffer object
cl_bus_address_amd bus_address = (reinterpret_cast<amd::Buffer*>(params->owner_))->busAddress();
createInfo.surfaceBusAddr = bus_address.surface_bus_address;
createInfo.markerBusAddr = bus_address.marker_bus_address;
createInfo.flags.sdiExternal = true;
} else if (memoryType() == BusAddressable) {
createInfo.flags.busAddressable = true;
}
memTypeToHeap(&createInfo);
// createInfo.priority;
// Try to reuse memory from the resource cache before asking PAL for a new allocation
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
nullptr, &subOffset_);
if (nullptr == memRef_) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
if (nullptr == memRef_) {
LogError("Failed PAL memory allocation!");
return false;
}
}
offset_ += static_cast<size_t>(subOffset_);
// Check if memory is locked already and restore CPU pointer
if (memRef_->cpuAddress_ != nullptr) {
address_ = memRef_->cpuAddress_;
memRef_->cpuAddress_ = nullptr;
mapCount_++;
}
return true;
}
// ================================================================================================
// Releases the memory backing this resource. Does nothing if no memory is attached.
// Non-view resources first wait for outstanding GPU events, then the memory is either
// handed to the resource cache or released through palFree(). For images the SRD slot
// is freed as well. memRef_ is reset to nullptr at the end.
void Resource::free() {
if (memRef_ == nullptr) {
return;
}
// Image views, image buffers and views don't wait for GPU events here
const bool wait =
(memoryType() != ImageView) && (memoryType() != ImageBuffer) && (memoryType() != View);
// OCL has to wait, even if resource is placed in the cache, since reallocation can occur
// and resource can be reused on another async queue without a wait on a busy operation
if (wait) {
if (memRef_->gpu_ == nullptr) {
Device::ScopedLockVgpus lock(dev());
// Release all memory objects on all virtual GPUs
// NOTE(review): the loop starts at index 1, so vgpus()[0] is not waited on here - confirm
// that this is intentional
for (uint idx = 1; idx < dev().vgpus().size(); ++idx) {
dev().vgpus()[idx]->waitForEvent(&events_[idx]);
}
} else {
// Memory is associated with a single virtual GPU: wait on that queue only
amd::ScopedLock l(memRef_->gpu_->execution());
memRef_->gpu_->waitForEvent(&events_[memRef_->gpu_->index()]);
}
} else {
// After a view destruction the original object is no longer can be associated with a vgpu
memRef_->gpu_ = nullptr;
}
// Destroy PAL resource
if (iMem() != 0) {
if (mapCount_ != 0 && wait) {
if ((memoryType() != Remote) && (memoryType() != RemoteUSWC)) {
//! @note: This is a workaround for bad applications that don't unmap memory
unmap(nullptr);
} else {
// Delay CPU address unmap until memRef_ destruction
if (!desc_.SVMRes_) {
assert(memRef_->cpuAddress_ == nullptr && "Memref shouldn't have a valid CPU address");
memRef_->cpuAddress_ = address_;
}
}
}
// Add resource to the cache
if (!dev().resourceCache().addGpuMemory(&desc_, memRef_, subOffset_)) {
// Free PAL resource
palFree();
}
}
// Free SRD for images
if (!desc().buffer_) {
dev().srds().freeSrdSlot(hwSrd_);
}
memRef_ = nullptr;
}
// ================================================================================================
// Writes `size` bytes from `data` into this resource at `offset` (relative to the
// resource's own offset) with an update-memory command on the main engine.
//   size         - must be a multiple of 4 bytes (DWORD aligned)
//   waitForEvent - if true, blocks until the transfer is done; otherwise the resource is
//                  marked busy and the event becomes the queue's global GPU event
void Resource::writeRawData(VirtualGPU& gpu, size_t offset, size_t size, const void* data,
                            bool waitForEvent) const {
  GpuEvent updateEvent;

  // The update command works on DWORDs
  assert((size & 3) == 0);
  const uint32_t* payload = reinterpret_cast<const uint32_t*>(data);

  gpu.eventBegin(MainEngine);
  gpu.queue(MainEngine).addCmdMemRef(memRef());
  gpu.iCmd()->CmdUpdateMemory(*iMem(), offset_ + offset, size, payload);
  gpu.eventEnd(MainEngine, updateEvent);

  if (!waitForEvent) {
    // Asynchronous path: track the event on the resource and on the queue
    setBusy(gpu, updateEvent);
    gpu.setGpuEvent(updateEvent, false);
    return;
  }

  //! @note: We don't really have to mark the allocations as busy
  //! if we are waiting for a transfer
  gpu.waitForEvent(&updateEvent);
}
// ================================================================================================
// Picks the unsigned integer PAL channel format that matches the element size in bytes.
// Sizes other than 16, 8, 4 and 2 fall back to the 8-bit single channel format.
static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement) {
  switch (bytesPerElement) {
    case 16:
      return Pal::ChNumFormat::X32Y32Z32W32_Uint;
    case 8:
      return Pal::ChNumFormat::X32Y32_Uint;
    case 4:
      return Pal::ChNumFormat::X32_Uint;
    case 2:
      return Pal::ChNumFormat::X16_Uint;
    default:
      return Pal::ChNumFormat::X8_Uint;
  }
}
// ================================================================================================
// Copies a region of this resource into dstResource with a DMA transfer.
//   srcOrigin/dstOrigin - for an image, the texel origin; for a buffer, [0] is the byte
//                         offset while [1] and [2] carry the row and slice pitch
//                         (0 selects a tightly packed pitch in the buffer<->image paths).
//   size                - region size; size[0] is in bytes for buffer-to-buffer copies.
//   enableCopyRect      - buffer-to-buffer only: use a typed (rectangular) buffer copy.
//   flushDMA            - forwarded to VirtualGPU::setGpuEvent().
//   bytesPerElement     - element size used for the typed buffer copy format.
// Supported combinations: buffer->image, image->buffer and buffer->buffer.
// Returns false (without issuing any command) if a buffer<->image copy violates the
// pitch/offset alignment restrictions checked below; returns true otherwise.
bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
const amd::Coord3D& dstOrigin, const amd::Coord3D& size,
Resource& dstResource, bool enableCopyRect, bool flushDMA,
uint bytesPerElement) const {
GpuEvent event;
// Remember the active engine, since the copy may temporarily switch to SDMA
EngineType activeEngineID = gpu.engineID_;
static const bool waitOnBusyEngine = true;
assert(!(desc().cardMemory_ && dstResource.desc().cardMemory_) && "Unsupported configuraiton!");
uint64_t gpuMemoryOffset = 0;
uint64_t gpuMemoryRowPitch = 0;
uint64_t imageOffsetx = 0;
bool img1Darray = false;
bool img2Darray = false;
// Collect the linear (buffer side) layout for buffer<->image transfers
if (desc().buffer_ && !dstResource.desc().buffer_) {
imageOffsetx = dstOrigin[0] % dstResource.elementSize();
gpuMemoryOffset = srcOrigin[0] + offset();
gpuMemoryRowPitch = (srcOrigin[1]) ? srcOrigin[1] : size[0] * dstResource.elementSize();
img1Darray = (dstResource.desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY);
img2Darray = (dstResource.desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY);
} else if (!desc().buffer_ && dstResource.desc().buffer_) {
imageOffsetx = srcOrigin[0] % elementSize();
gpuMemoryOffset = dstOrigin[0] + dstResource.offset();
gpuMemoryRowPitch = (dstOrigin[1]) ? dstOrigin[1] : size[0] * elementSize();
img1Darray = (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY);
img2Darray = (desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY);
}
if ((desc().buffer_ && !dstResource.desc().buffer_) ||
(!desc().buffer_ && dstResource.desc().buffer_)) {
// sDMA cannot be used for the below conditions
// Make sure linear pitch in bytes is 4 bytes aligned
if (((gpuMemoryRowPitch % 4) != 0) ||
// another DRM restriction... SI has 4 pixels
(gpuMemoryOffset % 4 != 0) || (dev().settings().sdamPageFaultWar_ && (imageOffsetx != 0))) {
return false;
}
}
if (dev().settings().disableSdma_) {
// Make sure compute is done before CP DMA start
gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
} else {
gpu.engineID_ = SdmaEngine;
}
// Wait for the resources, since runtime may use async transfers
wait(gpu, waitOnBusyEngine);
dstResource.wait(gpu, waitOnBusyEngine);
if (gpu.validateSdmaOverlap(*this, dstResource)) {
// Note: PAL should insert a NOP into the command buffer for synchronization
gpu.addBarrier();
}
Pal::ImageLayout imgLayout = {};
gpu.eventBegin(gpu.engineID_);
gpu.queue(gpu.engineID_).addCmdMemRef(memRef());
gpu.queue(gpu.engineID_).addCmdMemRef(dstResource.memRef());
if (desc().buffer_ && !dstResource.desc().buffer_) {
// Buffer -> image
Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, dstResource.desc().baseLevel_, 0};
Pal::MemoryImageCopyRegion copyRegion = {};
copyRegion.imageSubres = ImgSubresId;
copyRegion.imageOffset.x = dstOrigin[0];
copyRegion.imageOffset.y = dstOrigin[1];
copyRegion.imageOffset.z = dstOrigin[2];
copyRegion.imageExtent.width = size[0];
copyRegion.imageExtent.height = size[1];
copyRegion.imageExtent.depth = size[2];
copyRegion.numSlices = 1;
// For image arrays the layer count moves from the extent into numSlices
if (img1Darray) {
copyRegion.numSlices = copyRegion.imageExtent.height;
copyRegion.imageExtent.height = 1;
} else if (img2Darray) {
copyRegion.numSlices = copyRegion.imageExtent.depth;
copyRegion.imageExtent.depth = 1;
}
copyRegion.gpuMemoryOffset = gpuMemoryOffset;
copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch;
// Default depth pitch is computed from the (possibly adjusted) extent height
copyRegion.gpuMemoryDepthPitch = (srcOrigin[2])
? srcOrigin[2]
: copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height;
gpu.iCmd()->CmdCopyMemoryToImage(*iMem(), *dstResource.image_, imgLayout, 1, &copyRegion);
} else if (!desc().buffer_ && dstResource.desc().buffer_) {
// Image -> buffer
Pal::MemoryImageCopyRegion copyRegion = {};
Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, desc().baseLevel_, 0};
copyRegion.imageSubres = ImgSubresId;
copyRegion.imageOffset.x = srcOrigin[0];
copyRegion.imageOffset.y = srcOrigin[1];
copyRegion.imageOffset.z = srcOrigin[2];
copyRegion.imageExtent.width = size[0];
copyRegion.imageExtent.height = size[1];
copyRegion.imageExtent.depth = size[2];
copyRegion.numSlices = 1;
// For image arrays the layer count moves from the extent into numSlices
if (img1Darray) {
copyRegion.numSlices = copyRegion.imageExtent.height;
copyRegion.imageExtent.height = 1;
} else if (img2Darray) {
copyRegion.numSlices = copyRegion.imageExtent.depth;
copyRegion.imageExtent.depth = 1;
}
copyRegion.gpuMemoryOffset = gpuMemoryOffset;
copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch;
copyRegion.gpuMemoryDepthPitch = (dstOrigin[2])
? dstOrigin[2]
: copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height;
gpu.iCmd()->CmdCopyImageToMemory(*image_, imgLayout, *dstResource.iMem(), 1, &copyRegion);
} else {
// Remaining combinations (buffer -> buffer in the supported case)
if (enableCopyRect) {
// Rectangular copy: both sides are described as typed buffers with explicit pitches
Pal::TypedBufferCopyRegion copyRegion = {};
Pal::ChannelMapping channels = {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y,
Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W};
copyRegion.srcBuffer.swizzledFormat.format = ChannelFmt(bytesPerElement);
copyRegion.srcBuffer.swizzledFormat.swizzle = channels;
copyRegion.srcBuffer.offset = srcOrigin[0] + offset();
copyRegion.srcBuffer.rowPitch = srcOrigin[1];
copyRegion.srcBuffer.depthPitch = srcOrigin[2];
// Width is converted from bytes to elements
copyRegion.extent.width = size[0] / bytesPerElement;
copyRegion.extent.height = size[1];
copyRegion.extent.depth = size[2];
copyRegion.dstBuffer.swizzledFormat.format = ChannelFmt(bytesPerElement);
copyRegion.dstBuffer.swizzledFormat.swizzle = channels;
copyRegion.dstBuffer.offset = dstOrigin[0] + dstResource.offset();
copyRegion.dstBuffer.rowPitch = dstOrigin[1];
copyRegion.dstBuffer.depthPitch = dstOrigin[2];
gpu.iCmd()->CmdCopyTypedBuffer(*iMem(), *dstResource.iMem(), 1, &copyRegion);
} else {
// Linear copy of size[0] bytes
Pal::MemoryCopyRegion copyRegion = {};
copyRegion.srcOffset = srcOrigin[0] + offset();
copyRegion.dstOffset = dstOrigin[0] + dstResource.offset();
copyRegion.copySize = size[0];
// Upper bound for a single copy command when SDMA is disabled
constexpr size_t CpCopySizeLimit = (1 << 26) - sizeof(uint64_t);
if (dev().settings().disableSdma_ && (size[0] > CpCopySizeLimit)) {
// Split the transfer into chunks of at most CpCopySizeLimit bytes
size_t orgSize = size[0];
copyRegion.copySize = CpCopySizeLimit;
do {
gpu.iCmd()->CmdCopyMemory(*iMem(), *dstResource.iMem(), 1, &copyRegion);
copyRegion.srcOffset += CpCopySizeLimit;
copyRegion.dstOffset += CpCopySizeLimit;
orgSize -= (orgSize > CpCopySizeLimit) ? CpCopySizeLimit : orgSize;
// The last chunk copies only the remaining bytes
if (orgSize < CpCopySizeLimit) {
copyRegion.copySize = orgSize;
}
} while (orgSize > 0);
} else {
gpu.iCmd()->CmdCopyMemory(*iMem(), *dstResource.iMem(), 1, &copyRegion);
}
}
}
if (dev().settings().disableSdma_) {
// Make sure CP dma is done
gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
}
gpu.eventEnd(gpu.engineID_, event);
// Mark source and destination as busy
setBusy(gpu, event);
dstResource.setBusy(gpu, event);
// Update the global GPU event
gpu.setGpuEvent(event, flushDMA);
// Restore the original engine
gpu.engineID_ = activeEngineID;
return true;
}
// ================================================================================================
// Records gpuEvent as the latest GPU operation on this resource for the given queue.
// For a view, the event is propagated up the chain of parent resources as well.
void Resource::setBusy(VirtualGPU& gpu, GpuEvent gpuEvent) const {
  addGpuEvent(gpu, gpuEvent);

  if (viewOwner_ == nullptr) {
    return;
  }
  // Views share memory with the parent, so the parent becomes busy too
  viewOwner_->setBusy(gpu, gpuEvent);
}
// ================================================================================================
// Waits for the last GPU event recorded on this resource for the given queue.
//   waitOnBusyEngine - when true, the wait is skipped if the event was issued on the
//                      engine that is currently active on the queue; when false, the
//                      wait is unconditional.
// For a view, the parent resource is waited on with the same policy.
void Resource::wait(VirtualGPU& gpu, bool waitOnBusyEngine) const {
  GpuEvent* lastEvent = getGpuEvent(gpu);

  // The wait can be skipped only for an engine-aware wait on the same engine
  const bool skipWait = waitOnBusyEngine && (lastEvent->engineId_ == gpu.engineID_);
  if (!skipWait) {
    gpu.waitForEvent(lastEvent);
  }

  // A view has to honor operations on its parent as well
  if (viewOwner_ == nullptr) {
    return;
  }
  viewOwner_->wait(gpu, waitOnBusyEngine);
}
// ================================================================================================
// Copies data from host memory into this resource through a CPU mapping.
//   hostPtr    - source data in host memory
//   origin     - destination origin (byte offset for the 1D path, texels otherwise)
//   size       - region size (bytes for buffers, texels for images)
//   flags      - map flags forwarded to map()
//   rowPitch   - source row pitch in bytes; 0 means tightly packed rows
//   slicePitch - source slice pitch in bytes; 0 means tightly packed slices
// Returns false if the resource couldn't be mapped.
bool Resource::hostWrite(VirtualGPU* gpu, const void* hostPtr, const amd::Coord3D& origin,
                         const amd::Coord3D& size, uint flags, size_t rowPitch, size_t slicePitch) {
  // 1D image arrays keep the layer range in the Y component
  const bool layersInY = (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY);
  const size_t firstLayer = layersInY ? origin[1] : origin[2];
  const size_t layerCount = layersInY ? size[1] : size[2];

  // Get a CPU pointer to the GPU memory
  void* mapped = map(gpu, flags, firstLayer, layerCount);
  if (mapped == nullptr) {
    LogError("Couldn't map GPU memory for host write");
    return false;
  }

  if (desc().dimSize_ == 1) {
    // Single linear copy
    const size_t byteCount = desc().buffer_ ? size[0] : size[0] * elementSize_;
    void* dstStart = static_cast<void*>(static_cast<char*>(mapped) + origin[0]);
    amd::Os::fastMemcpy(dstStart, hostPtr, byteCount);
  } else {
    const size_t lineBytes = size[0] * elementSize_;

    // Fall back to tightly packed host pitches if they weren't specified
    if (rowPitch == 0) {
      rowPitch = size[0] * elementSize_;
    }
    if (slicePitch == 0) {
      slicePitch = size[0] * size[1] * elementSize_;
    }

    // Byte offset of the first destination texel (X, then Y, then Z contribution)
    size_t dstBase = origin[0] * elementSize_;
    dstBase += desc().pitch_ * origin[1] * elementSize_;
    dstBase += desc().slice_ * origin[2] * elementSize_;

    address dstBytes = reinterpret_cast<address>(mapped);
    const_address srcBytes = reinterpret_cast<const_address>(hostPtr);

    // Walk the region slice by slice, line by line
    for (size_t slice = 0; slice < size[2]; ++slice) {
      const size_t dstSlice = dstBase + slice * desc().slice_ * elementSize_;
      const size_t srcSlice = slice * slicePitch;
      for (size_t row = 0; row < size[1]; ++row) {
        const size_t dstOffs = dstSlice + row * (desc().pitch_ * elementSize_);
        const size_t srcOffs = srcSlice + row * rowPitch;
        amd::Os::fastMemcpy(dstBytes + dstOffs, srcBytes + srcOffs, lineBytes);
      }
    }
  }

  // Release the CPU mapping
  unmap(gpu);
  return true;
}
// ================================================================================================
// Copies data from this resource into host memory through a read-only CPU mapping.
//   hostPtr    - destination in host memory
//   origin     - source origin (byte offset for the 1D path, texels otherwise)
//   size       - region size (bytes for buffers, texels for images)
//   rowPitch   - destination row pitch in bytes; 0 means tightly packed rows
//   slicePitch - destination slice pitch in bytes; 0 means tightly packed slices
// Returns false if the resource couldn't be mapped.
bool Resource::hostRead(VirtualGPU* gpu, void* hostPtr, const amd::Coord3D& origin,
                        const amd::Coord3D& size, size_t rowPitch, size_t slicePitch) {
  // 1D image arrays keep the layer range in the Y component
  const bool layersInY = (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY);
  const size_t firstLayer = layersInY ? origin[1] : origin[2];
  const size_t layerCount = layersInY ? size[1] : size[2];

  // Get a CPU pointer to the GPU memory
  void* mapped = map(gpu, ReadOnly, firstLayer, layerCount);
  if (mapped == nullptr) {
    LogError("Couldn't map GPU memory for host read");
    return false;
  }

  if (desc().dimSize_ == 1) {
    // Single linear copy
    const size_t byteCount = desc().buffer_ ? size[0] : size[0] * elementSize_;
    void* srcStart = static_cast<void*>(static_cast<char*>(mapped) + origin[0]);
    amd::Os::fastMemcpy(hostPtr, srcStart, byteCount);
  } else {
    const size_t lineBytes = size[0] * elementSize_;

    // Fall back to tightly packed host pitches if they weren't specified
    if (rowPitch == 0) {
      rowPitch = size[0] * elementSize_;
    }
    if (slicePitch == 0) {
      slicePitch = size[0] * size[1] * elementSize_;
    }

    // Byte offset of the first source texel (X, then Y, then Z contribution)
    size_t srcBase = origin[0] * elementSize_;
    srcBase += desc().pitch_ * origin[1] * elementSize_;
    srcBase += desc().slice_ * origin[2] * elementSize_;

    address dstBytes = reinterpret_cast<address>(hostPtr);
    const_address srcBytes = reinterpret_cast<const_address>(mapped);

    // Walk the region slice by slice, line by line
    for (size_t slice = 0; slice < size[2]; ++slice) {
      const size_t srcSlice = srcBase + slice * desc().slice_ * elementSize_;
      const size_t dstSlice = slice * slicePitch;
      for (size_t row = 0; row < size[1]; ++row) {
        const size_t srcOffs = srcSlice + row * (desc().pitch_ * elementSize_);
        const size_t dstOffs = dstSlice + row * rowPitch;
        amd::Os::fastMemcpy(dstBytes + dstOffs, srcBytes + srcOffs, lineBytes);
      }
    }
  }

  // Release the CPU mapping
  unmap(gpu);
  return true;
}
// ================================================================================================
// Maps the PAL memory object into the CPU address space.
//   pitch    - [out] row pitch in elements: derived from the subresource layout for images,
//              or the resource width when there is no PAL image object
//   flags    - map flags (not used by this implementation)
//   resource - PAL memory object to map
// Returns the CPU address, or nullptr if the mapping failed or if the resource lives in
// card memory that can't be mapped directly (unimplemented path).
void* Resource::gpuMemoryMap(size_t* pitch, uint flags, Pal::IGpuMemory* resource) const {
  if (desc_.cardMemory_ && !isPersistentDirectMap()) {
    // @todo remove const cast
    Unimplemented();
    return nullptr;
    // return const_cast<Device&>(dev()).resMapLocal(*pitch, resource, flags);
  } else {
    amd::ScopedLock lk(dev().lockPAL());
    void* address = nullptr;
    if (image_ != nullptr) {
      // Images report the real row pitch of the first subresource, converted to elements
      constexpr Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, 0, 0};
      Pal::SubresLayout layout;
      image_->GetSubresourceLayout(ImgSubresId, &layout);
      *pitch = layout.rowPitch / elementSize();
    } else {
      // Without an image object the pitch is the width. Previously this assignment was
      // unconditional and silently discarded the image pitch computed above.
      *pitch = desc().width_;
    }
    if (Pal::Result::Success == resource->Map(&address)) {
      return address;
    } else {
      LogError("PAL GpuMemory->Map() failed!");
      return nullptr;
    }
  }
}
// ================================================================================================
// Removes the CPU mapping of the PAL memory object. A failed unmap is only logged.
// Card memory that can't be mapped directly takes the unimplemented path.
void Resource::gpuMemoryUnmap(Pal::IGpuMemory* resource) const {
  const bool localOnly = desc_.cardMemory_ && !isPersistentDirectMap();
  if (localOnly) {
    // @todo remove const cast
    Unimplemented();
    // const_cast<Device&>(dev()).resUnmapLocal(resource);
    return;
  }

  if (resource->Unmap() != Pal::Result::Success) {
    LogError("PAL GpuMemory->Unmap() failed!");
  }
}
// ================================================================================================
// Acquires the GL interop object for use by this resource.
// Resources that aren't OpenGL interop objects report success without doing anything.
bool Resource::glAcquire() {
  if (desc().type_ != OGLInterop) {
    return true;
  }
  return dev().resGLAcquire(glPlatformContext_, glInteropMbRes_, glType_);
}
// ================================================================================================
// Releases the GL interop object previously acquired for this resource.
// Resources that aren't OpenGL interop objects report success without doing anything.
bool Resource::glRelease() {
  if (desc().type_ != OGLInterop) {
    return true;
  }
  return dev().resGLRelease(glPlatformContext_, glInteropMbRes_, glType_);
}
// ================================================================================================
void Resource::addGpuEvent(const VirtualGPU& gpu, GpuEvent event) const {
uint idx = gpu.index();
assert(idx < events_.size());
events_[idx] = event;
}
// ================================================================================================
// Returns a pointer to the per-queue event slot that belongs to the given virtual GPU.
GpuEvent* Resource::getGpuEvent(const VirtualGPU& gpu) const {
  // Every virtual GPU owns the slot at its queue index
  const uint idx = gpu.index();
  assert((idx < events_.size()) && "Undeclared queue access!");

  GpuEvent* slot = &events_[idx];
  return slot;
}
// ================================================================================================
// Updates the "modified" state of this resource for the given queue.
// For a view, the same state is propagated up the chain of parent resources.
void Resource::setModified(VirtualGPU& gpu, bool modified) const {
  const uint idx = gpu.index();
  assert(idx < events_.size());

  events_[idx].modified_ = modified;

  if (viewOwner_ == nullptr) {
    return;
  }
  // Keep the parent in sync with the view
  viewOwner_->setModified(gpu, modified);
}
// ================================================================================================
// Reports whether this resource is flagged as modified for the given queue.
// A view is also considered modified if any of its parent resources is.
bool Resource::isModified(VirtualGPU& gpu) const {
  const uint idx = gpu.index();
  assert(idx < events_.size());

  const bool ownState = events_[idx].modified_;
  if (viewOwner_ == nullptr) {
    return ownState;
  }

  // The parent is always queried, regardless of the view's own state
  const bool parentState = viewOwner_->isModified(gpu);
  return ownState || parentState;
}
// ================================================================================================
void Resource::palFree() const {
if (desc().type_ == OGLInterop) {
amd::ScopedLock lk(dev().lockPAL());
dev().resGLFree(glPlatformContext_, glInteropMbRes_, glType_);
}
memRef_->release();
}
// ================================================================================================
// Checks whether this resource has the given memory type. A view that doesn't match
// directly defers the question to its parent resource.
bool Resource::isMemoryType(MemoryType memType) const {
  if (memoryType() == memType) {
    return true;
  }
  if (memoryType() != View) {
    return false;
  }
  // Views inherit the memory type from their owner
  return viewOwner_->isMemoryType(memType);
}
// ================================================================================================
// Reports whether the resource can be mapped directly: it must be persistent memory,
// have fewer than 3 dimensions and must not be an image array. Tiled resources
// additionally require Windows and the linearPersistentImage_ setting to be off.
bool Resource::isPersistentDirectMap() const {
  if (memoryType() != Resource::Persistent) {
    return false;
  }
  if (!(desc().dimSize_ < 3)) {
    return false;
  }
  if (desc().imageArray_) {
    return false;
  }

  // Untiled resources need no further validation
  if (!desc().tiled_) {
    return true;
  }

  //!@note IOL for Linux doesn't support tiling aperture
  // and runtime doesn't force linear images in persistent
  return IS_WINDOWS && !dev().settings().linearPersistentImage_;
}
// ================================================================================================
// Maps the resource for CPU access and returns the CPU address.
//   gpu        - queue used to wait for pending GPU operations (may be nullptr)
//   flags      - map flags; NoWait skips the wait for pending GPU operations
//   startLayer - first layer to map (layered resources only)
//   numLayers  - number of layers to map (layered resources only)
// Maps are reference counted: only the first map creates the CPU mapping, later calls
// return the cached address. Pinned resources return the pinned host address directly.
// Returns nullptr if the mapping failed.
void* Resource::map(VirtualGPU* gpu, uint flags, uint startLayer, uint numLayers) {
  const bool waitForGpu = !(flags & NoWait) && (gpu != nullptr);

  if (isMemoryType(Pinned)) {
    // Pinned memory is always CPU accessible, so only the wait is necessary
    if (waitForGpu) {
      wait(*gpu);
    }
    return address_;
  }

  // Check if we have to wait
  if (waitForGpu) {
    wait(*gpu);
  }

  // Check if memory wasn't mapped yet
  if (++mapCount_ == 1) {
    if ((desc().dimSize_ == 3) || desc().imageArray_ ||
        ((desc().type_ == ImageView) && viewOwner_->mipMapped())) {
      // Save map info for multilayer map/unmap
      startLayer_ = startLayer;
      numLayers_ = numLayers;
      mapFlags_ = flags;
      // Map with layers
      address_ = mapLayers(gpu, flags);
    } else {
      // Map current resource
      if (memRef_->cpuAddress_ != nullptr) {
        // Suballocations are mapped by the memory suballocator
        address_ = reinterpret_cast<uint8_t*>(memRef_->cpuAddress_) + subOffset_;
      } else {
        // Apply the resource offset only to a valid mapping. Adding it to a failed
        // (null) result would produce a bogus non-null address and hide the failure.
        void* base = gpuMemoryMap(&desc_.pitch_, flags, iMem());
        if (base != nullptr) {
          address_ = reinterpret_cast<address>(base) + offset_;
        } else {
          address_ = nullptr;
        }
      }
      if (address_ == nullptr) {
        LogError("cal::ResMap failed!");
        --mapCount_;
        return nullptr;
      }
    }
  }

  //! \note the atomic operation with counter doesn't
  // guarantee that the address will be valid,
  // since PAL could still process the first map
  if (address_ == nullptr) {
    for (uint i = 0; address_ == nullptr && i < 10; ++i) {
      amd::Os::sleep(1);
    }
    assert((address_ != nullptr) && "Multiple maps failed!");
  }

  return address_;
}
// ================================================================================================
// Layered mapping isn't implemented for this backend: reports the unimplemented path
// and always returns nullptr.
void* Resource::mapLayers(VirtualGPU* gpu, uint flags) {
  Unimplemented();

  void* const noMapping = nullptr;
  return noMapping;
}
// ================================================================================================
// Releases one CPU mapping reference. The real unmap happens only when the last
// reference is dropped. Pinned resources are never unmapped. An unbalanced unmap is
// logged and the map counter is restored.
void Resource::unmap(VirtualGPU* gpu) {
  if (isMemoryType(Pinned)) {
    return;
  }

  // Drop one map reference
  const int remaining = --mapCount_;

  if (remaining < 0) {
    // More unmaps than maps: undo the decrement
    LogError("dev().serialCalResUnmap failed!");
    ++mapCount_;
    return;
  }
  if (remaining != 0) {
    // Other users still hold the mapping
    return;
  }

  // Last unmap: release the actual CPU mapping
  const bool layered = (desc().dimSize_ == 3) || desc().imageArray_ ||
      ((desc().type_ == ImageView) && viewOwner_->mipMapped());
  if (layered) {
    unmapLayers(gpu);
  } else {
    gpuMemoryUnmap(iMem());
  }
  address_ = nullptr;
}
// ================================================================================================
// Layered unmapping isn't implemented for this backend: reports the unimplemented path.
void Resource::unmapLayers(VirtualGPU* gpu) {
  Unimplemented();
}
// ================================================================================================
// Creates a buddy allocator for the given memory chunk and registers the pair in heaps_.
// On failure the chunk reference is released, the allocator is destroyed and false
// is returned.
bool MemorySubAllocator::InitAllocator(GpuMemoryReference* mem_ref) {
  MemBuddyAllocator* buddy =
      new MemBuddyAllocator(device_, device_->settings().subAllocationChunkSize_,
                            device_->settings().subAllocationMinSize_);

  // All steps must succeed: allocation, initialization and registration of the heap
  const bool registered = (buddy != nullptr) && (buddy->Init() == Pal::Result::Success) &&
      heaps_.insert({mem_ref, buddy}).second;
  if (registered) {
    return true;
  }

  mem_ref->release();
  delete buddy;
  return false;
}
// ================================================================================================
// Allocates a new chunk of invisible GPU heap memory for suballocations and attaches a
// buddy allocator to it. The chunk size comes from the subAllocationChunkSize_ setting.
//   reserved_va - not used by this allocator
// Returns true if the chunk was created and registered.
bool MemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) {
  Pal::GpuMemoryCreateInfo createInfo = {};
  createInfo.size = device_->settings().subAllocationChunkSize_;
  createInfo.alignment = device_->properties().gpuMemoryProperties.fragmentSize;
  createInfo.vaRange = Pal::VaRange::Default;
  createInfo.priority = Pal::GpuMemPriority::Normal;
  createInfo.heapCount = 1;
  createInfo.heaps[0] = Pal::GpuHeapInvisible;
  // Peer GPUs may write into the chunk only if P2P access is allowed on this device.
  // (Stray non-compiling text that trailed this statement was removed.)
  createInfo.flags.peerWritable = device_->P2PAccessAllowed();
  GpuMemoryReference* mem_ref = GpuMemoryReference::Create(*device_, createInfo);
  if (mem_ref != nullptr) {
    return InitAllocator(mem_ref);
  }
  return false;
}
// ================================================================================================
// Allocates a new chunk in the SVM VA range (invisible heap first, then local heap) for
// suballocations and attaches a buddy allocator to it.
//   reserved_va - optional owner of a reserved GPU VA range; when provided, the chunk is
//                 created at that reserved address
// Returns true if the chunk was created and registered.
bool CoarseMemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) {
  Pal::GpuMemoryCreateInfo createInfo = {};
  createInfo.size = device_->settings().subAllocationChunkSize_;
  createInfo.alignment = device_->properties().gpuMemoryProperties.fragmentSize;
  createInfo.vaRange = Pal::VaRange::Svm;
  createInfo.priority = Pal::GpuMemPriority::Normal;
  createInfo.flags.useReservedGpuVa = (reserved_va != nullptr);
  createInfo.pReservedGpuVaOwner = reserved_va;
  createInfo.heapCount = 2;
  createInfo.heaps[0] = Pal::GpuHeapInvisible;
  // (Stray non-compiling text that trailed this statement was removed.)
  createInfo.heaps[1] = Pal::GpuHeapLocal;
  GpuMemoryReference* mem_ref = GpuMemoryReference::Create(*device_, createInfo);
  if (mem_ref != nullptr) {
    return InitAllocator(mem_ref);
  }
  return false;
}
// ================================================================================================
// Allocates a new SVM memory chunk for suballocations, attaches a buddy allocator to it
// and maps the whole chunk for CPU access.
//   reserved_va - optional owner of a reserved GPU VA range to place the chunk at
// Returns true only if the chunk was created, registered and has a valid CPU address.
bool FineMemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) {
  Pal::SvmGpuMemoryCreateInfo svmInfo = {};
  svmInfo.size = device_->settings().subAllocationChunkSize_;
  svmInfo.alignment = MaxGpuAlignment;
  svmInfo.isUsedForKernel = false;
  svmInfo.pReservedGpuVaOwner = reserved_va;
  svmInfo.flags.useReservedGpuVa = (reserved_va != nullptr);

  GpuMemoryReference* mem_ref = GpuMemoryReference::Create(*device_, svmInfo);
  if (mem_ref == nullptr) {
    return false;
  }
  if (!InitAllocator(mem_ref)) {
    return false;
  }

  // The chunk stays mapped; success is judged by the resulting CPU address
  mem_ref->iMem()->Map(&mem_ref->cpuAddress_);
  return mem_ref->cpuAddress_ != nullptr;
}
// ================================================================================================
//! Creates a new SVM chunk with the gl2Uncached flag set for fine-grain suballocations,
//! registers it with this suballocator and maps it for CPU access.
//!
//! \param reserved_va Optional owner of a reserved GPU VA range; nullptr if none.
//! \return true only if the chunk was created, InitAllocator() accepted it and the chunk
//!         has a non-null CPU address after the Map() call.
bool FineUncachedMemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) {
  Pal::SvmGpuMemoryCreateInfo createInfo = {};
  createInfo.isUsedForKernel = false;
  createInfo.size = device_->settings().subAllocationChunkSize_;
  createInfo.alignment = MaxGpuAlignment;
  createInfo.flags.useReservedGpuVa = (reserved_va != nullptr);
  createInfo.pReservedGpuVaOwner = reserved_va;
  createInfo.flags.gl2Uncached = true;

  GpuMemoryReference* chunk = GpuMemoryReference::Create(*device_, createInfo);
  // Bail out if either the memory object or its allocator couldn't be set up
  if ((chunk == nullptr) || !InitAllocator(chunk)) {
    return false;
  }

  // The Map() status isn't inspected; success is judged by the resulting CPU pointer
  chunk->iMem()->Map(&chunk->cpuAddress_);
  return chunk->cpuAddress_ != nullptr;
}
// ================================================================================================
//! Tears down every chunk still tracked by this suballocator.
MemorySubAllocator::~MemorySubAllocator() {
  // For each tracked chunk: release the memory reference (the key) and destroy the
  // allocator object that was paired with it (the value)
  for (auto heap = heaps_.begin(); heap != heaps_.end(); ++heap) {
    heap->first->release();
    delete heap->second;
  }
}
// ================================================================================================
//! Tries to carve a suballocation out of one of the tracked chunks, creating a new chunk
//! if none of the existing ones can satisfy the request.
//!
//! \param size        Requested size; rounded up to subAllocationMinSize_ before allocating.
//! \param alignment   Requested alignment; must not exceed the device fragment size.
//! \param reserved_va Optional reserved VA owner. When set, only a chunk with the same GPU
//!                    virtual address is considered.
//! \param offset      Passed through to the chunk allocator's Allocate() call.
//! \return The memory reference of the chunk that served the request, or nullptr if the
//!         request isn't eligible for suballocation or couldn't be satisfied.
GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize alignment,
                                                 const Pal::IGpuMemory* reserved_va,
                                                 Pal::gpusize* offset) {
  // Requests that are too large or too strictly aligned are not suballocated
  if ((size >= device_->settings().subAllocationMaxSize_) ||
      (alignment > device_->properties().gpuMemoryProperties.fragmentSize)) {
    return nullptr;
  }

  size = amd::alignUp(size, device_->settings().subAllocationMinSize_);

  // Two passes: the first over the existing chunks, the second including the chunk
  // created at the end of the first pass
  for (uint pass = 0; pass < 2; ++pass) {
    for (auto heap = heaps_.begin(); heap != heaps_.end(); ++heap) {
      GpuMemoryReference* chunk = heap->first;
      MemBuddyAllocator* chunk_allocator = heap->second;

      // SVM allocations may require a fixed VA, so skip chunks located at a different VA
      const bool va_mismatch = (reserved_va != nullptr) &&
          (reserved_va->Desc().gpuVirtAddr != chunk->iMem()->Desc().gpuVirtAddr);
      if (va_mismatch) {
        continue;
      }

      // The first chunk that can hold the request wins
      if (Pal::Result::Success == chunk_allocator->Allocate(size, alignment, offset)) {
        return chunk;
      }
    }

    // No chunk could serve the request, so add a fresh one.
    // Note: this also runs after an unsuccessful second pass, right before giving up
    if (!CreateChunk(reserved_va)) {
      return nullptr;
    }
  }

  return nullptr;
}
// ================================================================================================
//! Returns a suballocation to its chunk and drops the chunk once it becomes empty.
//!
//! \param monitor Lock held while the chunk table is inspected and modified.
//! \param ref     Memory reference of the chunk the suballocation is expected to live in.
//! \param offset  Offset of the suballocation, forwarded to the chunk allocator's Free().
//! \return false if \p ref isn't one of the chunks tracked here, true otherwise.
bool MemorySubAllocator::Free(amd::Monitor* monitor, GpuMemoryReference* ref, Pal::gpusize offset) {
  bool chunk_retired = false;
  {
    amd::ScopedLock lock(monitor);

    // Only memory references registered as chunks are handled here
    auto heap = heaps_.find(ref);
    if (heap == heaps_.end()) {
      return false;
    }

    auto* chunk_allocator = heap->second;
    chunk_allocator->Free(offset);

    // Once nothing is left in the chunk, destroy its allocator and stop tracking it
    if (chunk_allocator->IsEmpty()) {
      delete chunk_allocator;
      heaps_.erase(heap);
      chunk_retired = true;
    }
  }

  // The memory reference itself is released after the lock scope has ended
  if (chunk_retired) {
    ref->release();
  }
  return true;
}
// ================================================================================================
//! Empties the resource cache on destruction.
ResourceCache::~ResourceCache() {
  // free() is invoked without explicit arguments
  free();
}
// ================================================================================================
//! \note the cache works in FILO mode
//! Hands a no longer needed GPU memory object back to the runtime: either as a freed
//! suballocation or as a new entry at the front of the resource cache.
//!
//! \param desc   Descriptor of the resource that owned the memory.
//! \param ref    The GPU memory reference being returned.
//! \param offset Offset of the resource inside \p ref, used when it is a suballocation.
//! \return true if the memory was freed as a suballocation or stored in the cache,
//!         false if the caller still owns it.
bool ResourceCache::addGpuMemory(Resource::Descriptor* desc, GpuMemoryReference* ref,
                                 Pal::gpusize offset) {
  const size_t memSize = ref->iMem()->Desc().size;
  const bool isLocal = (desc->type_ == Resource::Local);

  // Let the matching suballocator try to take the memory back first
  bool subAllocFreed = false;
  if (isLocal) {
    if (desc->SVMRes_) {
      subAllocFreed = mem_sub_alloc_coarse_.Free(&lockCacheOps_, ref, offset);
    } else {
      subAllocFreed = mem_sub_alloc_local_.Free(&lockCacheOps_, ref, offset);
    }
  } else if (desc->SVMRes_) {
    if (desc->gl2CacheDisabled_) {
      subAllocFreed = mem_sub_alloc_fine_uncached_.Free(&lockCacheOps_, ref, offset);
    } else {
      subAllocFreed = mem_sub_alloc_fine_.Free(&lockCacheOps_, ref, offset);
    }
  }

  // Suballocations never go into the cache
  if (subAllocFreed) {
    return true;
  }

  // Only non-SVM resources of selected types, smaller than the cache limit, are cached
  const bool cacheableType = isLocal || (desc->type_ == Resource::Persistent) ||
      (desc->type_ == Resource::Remote) || (desc->type_ == Resource::RemoteUSWC);
  if (!cacheableType || (memSize >= cacheSizeLimit_) || desc->SVMRes_) {
    return false;
  }

  // Evict entries from the back of the cache until the new one fits under the limit
  while ((cacheSize_ + memSize) > cacheSizeLimit_) {
    removeLast();
  }

  Resource::Descriptor* cachedDesc = new Resource::Descriptor;
  if (cachedDesc == nullptr) {
    return false;
  }

  // The cache keeps its own copy of the descriptor
  memcpy(cachedDesc, desc, sizeof(Resource::Descriptor));

  amd::ScopedLock lock(&lockCacheOps_);
  // Newest entries live at the front of the list
  resCache_.push_front({cachedDesc, ref});
  ref->gpu_ = nullptr;
  cacheSize_ += memSize;
  if (isLocal) {
    lclCacheSize_ += memSize;
  }
  return true;
}
// ================================================================================================
//! Looks for GPU memory that can back a new resource, first through the suballocators and
//! then in the list of cached memory objects. The whole lookup runs under lockCacheOps_.
//!
//! \param desc        Descriptor of the resource that needs memory.
//! \param size        Requested size.
//! \param alignment   Requested alignment of the GPU virtual address.
//! \param reserved_va Optional reserved VA owner, forwarded to the suballocators.
//! \param offset      Forwarded to the suballocators' Allocate() call.
//! \return A memory reference that can be used for the resource, or nullptr if neither a
//!         suballocation nor a cached entry fits.
GpuMemoryReference* ResourceCache::findGpuMemory(Resource::Descriptor* desc, Pal::gpusize size,
                                                 Pal::gpusize alignment,
                                                 const Pal::IGpuMemory* reserved_va,
                                                 Pal::gpusize* offset) {
  amd::ScopedLock l(&lockCacheOps_);
  GpuMemoryReference* ref = nullptr;
  // Check if the runtime can suballocate memory
  if ((desc->type_ == Resource::Local) && !desc->SVMRes_) {
    ref = mem_sub_alloc_local_.Allocate(size, alignment, reserved_va, offset);
  } else if ((desc->type_ == Resource::Local) && desc->SVMRes_) {
    ref = mem_sub_alloc_coarse_.Allocate(size, alignment, reserved_va, offset);
  } else if (desc->SVMRes_) {
    if (desc->gl2CacheDisabled_) {
      ref = mem_sub_alloc_fine_uncached_.Allocate(size, alignment, reserved_va, offset);
    } else {
      ref = mem_sub_alloc_fine_.Allocate(size, alignment, reserved_va, offset);
    }
  }
  if (ref != nullptr) {
    return ref;
  }
  // Early exit if resource is too big or is an SVM resource (those are never cached)
  if (size >= cacheSizeLimit_ || desc->SVMRes_) {
    //! \note we may need to free the cache here to reduce memory pressure
    return ref;
  }
  // Search for a matching resource in the cache list, starting from the front.
  // An explicit iterator is used so the match can be unlinked with erase(): that avoids
  // a second value-based scan of the list with a reference to the element being removed.
  for (auto it = resCache_.begin(); it != resCache_.end(); ++it) {
    Resource::Descriptor* entry = it->first;
    size_t sizeRes = it->second->iMem()->Desc().size;
    // An entry is reusable if type, flags and execute attribute match, the request fits
    // but uses more than half of the cached memory, and the VA satisfies the alignment
    if ((entry->type_ == desc->type_) && (entry->flags_ == desc->flags_) && (size <= sizeRes) &&
        (size > (sizeRes >> 1)) && ((it->second->iMem()->Desc().gpuVirtAddr % alignment) == 0) &&
        (entry->isAllocExecute_ == desc->isAllocExecute_)) {
      ref = it->second;
      cacheSize_ -= sizeRes;
      if (entry->type_ == Resource::Local) {
        lclCacheSize_ -= sizeRes;
      }
      // The cached descriptor copy is owned by the cache entry
      delete entry;
      // Remove the found entry from the cache; the iterator isn't used afterwards
      resCache_.erase(it);
      break;
    }
  }
  return ref;
}
// ================================================================================================
//! Releases every entry held in the resource cache, provided the cache currently holds
//! more than \p minCacheEntries entries.
//!
//! \param minCacheEntries The cache is left untouched unless it contains more entries
//!                        than this value.
//! \return true if the cache was flushed, false if it was left untouched.
bool ResourceCache::free(size_t minCacheEntries) {
  bool result = false;
  if (minCacheEntries < resCache_.size()) {
    result = true;
    // Clear the cache. The loop is driven by the list itself rather than by the byte
    // counter: narrowing cacheSize_ to int would misreport caches of 2GB and more, and
    // removeLast() must never be called on an empty list
    while (!resCache_.empty()) {
      removeLast();
    }
    CondLog((cacheSize_ != 0), "Incorrect size for cache release!");
  }
  return result;
}
// ================================================================================================
void ResourceCache::removeLast() {
std::pair<Resource::Descriptor*, GpuMemoryReference*> entry;
{
// Protect access to the global data
amd::ScopedLock l(&lockCacheOps_);
entry = resCache_.back();
resCache_.pop_back();
cacheSize_ -= entry.second->iMem()->Desc().size;
if (entry.first->type_ == Resource::Local) {
lclCacheSize_ -= entry.second->iMem()->Desc().size;
}
// Delete Descriptor
delete entry.first;
}
// Destroy PAL resource
entry.second->release();
}
} // namespace pal