// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. // #include "platform/program.hpp" #include "platform/kernel.hpp" #include "os/os.hpp" #include "device/device.hpp" #include "utils/flags.hpp" #include "thread/monitor.hpp" #include "device/pal/palresource.hpp" #include "device/pal/paldevice.hpp" #include "device/pal/palblit.hpp" #include "device/pal/paltimestamp.hpp" #include "thread/atomic.hpp" #include "hsa_ext_image.h" #ifdef _WIN32 #include #include "CL/cl_d3d10.h" #include "CL/cl_d3d11.h" #endif // _WIN32 #include #include "GL/glATIInternal.h" #include #include #include #include #include namespace pal { // ================================================================================================ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev, const Pal::GpuMemoryCreateInfo& createInfo) { Pal::Result result; size_t gpuMemSize = dev.iDev()->GetGpuMemorySize(createInfo, &result); if (result != Pal::Result::Success) { return nullptr; } GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(dev); if (memRef != nullptr) { result = dev.iDev()->CreateGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_); if ((result != Pal::Result::Success) && // Free cache if PAL failed allocation dev.resourceCache().free()) { // If cache was freed, then try to allocate again result = dev.iDev()->CreateGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_); } if (result != Pal::Result::Success) { memRef->release(); return nullptr; } } if (!createInfo.flags.sdiExternal) { // Update free memory size counters dev.updateAllocedMemory(createInfo.heaps[0], createInfo.size, false); } return memRef; } // ================================================================================================ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev, const Pal::PinnedGpuMemoryCreateInfo& createInfo) { Pal::Result result; size_t gpuMemSize = dev.iDev()->GetPinnedGpuMemorySize(createInfo, &result); if (result != Pal::Result::Success) { return nullptr; } GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(dev); Pal::VaRange vaRange = Pal::VaRange::Default; if (memRef != nullptr) { result = dev.iDev()->CreatePinnedGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_); if (result != Pal::Result::Success) { memRef->release(); return nullptr; } } // Update free memory size counters dev.updateAllocedMemory(Pal::GpuHeap::GpuHeapGartCacheable, createInfo.size, false); return memRef; } // ================================================================================================ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev, const Pal::SvmGpuMemoryCreateInfo& createInfo) { Pal::Result result; size_t gpuMemSize = dev.iDev()->GetSvmGpuMemorySize(createInfo, &result); if (result != Pal::Result::Success) { return nullptr; } GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(dev); if (memRef != nullptr) { result = dev.iDev()->CreateSvmGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_); if (result != Pal::Result::Success) { memRef->release(); return nullptr; } } // Update free memory size counters dev.updateAllocedMemory(Pal::GpuHeap::GpuHeapGartCacheable, createInfo.size, false); return memRef; } // ================================================================================================ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev, const Pal::ExternalGpuMemoryOpenInfo& openInfo) { Pal::Result result; size_t gpuMemSize = dev.iDev()->GetExternalSharedGpuMemorySize(&result); if (result != Pal::Result::Success) { return nullptr; } Pal::GpuMemoryCreateInfo createInfo = {}; GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(dev); if (memRef != nullptr) { result = dev.iDev()->OpenExternalSharedGpuMemory(openInfo, &memRef[1], &createInfo, &memRef->gpuMem_); if (result != Pal::Result::Success) { memRef->release(); return nullptr; } } return memRef; } // ================================================================================================ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev, const Pal::ExternalImageOpenInfo& openInfo, Pal::ImageCreateInfo* imgCreateInfo, Pal::IImage** image) { Pal::Result result; size_t gpuMemSize = 0; size_t imageSize = 0; if (Pal::Result::Success != dev.iDev()->GetExternalSharedImageSizes(openInfo, &imageSize, &gpuMemSize, imgCreateInfo)) { return nullptr; } Pal::GpuMemoryCreateInfo createInfo = {}; GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(dev); char* imgMem = new char[imageSize]; if (memRef != nullptr) { result = dev.iDev()->OpenExternalSharedImage(openInfo, imgMem, &memRef[1], &createInfo, image, &memRef->gpuMem_); if (result != Pal::Result::Success) { memRef->release(); return nullptr; } } return memRef; } // ================================================================================================ GpuMemoryReference::GpuMemoryReference(const Device& dev) : gpuMem_(nullptr), cpuAddress_(nullptr), device_(dev), gpu_(nullptr) {} // ================================================================================================ GpuMemoryReference::~GpuMemoryReference() { if (gpu_ == nullptr) { Device::ScopedLockVgpus lock(device_); // Release all memory objects on all virtual GPUs for (uint idx = 1; idx < device_.vgpus().size(); ++idx) { device_.vgpus()[idx]->releaseMemory(this); } } else { amd::ScopedLock l(gpu_->execution()); gpu_->releaseMemory(this); } if (device_.vgpus().size() != 0) { assert(device_.vgpus()[0] == device_.xferQueue() && "Wrong transfer queue!"); // Lock the transfer queue, since it's not handled by ScopedLockVgpus amd::ScopedLock k(device_.xferMgr().lockXfer()); device_.vgpus()[0]->releaseMemory(this); } // Destroy PAL object if it's not a suballocation if (cpuAddress_ != nullptr) { iMem()->Unmap(); } if (0 != iMem()) { if (!(iMem()->Desc().flags.isShared || iMem()->Desc().flags.isExternal || iMem()->Desc().flags.isExternPhys)) { // Update free memory size counters device_.updateAllocedMemory(iMem()->Desc().preferredHeap, iMem()->Desc().size, true); } iMem()->Destroy(); gpuMem_ = nullptr; } } // ================================================================================================ Resource::Resource(const Device& gpuDev, size_t size) : elementSize_(0), gpuDevice_(gpuDev), mapCount_(0), address_(nullptr), offset_(0), memRef_(nullptr), subOffset_(0), viewOwner_(nullptr), image_(nullptr), hwSrd_(0), events_(gpuDev.numOfVgpus()) { // Fill resource descriptor fields desc_.state_ = 0; desc_.type_ = Empty; desc_.width_ = amd::alignUp(size, Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) / Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint); desc_.height_ = 1; desc_.depth_ = 1; desc_.mipLevels_ = 1; desc_.format_.image_channel_order = CL_R; desc_.format_.image_channel_data_type = CL_FLOAT; desc_.flags_ = 0; desc_.pitch_ = 0; desc_.slice_ = 0; desc_.cardMemory_ = true; desc_.dimSize_ = 1; desc_.buffer_ = true; desc_.imageArray_ = false; desc_.topology_ = CL_MEM_OBJECT_BUFFER; desc_.SVMRes_ = false; desc_.scratch_ = false; desc_.isAllocExecute_ = false; desc_.baseLevel_ = 0; gpuDev.addResource(this); } // ================================================================================================ Resource::Resource(const Device& gpuDev, size_t width, size_t height, size_t depth, cl_image_format format, cl_mem_object_type imageType, uint mipLevels) : elementSize_(0), gpuDevice_(gpuDev), mapCount_(0), address_(nullptr), offset_(0), memRef_(nullptr), subOffset_(0), viewOwner_(nullptr), image_(nullptr), hwSrd_(0), events_(gpuDev.numOfVgpus()) { // Fill resource descriptor fields desc_.state_ = 0; desc_.type_ = Empty; desc_.width_ = width; desc_.height_ = height; desc_.depth_ = depth; desc_.mipLevels_ = mipLevels; desc_.format_ = format; desc_.flags_ = 0; desc_.pitch_ = 0; desc_.slice_ = 0; desc_.cardMemory_ = true; desc_.buffer_ = false; desc_.imageArray_ = false; desc_.topology_ = imageType; desc_.SVMRes_ = false; desc_.scratch_ = false; desc_.isAllocExecute_ = false; desc_.baseLevel_ = 0; switch (imageType) { case CL_MEM_OBJECT_IMAGE2D: desc_.dimSize_ = 2; break; case CL_MEM_OBJECT_IMAGE3D: desc_.dimSize_ = 3; break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: desc_.dimSize_ = 3; desc_.imageArray_ = true; break; case CL_MEM_OBJECT_IMAGE1D: desc_.dimSize_ = 1; break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: desc_.dimSize_ = 2; desc_.imageArray_ = true; break; case CL_MEM_OBJECT_IMAGE1D_BUFFER: desc_.dimSize_ = 1; break; default: desc_.dimSize_ = 1; LogError("Unknown image type!"); break; } gpuDev.addResource(this); } // ================================================================================================ Resource::~Resource() { free(); if ((nullptr != image_) && ((memoryType() != ImageView) || //! @todo PAL doesn't allow an SRD view creation with different pixel size (elementSize() != viewOwner_->elementSize()))) { image_->Destroy(); delete[] reinterpret_cast(image_); } // Remove the current resource from the global resource list gpuDevice_.removeResource(this); } // ================================================================================================ static uint32_t GetHSAILImageFormatType(const cl_image_format& format) { static const uint32_t FormatType[] = {HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8, HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16, HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8, HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16, HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565, HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555, HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010, HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8, HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16, HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32, HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT, HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT, HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24}; uint idx = format.image_channel_data_type - CL_SNORM_INT8; assert((idx <= (CL_UNORM_INT24 - CL_SNORM_INT8)) && "Out of range format channel!"); return FormatType[idx]; } // ================================================================================================ static uint32_t GetHSAILImageOrderType(const cl_image_format& format) { static const uint32_t OrderType[] = {HSA_EXT_IMAGE_CHANNEL_ORDER_R, HSA_EXT_IMAGE_CHANNEL_ORDER_A, HSA_EXT_IMAGE_CHANNEL_ORDER_RG, HSA_EXT_IMAGE_CHANNEL_ORDER_RA, HSA_EXT_IMAGE_CHANNEL_ORDER_RGB, HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA, HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA, HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB, HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY, HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE, HSA_EXT_IMAGE_CHANNEL_ORDER_RX, HSA_EXT_IMAGE_CHANNEL_ORDER_RGX, HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX, HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH, HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL, HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB, HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX, HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA, HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA, HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR}; uint idx = format.image_channel_order - CL_R; assert((idx <= (CL_ABGR - CL_R)) && "Out of range format order!"); return OrderType[idx]; } // ================================================================================================ void Resource::memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo) { createInfo->heapCount = 1; switch (memoryType()) { case Persistent: createInfo->heaps[0] = Pal::GpuHeapLocal; #ifdef ATI_OS_LINUX // Note: SSG in Linux requires DGMA heap if (dev().properties().gpuMemoryProperties.busAddressableMemSize > 0) { createInfo->flags.busAddressable = true; } #endif break; case RemoteUSWC: createInfo->heaps[0] = Pal::GpuHeapGartUswc; desc_.cardMemory_ = false; break; case Remote: createInfo->heaps[0] = Pal::GpuHeapGartCacheable; desc_.cardMemory_ = false; break; case ExternalPhysical: desc_.cardMemory_ = false; case Shader: // Fall through to process the memory allocation ... case Local: createInfo->heapCount = 2; createInfo->heaps[0] = Pal::GpuHeapInvisible; createInfo->heaps[1] = Pal::GpuHeapLocal; break; default: createInfo->heaps[0] = Pal::GpuHeapLocal; break; } } // ================================================================================================ bool Resource::CreateImage(CreateParams* params) { Pal::Result result; Pal::SubresId ImgSubresId = { Pal::ImageAspect::Color, 0, 0 }; Pal::SubresRange ImgSubresRange = { ImgSubresId, 1, 1 }; Pal::ChannelMapping channels; Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels); if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) { if (memoryType() == ImageBuffer) { ImageBufferParams* imageBuffer = reinterpret_cast(params); viewOwner_ = imageBuffer->resource_; memRef_ = viewOwner_->memRef_; memRef_->retain(); desc_.cardMemory_ = viewOwner_->desc().cardMemory_; offset_ += viewOwner_->offset_; } else { Pal::GpuMemoryCreateInfo createInfo = {}; createInfo.size = desc().width_ * elementSize(); createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment); createInfo.alignment = MaxGpuAlignment; createInfo.vaRange = Pal::VaRange::Default; createInfo.priority = Pal::GpuMemPriority::Normal; memTypeToHeap(&createInfo); // createInfo.priority; memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment, nullptr, &subOffset_); if (nullptr == memRef_) { memRef_ = GpuMemoryReference::Create(dev(), createInfo); if (nullptr == memRef_) { LogError("Failed PAL memory allocation!"); return false; } } offset_ += static_cast(subOffset_); } // Check if memory is locked already and restore CPU pointer if (memRef_->cpuAddress_ != nullptr) { address_ = memRef_->cpuAddress_; memRef_->cpuAddress_ = nullptr; mapCount_++; } Pal::BufferViewInfo viewInfo = {}; viewInfo.gpuAddr = vmAddress(); viewInfo.range = memRef_->iMem()->Desc().size; viewInfo.stride = elementSize(); viewInfo.swizzledFormat.format = format; viewInfo.swizzledFormat.swizzle = channels; // viewInfo.channels = channels; hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); if ((0 == hwSrd_) && (memoryType() != ImageView)) { return false; } dev().iDev()->CreateTypedBufferViewSrds(1, &viewInfo, hwState_); hwState_[8] = GetHSAILImageFormatType(desc().format_); hwState_[9] = GetHSAILImageOrderType(desc().format_); hwState_[10] = static_cast(desc().width_); hwState_[11] = 0; // one extra reserved field in the argument return true; } Pal::ImageViewInfo viewInfo = {}; Pal::ImageCreateInfo imgCreateInfo = {}; Pal::GpuMemoryRequirements req = {}; imgCreateInfo.imageType = Pal::ImageType::Tex2d; viewInfo.viewType = Pal::ImageViewType::Tex2d; viewInfo.possibleLayouts.engines = Pal::LayoutComputeEngine | Pal::LayoutDmaEngine; viewInfo.possibleLayouts.usages = Pal::LayoutShaderWrite; imgCreateInfo.extent.width = desc_.width_; imgCreateInfo.extent.height = desc_.height_; imgCreateInfo.extent.depth = desc_.depth_; imgCreateInfo.arraySize = 1; switch (desc_.topology_) { case CL_MEM_OBJECT_IMAGE3D: imgCreateInfo.imageType = Pal::ImageType::Tex3d; viewInfo.viewType = Pal::ImageViewType::Tex3d; break; case CL_MEM_OBJECT_IMAGE1D: case CL_MEM_OBJECT_IMAGE1D_ARRAY: case CL_MEM_OBJECT_IMAGE1D_BUFFER: imgCreateInfo.imageType = Pal::ImageType::Tex1d; viewInfo.viewType = Pal::ImageViewType::Tex1d; break; } if (desc_.topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { ImgSubresRange.numSlices = imgCreateInfo.arraySize = desc_.height_; imgCreateInfo.extent.depth = desc_.height_; imgCreateInfo.extent.height = 1; } if (desc_.topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY) { ImgSubresRange.numSlices = imgCreateInfo.arraySize = desc_.depth_; } if (memoryType() == ImageView) { ImageViewParams* imageView = reinterpret_cast(params); ImgSubresRange.startSubres.mipLevel = imageView->level_; desc_.baseLevel_ = imageView->level_; ImgSubresRange.startSubres.arraySlice = imageView->layer_; viewOwner_ = imageView->resource_; image_ = viewOwner_->image_; } else if (memoryType() == ImageBuffer) { ImageBufferParams* imageBuffer = reinterpret_cast(params); viewOwner_ = imageBuffer->resource_; } if (nullptr != viewOwner_) { offset_ = viewOwner_->offset(); } ImgSubresRange.numMips = desc().mipLevels_; if ((memoryType() != ImageView) || //! @todo PAL doesn't allow an SRD view creation with different pixel size (elementSize() != viewOwner_->elementSize())) { imgCreateInfo.usageFlags.shaderRead = true; imgCreateInfo.usageFlags.shaderWrite = (format == Pal::ChNumFormat::X8Y8Z8W8_Srgb) ? false : true; imgCreateInfo.swizzledFormat.format = format; imgCreateInfo.swizzledFormat.swizzle = channels; imgCreateInfo.mipLevels = (desc_.mipLevels_) ? desc_.mipLevels_ : 1; imgCreateInfo.samples = 1; imgCreateInfo.fragments = 1; Pal::ImageTiling tiling = Pal::ImageTiling::Optimal; uint32_t rowPitch = 0; if (((memoryType() == Persistent) && dev().settings().linearPersistentImage_) || (memoryType() == ImageBuffer)) { tiling = Pal::ImageTiling::Linear; } else if (memoryType() == ImageView) { tiling = viewOwner_->image_->GetImageCreateInfo().tiling; // Find the new pitch in pixels for the new format rowPitch = viewOwner_->desc().pitch_ * viewOwner_->elementSize() / elementSize(); } if (memoryType() == ImageBuffer) { if ((params->owner_ != NULL) && params->owner_->asImage() && (params->owner_->asImage()->getRowPitch() != 0)) { rowPitch = params->owner_->asImage()->getRowPitch() / elementSize(); } else { rowPitch = desc().width_; } } desc_.pitch_ = rowPitch; // Make sure the row pitch is aligned to pixels imgCreateInfo.rowPitch = amd::alignUp(elementSize() * rowPitch, dev().info().imagePitchAlignment_); imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_; imgCreateInfo.tiling = tiling; size_t imageSize = dev().iDev()->GetImageSize(imgCreateInfo, &result); if (result != Pal::Result::Success) { return false; } char* memImg = new char[imageSize]; if (memImg != nullptr) { result = dev().iDev()->CreateImage(imgCreateInfo, memImg, &image_); if (result != Pal::Result::Success) { delete[] memImg; return false; } } image_->GetGpuMemoryRequirements(&req); // createInfo.priority; } if ((memoryType() != ImageView) && (memoryType() != ImageBuffer)) { Pal::GpuMemoryCreateInfo createInfo = {}; createInfo.size = amd::alignUp(req.size, MaxGpuAlignment); createInfo.alignment = std::max(req.alignment, MaxGpuAlignment); createInfo.vaRange = Pal::VaRange::Default; createInfo.priority = Pal::GpuMemPriority::Normal; memTypeToHeap(&createInfo); memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment, nullptr, &subOffset_); if (nullptr == memRef_) { memRef_ = GpuMemoryReference::Create(dev(), createInfo); if (nullptr == memRef_) { LogError("Failed PAL memory allocation!"); return false; } } offset_ += static_cast(subOffset_); } else { memRef_ = viewOwner_->memRef_; memRef_->retain(); desc_.cardMemory_ = viewOwner_->desc().cardMemory_; if (req.size > viewOwner_->iMem()->Desc().size) { LogWarning("Image is bigger than the original mem object!"); } } // Check if memory is locked already and restore CPU pointer if (memRef_->cpuAddress_ != nullptr) { address_ = memRef_->cpuAddress_; memRef_->cpuAddress_ = nullptr; mapCount_++; } result = image_->BindGpuMemory(memRef_->gpuMem_, offset_); if (result != Pal::Result::Success) { return false; } hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); if ((0 == hwSrd_) && (memoryType() != ImageView)) { return false; } viewInfo.pImage = image_; viewInfo.swizzledFormat.format = format; viewInfo.swizzledFormat.swizzle = channels; viewInfo.subresRange = ImgSubresRange; dev().iDev()->CreateImageViewSrds(1, &viewInfo, hwState_); hwState_[8] = GetHSAILImageFormatType(desc().format_); hwState_[9] = GetHSAILImageOrderType(desc().format_); hwState_[10] = static_cast(desc().width_); hwState_[11] = 0; // one extra reserved field in the argument return true; } // ================================================================================================ bool Resource::CreateInterop(CreateParams* params) { Pal::Result result; Pal::SubresId ImgSubresId = { Pal::ImageAspect::Color, 0, 0 }; Pal::SubresRange ImgSubresRange = { ImgSubresId, 1, 1 }; Pal::ChannelMapping channels; Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels); Pal::ExternalGpuMemoryOpenInfo gpuMemOpenInfo = {}; Pal::ExternalResourceOpenInfo& openInfo = gpuMemOpenInfo.resourceInfo; uint misc = 0; uint layer = 0; uint mipLevel = 0; InteropType type = InteropTypeless; if (memoryType() == OGLInterop) { OGLInteropParams* oglRes = reinterpret_cast(params); assert(oglRes->glPlatformContext_ && "We don't have OGL context!"); switch (oglRes->type_) { case InteropVertexBuffer: glType_ = GL_RESOURCE_ATTACH_VERTEXBUFFER_AMD; break; case InteropRenderBuffer: glType_ = GL_RESOURCE_ATTACH_RENDERBUFFER_AMD; break; case InteropTexture: case InteropTextureViewLevel: case InteropTextureViewCube: glType_ = GL_RESOURCE_ATTACH_TEXTURE_AMD; break; default: LogError("Unknown OGL interop type!"); return false; break; } glPlatformContext_ = oglRes->glPlatformContext_; layer = oglRes->layer_; type = oglRes->type_; mipLevel = oglRes->mipLevel_; if (!dev().resGLAssociate(oglRes->glPlatformContext_, oglRes->handle_, glType_, &openInfo.hExternalResource, &glInteropMbRes_, &offset_, desc_.format_ #ifdef ATI_OS_WIN , openInfo.doppDesktopInfo #endif )) { return false; } desc_.isDoppTexture_ = (openInfo.doppDesktopInfo.gpuVirtAddr != 0); format = dev().getPalFormat(desc().format_, &channels); } #ifdef ATI_OS_WIN else { D3DInteropParams* d3dRes = reinterpret_cast(params); openInfo.hExternalResource = d3dRes->handle_; misc = d3dRes->misc; layer = d3dRes->layer_; type = d3dRes->type_; mipLevel = d3dRes->mipLevel_; } #endif //! @todo PAL query for image/buffer object doesn't work properly! #if 0 bool isImage = false; if (Pal::Result::Success != dev().iDev()->DetermineExternalSharedResourceType(openInfo, &isImage)) { return false; } #endif // 0 if (desc().buffer_ || misc) { memRef_ = GpuMemoryReference::Create(dev(), gpuMemOpenInfo); if (nullptr == memRef_) { return false; } if (misc) { Pal::ImageCreateInfo imgCreateInfo = {}; Pal::ExternalImageOpenInfo imgOpenInfo = {}; imgOpenInfo.resourceInfo = openInfo; imgOpenInfo.swizzledFormat.format = format; imgOpenInfo.swizzledFormat.swizzle = channels; imgOpenInfo.usage.shaderRead = true; imgOpenInfo.usage.shaderWrite = true; size_t imageSize; size_t gpuMemSize; if (Pal::Result::Success != dev().iDev()->GetExternalSharedImageSizes(imgOpenInfo, &imageSize, &gpuMemSize, &imgCreateInfo)) { return false; } Pal::gpusize viewOffset = 0; imgCreateInfo.flags.shareable = false; imgCreateInfo.imageType = Pal::ImageType::Tex2d; imgCreateInfo.extent.width = desc().width_; imgCreateInfo.extent.height = desc().height_; imgCreateInfo.extent.depth = desc().depth_; imgCreateInfo.arraySize = 1; imgCreateInfo.usageFlags.shaderRead = true; imgCreateInfo.usageFlags.shaderWrite = true; imgCreateInfo.swizzledFormat.format = format; imgCreateInfo.swizzledFormat.swizzle = channels; imgCreateInfo.mipLevels = 1; imgCreateInfo.samples = 1; imgCreateInfo.fragments = 1; imgCreateInfo.tiling = Pal::ImageTiling::Linear; imgCreateInfo.depthPitch = desc().height_ * imgCreateInfo.rowPitch; switch (misc) { case 1: // NV12 or P010 formats switch (layer) { case -1: case 0: break; case 1: // Y - plane size to the offset // NV12 format. UV is 2 times smaller plane Y viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_; imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_; break; default: LogError("Unknown Interop View Type"); return false; } break; case 2: // YV12 format switch (layer) { case -1: case 0: break; case 1: // Y - plane size to the offset // YV12 format. U is 4 times smaller plane than Y viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_; imgCreateInfo.rowPitch >>= 1; break; case 2: // Y + U plane sizes to the offest. // U plane is 4 times smaller than Y and U == V viewOffset = 5 * imgCreateInfo.rowPitch * desc().height_ / 2; imgCreateInfo.rowPitch >>= 1; break; default: LogError("Unknown Interop View Type"); return false; } imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_; break; case 3: // YUY2 format imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_; break; default: LogError("Unknown Interop View Type"); return false; } imageSize = dev().iDev()->GetImageSize(imgCreateInfo, &result); if (result != Pal::Result::Success) { return false; } char* memImg = new char[imageSize]; if (memImg != nullptr) { result = dev().iDev()->CreateImage(imgCreateInfo, memImg, &image_); if (result != Pal::Result::Success) { delete[] memImg; return false; } } offset_ += static_cast(viewOffset); result = image_->BindGpuMemory(iMem(), offset_); if (result != Pal::Result::Success) { return false; } hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); if ((0 == hwSrd_) && (memoryType() != ImageView)) { return false; } Pal::ImageViewInfo viewInfo = {}; viewInfo.viewType = Pal::ImageViewType::Tex2d; viewInfo.pImage = image_; viewInfo.swizzledFormat.format = format; viewInfo.swizzledFormat.swizzle = channels; viewInfo.subresRange = ImgSubresRange; viewInfo.possibleLayouts.engines = Pal::LayoutComputeEngine | Pal::LayoutDmaEngine; viewInfo.possibleLayouts.usages = Pal::LayoutShaderWrite; dev().iDev()->CreateImageViewSrds(1, &viewInfo, hwState_); hwState_[8] = GetHSAILImageFormatType(desc().format_); hwState_[9] = GetHSAILImageOrderType(desc().format_); hwState_[10] = static_cast(desc().width_); hwState_[11] = 0; // one extra reserved field in the argument } } else if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) { memRef_ = GpuMemoryReference::Create(dev(), gpuMemOpenInfo); if (nullptr == memRef_) { return false; } Pal::BufferViewInfo viewInfo = {}; viewInfo.gpuAddr = vmAddress(); viewInfo.range = memRef_->iMem()->Desc().size; viewInfo.stride = elementSize(); viewInfo.swizzledFormat.format = format; viewInfo.swizzledFormat.swizzle = channels; hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); if ((0 == hwSrd_) && (memoryType() != ImageView)) { return false; } dev().iDev()->CreateTypedBufferViewSrds(1, &viewInfo, hwState_); hwState_[8] = GetHSAILImageFormatType(desc().format_); hwState_[9] = GetHSAILImageOrderType(desc().format_); hwState_[10] = static_cast(desc().width_); hwState_[11] = 0; // one extra reserved field in the argument } else { Pal::ExternalImageOpenInfo imgOpenInfo = {}; Pal::ImageCreateInfo imgCreateInfo = {}; imgOpenInfo.resourceInfo = openInfo; imgOpenInfo.swizzledFormat.format = format; imgOpenInfo.swizzledFormat.swizzle = channels; imgOpenInfo.usage.shaderRead = true; imgOpenInfo.usage.shaderWrite = true; memRef_ = GpuMemoryReference::Create(dev(), imgOpenInfo, &imgCreateInfo, &image_); if (nullptr == memRef_) { return false; } hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); if ((0 == hwSrd_) && (memoryType() != ImageView)) { return false; } Pal::ImageViewInfo viewInfo = {}; viewInfo.possibleLayouts.engines = Pal::LayoutComputeEngine | Pal::LayoutDmaEngine; viewInfo.possibleLayouts.usages = Pal::LayoutShaderWrite; viewInfo.viewType = Pal::ImageViewType::Tex2d; switch (imgCreateInfo.imageType) { case Pal::ImageType::Tex3d: viewInfo.viewType = Pal::ImageViewType::Tex3d; break; case Pal::ImageType::Tex1d: viewInfo.viewType = Pal::ImageViewType::Tex1d; break; default: break; } viewInfo.pImage = image_; viewInfo.swizzledFormat.format = format; viewInfo.swizzledFormat.swizzle = channels; if ((type == InteropTextureViewLevel) || (type == InteropTextureViewCube)) { ImgSubresRange.startSubres.mipLevel = mipLevel; if (type == InteropTextureViewCube) { ImgSubresRange.startSubres.arraySlice = layer; viewInfo.viewType = Pal::ImageViewType::Tex2d; } } if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { ImgSubresRange.numSlices = desc_.height_; } if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY) { ImgSubresRange.numSlices = desc_.depth_; } ImgSubresRange.numMips = desc().mipLevels_; viewInfo.subresRange = ImgSubresRange; dev().iDev()->CreateImageViewSrds(1, &viewInfo, hwState_); //! It's a workaround for D24S8 format, since PAL doesn't support this format //! and GSL decompresses 24bit DEPTH into D24S8 for OGL compatibility if ((desc().format_.image_channel_order == CL_DEPTH_STENCIL) && (desc().format_.image_channel_data_type == CL_UNORM_INT24)) { if (dev().settings().gfx10Plus_) { hwState_[1] = (hwState_[1] & ~0x1ff00000) | 0x08d00000; } else { hwState_[1] &= ~0x3c000000; hwState_[1] = (hwState_[1] & ~0x3f00000) | 0x1400000; } } hwState_[8] = GetHSAILImageFormatType(desc().format_); hwState_[9] = GetHSAILImageOrderType(desc().format_); hwState_[10] = static_cast(desc().width_); hwState_[11] = 0; // one extra reserved field in the argument } return true; } // ================================================================================================ bool Resource::CreatePinned(CreateParams* params) { PinnedParams* pinned = reinterpret_cast(params); size_t allocSize = pinned->size_; const amd::HostMemoryReference* hostMemRef = pinned->hostMemRef_; void* pinAddress = address_ = hostMemRef->hostMem(); uint hostMemOffset = 0; // assert((allocSize == (desc().width_ * elementSize())) && "Sizes don't match"); if (desc().topology_ == CL_MEM_OBJECT_BUFFER) { // Allign offset to 4K boundary (Vista/Win7 limitation) char* tmpHost = const_cast( amd::alignDown(reinterpret_cast(address_), PinnedMemoryAlignment)); // Find the partial size for unaligned copy hostMemOffset = static_cast(reinterpret_cast(address_) - tmpHost); offset_ = hostMemOffset; pinAddress = tmpHost; if (hostMemOffset != 0) { allocSize += hostMemOffset; } allocSize = amd::alignUp(allocSize, PinnedMemoryAlignment); // hostMemOffset &= ~(0xff); } else if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D) { //! @todo: Width has to be aligned for 3D. //! Need to be replaced with a compute copy // Width aligned by 8 texels if (((desc().width_ % 0x8) != 0) || // Pitch aligned by 64 bytes (((desc().width_ * elementSize()) % 0x40) != 0)) { return false; } } else { //! @todo GSL doesn't support pinning with resAlloc_ return false; } if (dev().settings().svmFineGrainSystem_) { desc_.SVMRes_ = true; } // Ensure page alignment if ((uint64_t)(pinAddress) & (amd::Os::pageSize() - 1)) { return false; } Pal::PinnedGpuMemoryCreateInfo createInfo = {}; createInfo.pSysMem = pinAddress; createInfo.size = allocSize; createInfo.vaRange = Pal::VaRange::Default; memRef_ = GpuMemoryReference::Create(dev(), createInfo); if (nullptr == memRef_) { LogError("Failed PAL memory allocation!"); return false; } desc_.cardMemory_ = false; return true; } // ================================================================================================ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr) { const bool isFineGrain = (memoryType() == RemoteUSWC) || (memoryType() == Remote); size_t allocSize = amd::alignUp(desc().width_ * elementSize_, dev().properties().gpuMemoryProperties.fragmentSize); if (isFineGrain) { Pal::SvmGpuMemoryCreateInfo createInfo = {}; createInfo.isUsedForKernel = desc_.isAllocExecute_; createInfo.size = allocSize; createInfo.alignment = MaxGpuAlignment; if (svmPtr != 0) { createInfo.flags.useReservedGpuVa = true; createInfo.pReservedGpuVaOwner = params->svmBase_->iMem(); } else { createInfo.flags.useReservedGpuVa = false; createInfo.pReservedGpuVaOwner = nullptr; } if (!dev().settings().svmFineGrainSystem_) { memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment, createInfo.pReservedGpuVaOwner, &subOffset_); } if (memRef_ == nullptr) { memRef_ = GpuMemoryReference::Create(dev(), createInfo); } } else { Pal::GpuMemoryCreateInfo createInfo = {}; createInfo.size = allocSize; createInfo.alignment = MaxGpuAlignment; createInfo.vaRange = Pal::VaRange::Svm; createInfo.priority = Pal::GpuMemPriority::Normal; if (svmPtr != 0) { createInfo.flags.useReservedGpuVa = true; createInfo.pReservedGpuVaOwner = params->svmBase_->iMem(); } memTypeToHeap(&createInfo); memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment, createInfo.pReservedGpuVaOwner, &subOffset_); if (memRef_ == nullptr) { createInfo.alignment = dev().properties().gpuMemoryProperties.fragmentSize; memRef_ = GpuMemoryReference::Create(dev(), createInfo); } } if (nullptr == memRef_) { LogError("Failed PAL memory allocation!"); return false; } desc_.cardMemory_ = false; if ((nullptr != params) && (nullptr != params->owner_) && (nullptr != params->owner_->getSvmPtr())) { params->owner_->setSvmPtr( reinterpret_cast(memRef_->iMem()->Desc().gpuVirtAddr + subOffset_)); offset_ += static_cast(subOffset_); } return true; } // ================================================================================================ bool Resource::create(MemoryType memType, CreateParams* params) { bool imageCreateView = false; bool foundCalRef = false; bool viewDefined = false; uint viewLayer = 0; uint viewLevel = 0; uint viewFlags = 0; Pal::ChannelMapping channels; Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels); // Set the initial offset value for any resource to 0. // Note: Runtime can call create() more than once, if the initial memory type failed offset_ = 0; // This is a thread safe operation const_cast(dev()).initializeHeapResources(); if (memType == Shader) { if (dev().settings().svmFineGrainSystem_) { desc_.isAllocExecute_ = true; desc_.SVMRes_ = true; memType = RemoteUSWC; } else { memType = Local; } // force to use remote memory for HW DEBUG or use // local memory once we determine if FGS is supported // memType = (!dev().settings().enableHwDebug_) ? Local : RemoteUSWC; } // Get the element size elementSize_ = Pal::Formats::BytesPerPixel(format); desc_.type_ = memType; if (memType == Scratch) { // use local memory for scratch buffer unless it is using HW DEBUG desc_.type_ = (!dev().settings().enableHwDebug_) ? Local : RemoteUSWC; desc_.scratch_ = true; } // Force remote allocation if it was requested in the settings if (dev().settings().remoteAlloc_ && ((memoryType() == Local) || (memoryType() == Persistent))) { if (dev().settings().apuSystem_ && dev().settings().viPlus_) { desc_.type_ = Remote; } else { desc_.type_ = RemoteUSWC; } } if (dev().settings().disablePersistent_ && (memoryType() == Persistent)) { desc_.type_ = RemoteUSWC; } if ((memoryType() == OGLInterop) || (memoryType() == D3D9Interop) || (memoryType() == D3D10Interop) || (memoryType() == D3D11Interop)) { return CreateInterop(params); } if (!desc_.buffer_) { return CreateImage(params); } if (memoryType() == Pinned) { return CreatePinned(params); } if (memoryType() == View) { // Save the offset in the global heap ViewParams* view = reinterpret_cast(params); offset_ = view->offset_; // Make sure parent was provided if (nullptr != view->resource_) { viewOwner_ = view->resource_; offset_ += viewOwner_->offset(); if (viewOwner_->data() != nullptr) { address_ = viewOwner_->data() + view->offset_; mapCount_++; } memRef_ = viewOwner_->memRef_; memRef_->retain(); desc_.cardMemory_ = viewOwner_->desc().cardMemory_; } else { desc_.type_ = Empty; } return true; } Pal::gpusize svmPtr = 0; if ((nullptr != params) && (nullptr != params->owner_) && (nullptr != params->owner_->getSvmPtr())) { svmPtr = reinterpret_cast(params->owner_->getSvmPtr()); desc_.SVMRes_ = true; svmPtr = (svmPtr == 1) ? 0 : svmPtr; } if (desc_.SVMRes_) { return CreateSvm(params, svmPtr); } Pal::GpuMemoryCreateInfo createInfo = {}; createInfo.size = desc().width_ * elementSize_; createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment); createInfo.alignment = desc().scratch_ ? 64*Ki : MaxGpuAlignment; createInfo.vaRange = Pal::VaRange::Default; createInfo.priority = Pal::GpuMemPriority::Normal; if (memoryType() == ExternalPhysical) { cl_bus_address_amd bus_address = (reinterpret_cast(params->owner_))->busAddress(); createInfo.surfaceBusAddr = bus_address.surface_bus_address; createInfo.markerBusAddr = bus_address.marker_bus_address; createInfo.flags.sdiExternal = true; } else if (memoryType() == BusAddressable) { createInfo.flags.busAddressable = true; } memTypeToHeap(&createInfo); // createInfo.priority; memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment, nullptr, &subOffset_); if (nullptr == memRef_) { memRef_ = GpuMemoryReference::Create(dev(), createInfo); if (nullptr == memRef_) { LogError("Failed PAL memory allocation!"); return false; } } offset_ += static_cast(subOffset_); // Check if memory is locked already and restore CPU pointer if (memRef_->cpuAddress_ != nullptr) { address_ = memRef_->cpuAddress_; memRef_->cpuAddress_ = nullptr; mapCount_++; } return true; } // ================================================================================================ void Resource::free() { if (memRef_ == nullptr) { return; } const bool wait = (memoryType() != ImageView) && (memoryType() != ImageBuffer) && (memoryType() != View); // OCL has to wait, even if resource is placed in the cache, since reallocation can occur // and resource can be reused on another async queue without a wait on a busy operation if (wait) { if (memRef_->gpu_ == nullptr) { Device::ScopedLockVgpus lock(dev()); // Release all memory objects on all virtual GPUs for (uint idx = 1; idx < dev().vgpus().size(); ++idx) { dev().vgpus()[idx]->waitForEvent(&events_[idx]); } } else { amd::ScopedLock l(memRef_->gpu_->execution()); memRef_->gpu_->waitForEvent(&events_[memRef_->gpu_->index()]); } } else { // After a view destruction the original object is no longer can be associated with a vgpu memRef_->gpu_ = nullptr; } // Destroy PAL resource if (iMem() != 0) { if (mapCount_ != 0 && wait) { if ((memoryType() != Remote) && (memoryType() != RemoteUSWC)) { //! @note: This is a workaround for bad applications that don't unmap memory unmap(nullptr); } else { // Delay CPU address unmap until memRef_ destruction if (!desc_.SVMRes_) { assert(memRef_->cpuAddress_ == nullptr && "Memref shouldn't have a valid CPU address"); memRef_->cpuAddress_ = address_; } } } // Add resource to the cache if (!dev().resourceCache().addGpuMemory(&desc_, memRef_, subOffset_)) { // Free PAL resource palFree(); } } // Free SRD for images if (!desc().buffer_) { dev().srds().freeSrdSlot(hwSrd_); } memRef_ = nullptr; } // ================================================================================================ void Resource::writeRawData(VirtualGPU& gpu, size_t offset, size_t size, const void* data, bool waitForEvent) const { GpuEvent event; // Write data size bytes to surface // size needs to be DWORD aligned assert((size & 3) == 0); gpu.eventBegin(MainEngine); gpu.queue(MainEngine).addCmdMemRef(memRef()); gpu.iCmd()->CmdUpdateMemory(*iMem(), offset_ + offset, size, reinterpret_cast(data)); gpu.eventEnd(MainEngine, event); if (waitForEvent) { //! @note: We don't really have to mark the allocations as busy //! if we are waiting for a transfer // Wait for event to complete gpu.waitForEvent(&event); } else { setBusy(gpu, event); // Update the global GPU event gpu.setGpuEvent(event, false); } } // ================================================================================================ static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement) { if (bytesPerElement == 16) { return Pal::ChNumFormat::X32Y32Z32W32_Uint; } else if (bytesPerElement == 8) { return Pal::ChNumFormat::X32Y32_Uint; } else if (bytesPerElement == 4) { return Pal::ChNumFormat::X32_Uint; } else if (bytesPerElement == 2) { return Pal::ChNumFormat::X16_Uint; } else { return Pal::ChNumFormat::X8_Uint; } } // ================================================================================================ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, const amd::Coord3D& size, Resource& dstResource, bool enableCopyRect, bool flushDMA, uint bytesPerElement) const { GpuEvent event; EngineType activeEngineID = gpu.engineID_; static const bool waitOnBusyEngine = true; assert(!(desc().cardMemory_ && dstResource.desc().cardMemory_) && "Unsupported configuraiton!"); uint64_t gpuMemoryOffset = 0; uint64_t gpuMemoryRowPitch = 0; uint64_t imageOffsetx = 0; bool img1Darray = false; bool img2Darray = false; if (desc().buffer_ && !dstResource.desc().buffer_) { imageOffsetx = dstOrigin[0] % dstResource.elementSize(); gpuMemoryOffset = srcOrigin[0] + offset(); gpuMemoryRowPitch = (srcOrigin[1]) ? srcOrigin[1] : size[0] * dstResource.elementSize(); img1Darray = (dstResource.desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY); img2Darray = (dstResource.desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY); } else if (!desc().buffer_ && dstResource.desc().buffer_) { imageOffsetx = srcOrigin[0] % elementSize(); gpuMemoryOffset = dstOrigin[0] + dstResource.offset(); gpuMemoryRowPitch = (dstOrigin[1]) ? dstOrigin[1] : size[0] * elementSize(); img1Darray = (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY); img2Darray = (desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY); } if ((desc().buffer_ && !dstResource.desc().buffer_) || (!desc().buffer_ && dstResource.desc().buffer_)) { // sDMA cannot be used for the below conditions // Make sure linear pitch in bytes is 4 bytes aligned if (((gpuMemoryRowPitch % 4) != 0) || // another DRM restriciton... SI has 4 pixels (gpuMemoryOffset % 4 != 0) || (dev().settings().sdamPageFaultWar_ && (imageOffsetx != 0))) { return false; } } gpu.engineID_ = SdmaEngine; // Wait for the resources, since runtime may use async transfers wait(gpu, waitOnBusyEngine); dstResource.wait(gpu, waitOnBusyEngine); if (gpu.validateSdmaOverlap(*this, dstResource)) { // Note: PAL should insert a NOP into the command buffer for synchronization gpu.addBarrier(); } Pal::ImageLayout imgLayout = {}; gpu.eventBegin(gpu.engineID_); gpu.queue(gpu.engineID_).addCmdMemRef(memRef()); gpu.queue(gpu.engineID_).addCmdMemRef(dstResource.memRef()); if (desc().buffer_ && !dstResource.desc().buffer_) { Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, dstResource.desc().baseLevel_, 0}; Pal::MemoryImageCopyRegion copyRegion = {}; copyRegion.imageSubres = ImgSubresId; copyRegion.imageOffset.x = dstOrigin[0]; copyRegion.imageOffset.y = dstOrigin[1]; copyRegion.imageOffset.z = dstOrigin[2]; copyRegion.imageExtent.width = size[0]; copyRegion.imageExtent.height = size[1]; copyRegion.imageExtent.depth = size[2]; copyRegion.numSlices = 1; if (img1Darray) { copyRegion.numSlices = copyRegion.imageExtent.height; copyRegion.imageExtent.height = 1; } else if (img2Darray) { copyRegion.numSlices = copyRegion.imageExtent.depth; copyRegion.imageExtent.depth = 1; } copyRegion.gpuMemoryOffset = gpuMemoryOffset; copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch; copyRegion.gpuMemoryDepthPitch = (srcOrigin[2]) ? srcOrigin[2] : copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height; gpu.iCmd()->CmdCopyMemoryToImage(*iMem(), *dstResource.image_, imgLayout, 1, ©Region); } else if (!desc().buffer_ && dstResource.desc().buffer_) { Pal::MemoryImageCopyRegion copyRegion = {}; Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, desc().baseLevel_, 0}; copyRegion.imageSubres = ImgSubresId; copyRegion.imageOffset.x = srcOrigin[0]; copyRegion.imageOffset.y = srcOrigin[1]; copyRegion.imageOffset.z = srcOrigin[2]; copyRegion.imageExtent.width = size[0]; copyRegion.imageExtent.height = size[1]; copyRegion.imageExtent.depth = size[2]; copyRegion.numSlices = 1; if (img1Darray) { copyRegion.numSlices = copyRegion.imageExtent.height; copyRegion.imageExtent.height = 1; } else if (img2Darray) { copyRegion.numSlices = copyRegion.imageExtent.depth; copyRegion.imageExtent.depth = 1; } copyRegion.gpuMemoryOffset = gpuMemoryOffset; copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch; copyRegion.gpuMemoryDepthPitch = (dstOrigin[2]) ? dstOrigin[2] : copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height; gpu.iCmd()->CmdCopyImageToMemory(*image_, imgLayout, *dstResource.iMem(), 1, ©Region); } else { if (enableCopyRect) { Pal::TypedBufferCopyRegion copyRegion = {}; Pal::ChannelMapping channels = {Pal::ChannelSwizzle::X, Pal::ChannelSwizzle::Y, Pal::ChannelSwizzle::Z, Pal::ChannelSwizzle::W}; copyRegion.srcBuffer.swizzledFormat.format = ChannelFmt(bytesPerElement); copyRegion.srcBuffer.swizzledFormat.swizzle = channels; copyRegion.srcBuffer.offset = srcOrigin[0] + offset(); copyRegion.srcBuffer.rowPitch = srcOrigin[1]; copyRegion.srcBuffer.depthPitch = srcOrigin[2]; copyRegion.extent.width = size[0] / bytesPerElement; copyRegion.extent.height = size[1]; copyRegion.extent.depth = size[2]; copyRegion.dstBuffer.swizzledFormat.format = ChannelFmt(bytesPerElement); copyRegion.dstBuffer.swizzledFormat.swizzle = channels; copyRegion.dstBuffer.offset = dstOrigin[0] + dstResource.offset(); copyRegion.dstBuffer.rowPitch = dstOrigin[1]; copyRegion.dstBuffer.depthPitch = dstOrigin[2]; gpu.iCmd()->CmdCopyTypedBuffer(*iMem(), *dstResource.iMem(), 1, ©Region); } else { Pal::MemoryCopyRegion copyRegion = {}; copyRegion.srcOffset = srcOrigin[0] + offset(); copyRegion.dstOffset = dstOrigin[0] + dstResource.offset(); copyRegion.copySize = size[0]; gpu.iCmd()->CmdCopyMemory(*iMem(), *dstResource.iMem(), 1, ©Region); } } gpu.eventEnd(gpu.engineID_, event); // Mark source and destination as busy setBusy(gpu, event); dstResource.setBusy(gpu, event); // Update the global GPU event gpu.setGpuEvent(event, flushDMA); // Restore the original engine gpu.engineID_ = activeEngineID; return true; } // ================================================================================================ void Resource::setBusy(VirtualGPU& gpu, GpuEvent gpuEvent) const { addGpuEvent(gpu, gpuEvent); // If current resource is a view, then update the parent event as well if (viewOwner_ != nullptr) { viewOwner_->setBusy(gpu, gpuEvent); } } // ================================================================================================ void Resource::wait(VirtualGPU& gpu, bool waitOnBusyEngine) const { GpuEvent* gpuEvent = getGpuEvent(gpu); // Check if we have to wait unconditionally if (!waitOnBusyEngine || // or we have to wait only if another engine was used on this resource (gpuEvent->engineId_ != gpu.engineID_)) { gpu.waitForEvent(gpuEvent); } // If current resource is a view and not in the global heap, // then wait for the parent event as well if (viewOwner_ != nullptr) { viewOwner_->wait(gpu, waitOnBusyEngine); } } // ================================================================================================ bool Resource::hostWrite(VirtualGPU* gpu, const void* hostPtr, const amd::Coord3D& origin, const amd::Coord3D& size, uint flags, size_t rowPitch, size_t slicePitch) { void* dst; size_t startLayer = origin[2]; size_t numLayers = size[2]; if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { startLayer = origin[1]; numLayers = size[1]; } // Get physical GPU memmory dst = map(gpu, flags, startLayer, numLayers); if (nullptr == dst) { LogError("Couldn't map GPU memory for host write"); return false; } if (1 == desc().dimSize_) { size_t copySize = (desc().buffer_) ? size[0] : size[0] * elementSize_; // Update the pointer dst = static_cast(static_cast(dst) + origin[0]); // Copy memory amd::Os::fastMemcpy(dst, hostPtr, copySize); } else { size_t dstOffsBase = origin[0] * elementSize_; // Make sure we use the right pitch if it's not specified if (rowPitch == 0) { rowPitch = size[0] * elementSize_; } // Make sure we use the right slice if it's not specified if (slicePitch == 0) { slicePitch = size[0] * size[1] * elementSize_; } // Adjust the destination offset with Y dimension dstOffsBase += desc().pitch_ * origin[1] * elementSize_; // Adjust the destination offset with Z dimension dstOffsBase += desc().slice_ * origin[2] * elementSize_; // Copy memory slice by slice for (size_t slice = 0; slice < size[2]; ++slice) { size_t dstOffs = dstOffsBase + slice * desc().slice_ * elementSize_; size_t srcOffs = slice * slicePitch; // Copy memory line by line for (size_t row = 0; row < size[1]; ++row) { // Copy memory amd::Os::fastMemcpy((reinterpret_cast
(dst) + dstOffs), (reinterpret_cast(hostPtr) + srcOffs), size[0] * elementSize_); dstOffs += desc().pitch_ * elementSize_; srcOffs += rowPitch; } } } // Unmap GPU memory unmap(gpu); return true; } // ================================================================================================ bool Resource::hostRead(VirtualGPU* gpu, void* hostPtr, const amd::Coord3D& origin, const amd::Coord3D& size, size_t rowPitch, size_t slicePitch) { void* src; size_t startLayer = origin[2]; size_t numLayers = size[2]; if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { startLayer = origin[1]; numLayers = size[1]; } // Get physical GPU memmory src = map(gpu, ReadOnly, startLayer, numLayers); if (nullptr == src) { LogError("Couldn't map GPU memory for host read"); return false; } if (1 == desc().dimSize_) { size_t copySize = (desc().buffer_) ? size[0] : size[0] * elementSize_; // Update the pointer src = static_cast(static_cast(src) + origin[0]); // Copy memory amd::Os::fastMemcpy(hostPtr, src, copySize); } else { size_t srcOffsBase = origin[0] * elementSize_; // Make sure we use the right pitch if it's not specified if (rowPitch == 0) { rowPitch = size[0] * elementSize_; } // Make sure we use the right slice if it's not specified if (slicePitch == 0) { slicePitch = size[0] * size[1] * elementSize_; } // Adjust destination offset with Y dimension srcOffsBase += desc().pitch_ * origin[1] * elementSize_; // Adjust the destination offset with Z dimension srcOffsBase += desc().slice_ * origin[2] * elementSize_; // Copy memory line by line for (size_t slice = 0; slice < size[2]; ++slice) { size_t srcOffs = srcOffsBase + slice * desc().slice_ * elementSize_; size_t dstOffs = slice * slicePitch; // Copy memory line by line for (size_t row = 0; row < size[1]; ++row) { // Copy memory amd::Os::fastMemcpy((reinterpret_cast
(hostPtr) + dstOffs), (reinterpret_cast(src) + srcOffs), size[0] * elementSize_); srcOffs += desc().pitch_ * elementSize_; dstOffs += rowPitch; } } } // Unmap GPU memory unmap(gpu); return true; } // ================================================================================================ void* Resource::gpuMemoryMap(size_t* pitch, uint flags, Pal::IGpuMemory* resource) const { if (desc_.cardMemory_ && !isPersistentDirectMap()) { // @todo remove const cast Unimplemented(); return nullptr; // return const_cast(dev()).resMapLocal(*pitch, resource, flags); } else { amd::ScopedLock lk(dev().lockPAL()); void* address; if (image_ != nullptr) { constexpr Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, 0, 0}; Pal::SubresLayout layout; image_->GetSubresourceLayout(ImgSubresId, &layout); *pitch = layout.rowPitch / elementSize(); } *pitch = desc().width_; if (Pal::Result::Success == resource->Map(&address)) { return address; } else { LogError("PAL GpuMemory->Map() failed!"); return nullptr; } } } // ================================================================================================ void Resource::gpuMemoryUnmap(Pal::IGpuMemory* resource) const { if (desc_.cardMemory_ && !isPersistentDirectMap()) { // @todo remove const cast Unimplemented(); // const_cast(dev()).resUnmapLocal(resource); } else { Pal::Result result = resource->Unmap(); if (Pal::Result::Success != result) { LogError("PAL GpuMemory->Unmap() failed!"); } } } // ================================================================================================ bool Resource::glAcquire() { bool retVal = true; if (desc().type_ == OGLInterop) { retVal = dev().resGLAcquire(glPlatformContext_, glInteropMbRes_, glType_); } return retVal; } // ================================================================================================ bool Resource::glRelease() { bool retVal = true; if (desc().type_ == OGLInterop) { retVal = dev().resGLRelease(glPlatformContext_, glInteropMbRes_, glType_); } return retVal; } // ================================================================================================ void Resource::addGpuEvent(const VirtualGPU& gpu, GpuEvent event) const { uint idx = gpu.index(); assert(idx < events_.size()); events_[idx] = event; } // ================================================================================================ GpuEvent* Resource::getGpuEvent(const VirtualGPU& gpu) const { uint idx = gpu.index(); assert((idx < events_.size()) && "Undeclared queue access!"); return &events_[idx]; } // ================================================================================================ void Resource::setModified(VirtualGPU& gpu, bool modified) const { uint idx = gpu.index(); assert(idx < events_.size()); events_[idx].modified_ = modified; // If current resource is a view, then update the parent as well if (viewOwner_ != nullptr) { viewOwner_->setModified(gpu, modified); } } // ================================================================================================ bool Resource::isModified(VirtualGPU& gpu) const { uint idx = gpu.index(); assert(idx < events_.size()); bool modified = events_[idx].modified_; // If current resource is a view, then get the parent state as well if (viewOwner_ != nullptr) { modified |= viewOwner_->isModified(gpu); } return modified; } // ================================================================================================ void Resource::palFree() const { if (desc().type_ == OGLInterop) { amd::ScopedLock lk(dev().lockPAL()); dev().resGLFree(glPlatformContext_, glInteropMbRes_, glType_); } memRef_->release(); } // ================================================================================================ bool Resource::isMemoryType(MemoryType memType) const { if (memoryType() == memType) { return true; } else if (memoryType() == View) { return viewOwner_->isMemoryType(memType); } return false; } // ================================================================================================ bool Resource::isPersistentDirectMap() const { bool directMap = ((memoryType() == Resource::Persistent) && (desc().dimSize_ < 3) && !desc().imageArray_); // If direct map is possible, then validate it with the current tiling if (directMap && desc().tiled_) { //!@note IOL for Linux doesn't support tiling aperture // and runtime doesn't force linear images in persistent directMap = IS_WINDOWS && !dev().settings().linearPersistentImage_; } return directMap; } // ================================================================================================ void* Resource::map(VirtualGPU* gpu, uint flags, uint startLayer, uint numLayers) { if (isMemoryType(Pinned)) { // Check if we have to wait if (!(flags & NoWait)) { if (gpu != nullptr) { wait(*gpu); } } return address_; } if (flags & ReadOnly) { } if (flags & WriteOnly) { } // Check if we have to wait if (!(flags & NoWait)) { if (gpu != nullptr) { wait(*gpu); } } // Check if memory wasn't mapped yet if (++mapCount_ == 1) { if ((desc().dimSize_ == 3) || desc().imageArray_ || ((desc().type_ == ImageView) && viewOwner_->mipMapped())) { // Save map info for multilayer map/unmap startLayer_ = startLayer; numLayers_ = numLayers; mapFlags_ = flags; // Map with layers address_ = mapLayers(gpu, flags); } else { // Map current resource if (memRef_->cpuAddress_ != nullptr) { // Suballocations are mapped by the memory suballocator address_ = reinterpret_cast(memRef_->cpuAddress_) + subOffset_; } else { address_ = gpuMemoryMap(&desc_.pitch_, flags, iMem()); address_ = reinterpret_cast
(address_) + offset_; } if (address_ == nullptr) { LogError("cal::ResMap failed!"); --mapCount_; return nullptr; } } } //! \note the atomic operation with counter doesn't // guarantee that the address will be valid, // since PAL could still process the first map if (address_ == nullptr) { for (uint i = 0; address_ == NULL && i < 10; ++i) { amd::Os::sleep(1); } assert((address_ != nullptr) && "Multiple maps failed!"); } return address_; } // ================================================================================================ void* Resource::mapLayers(VirtualGPU* gpu, uint flags) { Unimplemented(); return nullptr; } // ================================================================================================ void Resource::unmap(VirtualGPU* gpu) { if (isMemoryType(Pinned)) { return; } // Decrement map counter int count = --mapCount_; // Check if it's the last unmap if (count == 0) { if ((desc().dimSize_ == 3) || desc().imageArray_ || ((desc().type_ == ImageView) && viewOwner_->mipMapped())) { // Unmap layers unmapLayers(gpu); } else { // Unmap current resource gpuMemoryUnmap(iMem()); } address_ = nullptr; } else if (count < 0) { LogError("dev().serialCalResUnmap failed!"); ++mapCount_; return; } } // ================================================================================================ void Resource::unmapLayers(VirtualGPU* gpu) { Unimplemented(); } // ================================================================================================ bool MemorySubAllocator::InitAllocator(GpuMemoryReference* mem_ref) { MemBuddyAllocator* allocator = new MemBuddyAllocator( device_, device_->settings().subAllocationChunkSize_, device_->settings().subAllocationMinSize_); if (!((allocator != nullptr) && (allocator->Init() == Pal::Result::Success) && heaps_.insert({mem_ref, allocator}).second)) { mem_ref->release(); delete allocator; return false; } return true; } // ================================================================================================ bool MemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) { Pal::GpuMemoryCreateInfo createInfo = {}; createInfo.size = device_->settings().subAllocationChunkSize_; createInfo.alignment = device_->properties().gpuMemoryProperties.fragmentSize; createInfo.vaRange = Pal::VaRange::Default; createInfo.priority = Pal::GpuMemPriority::Normal; createInfo.heapCount = 1; createInfo.heaps[0] = Pal::GpuHeapInvisible; GpuMemoryReference* mem_ref = GpuMemoryReference::Create(*device_, createInfo); if (mem_ref != nullptr) { return InitAllocator(mem_ref); } return false; } // ================================================================================================ bool CoarseMemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) { Pal::GpuMemoryCreateInfo createInfo = {}; createInfo.size = device_->settings().subAllocationChunkSize_; createInfo.alignment = device_->properties().gpuMemoryProperties.fragmentSize; createInfo.vaRange = Pal::VaRange::Svm; createInfo.priority = Pal::GpuMemPriority::Normal; createInfo.flags.useReservedGpuVa = (reserved_va != nullptr); createInfo.pReservedGpuVaOwner = reserved_va; createInfo.heapCount = 2; createInfo.heaps[0] = Pal::GpuHeapInvisible; createInfo.heaps[1] = Pal::GpuHeapLocal; GpuMemoryReference* mem_ref = GpuMemoryReference::Create(*device_, createInfo); if (mem_ref != nullptr) { return InitAllocator(mem_ref); } return false; } // ================================================================================================ bool FineMemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) { Pal::SvmGpuMemoryCreateInfo createInfo = {}; createInfo.isUsedForKernel = false; createInfo.size = device_->settings().subAllocationChunkSize_; createInfo.alignment = MaxGpuAlignment; createInfo.flags.useReservedGpuVa = (reserved_va != nullptr); createInfo.pReservedGpuVaOwner = reserved_va; GpuMemoryReference* mem_ref = GpuMemoryReference::Create(*device_, createInfo); if ((mem_ref != nullptr) && InitAllocator(mem_ref)) { mem_ref->iMem()->Map(&mem_ref->cpuAddress_); return mem_ref->cpuAddress_ != nullptr; } return false; } // ================================================================================================ MemorySubAllocator::~MemorySubAllocator() { // Release memory heap for suballocations for (const auto& it : heaps_) { it.first->release(); delete it.second; } } // ================================================================================================ GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize alignment, const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset) { GpuMemoryReference* mem_ref = nullptr; MemBuddyAllocator* allocator = nullptr; // Check if the resource size and alignment are allowed for suballocation if ((size < device_->settings().subAllocationMaxSize_) && (alignment <= device_->properties().gpuMemoryProperties.fragmentSize)) { uint i = 0; size = amd::alignUp(size, device_->settings().subAllocationMinSize_); do { // Find if current heap has enough empty space for (const auto& it : heaps_) { mem_ref = it.first; allocator = it.second; // SVM allocations may required a fixed VA, make sure we find the heap with the same VA if (reserved_va && (reserved_va->Desc().gpuVirtAddr != mem_ref->iMem()->Desc().gpuVirtAddr)) { continue; } // If we have found a valid chunk, then suballocate memory if (Pal::Result::Success == allocator->Allocate(size, alignment, offset)) { return mem_ref; } } // We didn't find a valid chunk, so create a new one if (!CreateChunk(reserved_va)) { return nullptr; } i++; } while (i < 2); } return nullptr; } // ================================================================================================ bool MemorySubAllocator::Free(amd::Monitor* monitor, GpuMemoryReference* ref, Pal::gpusize offset) { bool release_mem = false; { amd::ScopedLock l(monitor); // Find if current memory reference is a chunk allocation auto it = heaps_.find(ref); if (it == heaps_.end()) { return false; } it->second->Free(offset); // If this suballocator empty, then release memory chunk if (it->second->IsEmpty()) { delete it->second; heaps_.erase(it); release_mem = true; } } if (release_mem) { ref->release(); } return true; } // ================================================================================================ ResourceCache::~ResourceCache() { free(); } // ================================================================================================ //! \note the cache works in FILO mode bool ResourceCache::addGpuMemory(Resource::Descriptor* desc, GpuMemoryReference* ref, Pal::gpusize offset) { bool result = false; size_t size = ref->iMem()->Desc().size; // Check if runtime can free suballocation if ((desc->type_ == Resource::Local) && !desc->SVMRes_) { result = mem_sub_alloc_local_.Free(&lockCacheOps_, ref, offset); } else if ((desc->type_ == Resource::Local) && desc->SVMRes_) { result = mem_sub_alloc_coarse_.Free(&lockCacheOps_, ref, offset); } else if (desc->SVMRes_) { result = mem_sub_alloc_fine_.Free(&lockCacheOps_, ref, offset); } // If a resource was a suballocation, don't try to cache it if (result == true) { return result; } // Make sure current allocation isn't bigger than cache if (((desc->type_ == Resource::Local) || (desc->type_ == Resource::Persistent) || (desc->type_ == Resource::Remote) || (desc->type_ == Resource::RemoteUSWC)) && (size < cacheSizeLimit_) && !desc->SVMRes_) { // Validate the cache size limit. Loop until we have enough space while ((cacheSize_ + size) > cacheSizeLimit_) { removeLast(); } Resource::Descriptor* descCached = new Resource::Descriptor; if (descCached != nullptr) { // Copy the original desc to the cached version memcpy(descCached, desc, sizeof(Resource::Descriptor)); amd::ScopedLock l(&lockCacheOps_); // Add the current resource to the cache resCache_.push_front({descCached, ref}); ref->gpu_ = nullptr; cacheSize_ += size; if (desc->type_ == Resource::Local) { lclCacheSize_ += size; } result = true; } } return result; } // ================================================================================================ GpuMemoryReference* ResourceCache::findGpuMemory(Resource::Descriptor* desc, Pal::gpusize size, Pal::gpusize alignment, const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset) { amd::ScopedLock l(&lockCacheOps_); GpuMemoryReference* ref = nullptr; // Check if the runtime can suballocate memory if ((desc->type_ == Resource::Local) && !desc->SVMRes_) { ref = mem_sub_alloc_local_.Allocate(size, alignment, reserved_va, offset); } else if ((desc->type_ == Resource::Local) && desc->SVMRes_) { ref = mem_sub_alloc_coarse_.Allocate(size, alignment, reserved_va, offset); } else if (desc->SVMRes_) { ref = mem_sub_alloc_fine_.Allocate(size, alignment, reserved_va, offset); } if (ref != nullptr) { return ref; } // Early exit if resource is too big if (size >= cacheSizeLimit_ || desc->SVMRes_) { //! \note we may need to free the cache here to reduce memory pressure return ref; } // Serach the right resource through the cache list for (const auto& it : resCache_) { Resource::Descriptor* entry = it.first; size_t sizeRes = it.second->iMem()->Desc().size; // Find if we can reuse this entry if ((entry->type_ == desc->type_) && (entry->flags_ == desc->flags_) && (size <= sizeRes) && (size > (sizeRes >> 1)) && ((it.second->iMem()->Desc().gpuVirtAddr % alignment) == 0) && (entry->isAllocExecute_ == desc->isAllocExecute_)) { ref = it.second; cacheSize_ -= sizeRes; if (entry->type_ == Resource::Local) { lclCacheSize_ -= sizeRes; } delete it.first; // Remove the found etry from the cache resCache_.remove(it); break; } } return ref; } // ================================================================================================ bool ResourceCache::free(size_t minCacheEntries) { bool result = false; if (minCacheEntries < resCache_.size()) { result = true; // Clear the cache while (static_cast(cacheSize_) > 0) { removeLast(); } CondLog((cacheSize_ != 0), "Incorrect size for cache release!"); } return result; } // ================================================================================================ void ResourceCache::removeLast() { std::pair entry; { // Protect access to the global data amd::ScopedLock l(&lockCacheOps_); entry = resCache_.back(); resCache_.pop_back(); cacheSize_ -= entry.second->iMem()->Desc().size; if (entry.first->type_ == Resource::Local) { lclCacheSize_ -= entry.second->iMem()->Desc().size; } // Delete Descriptor delete entry.first; } // Destroy PAL resource entry.second->release(); } } // namespace pal