diff --git a/projects/clr/rocclr/runtime/device/device.cpp b/projects/clr/rocclr/runtime/device/device.cpp index 20b6bbdd21..e085459449 100644 --- a/projects/clr/rocclr/runtime/device/device.cpp +++ b/projects/clr/rocclr/runtime/device/device.cpp @@ -15,6 +15,13 @@ extern amd::AppProfile* oclhsaCreateAppProfile(); #include "device/cpu/cpudevice.hpp" #endif // WITH_CPU_DEVICE +#if defined(WITH_PAL_DEVICE) +//namespace pal { +extern bool PalDeviceLoad(); +extern void PalDeviceUnload(); +//} +#endif // WITH_PAL_DEVICE + #if defined(WITH_GPU_DEVICE) extern bool DeviceLoad(); extern void DeviceUnload(); @@ -177,9 +184,12 @@ Device::init() ret |= oclhsa::NullDevice::init(); } #endif // WITH_HSA_DEVICE -#if defined(WITH_GPU_DEVICE) +#if defined(WITH_GPU_DEVICE) && !defined(WITH_PAL_DEVICE) ret |= DeviceLoad(); #endif // WITH_GPU_DEVICE +#if defined(WITH_PAL_DEVICE) + ret |= PalDeviceLoad(); +#endif // WITH_PAL_DEVICE #if defined(WITH_CPU_DEVICE) ret |= cpu::Device::init(); #endif // WITH_CPU_DEVICE @@ -203,9 +213,12 @@ Device::tearDown() oclhsaAppProfile_ = NULL; } #endif // WITH_HSA_DEVICE -#if defined(WITH_GPU_DEVICE) +#if defined(WITH_GPU_DEVICE) && !defined(WITH_PAL_DEVICE) DeviceUnload(); #endif // WITH_GPU_DEVICE +#if defined(WITH_PAL_DEVICE) + PalDeviceUnload(); +#endif // WITH_PAL_DEVICE #if defined(WITH_CPU_DEVICE) cpu::Device::tearDown(); #endif // WITH_CPU_DEVICE diff --git a/projects/clr/rocclr/runtime/device/pal/palappprofile.cpp b/projects/clr/rocclr/runtime/device/pal/palappprofile.cpp new file mode 100644 index 0000000000..b5a7e40a6c --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palappprofile.cpp @@ -0,0 +1,25 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+// + +#include "top.hpp" +#include "utils/debug.hpp" +#include "device/appprofile.hpp" +#include "device/pal/palappprofile.hpp" + +namespace pal { + +AppProfile::AppProfile() + : amd::AppProfile() + , enableHighPerformanceState_(true) + , reportAsOCL12Device_(false) +{ + propertyDataMap_.insert(DataMap::value_type("HighPerfState", + PropertyData(DataType_Boolean, &enableHighPerformanceState_))); + + propertyDataMap_.insert(DataMap::value_type("OCL12Device", + PropertyData(DataType_Boolean, &reportAsOCL12Device_))); +} + +} + diff --git a/projects/clr/rocclr/runtime/device/pal/palappprofile.hpp b/projects/clr/rocclr/runtime/device/pal/palappprofile.hpp new file mode 100644 index 0000000000..63f4965d0f --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palappprofile.hpp @@ -0,0 +1,30 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. +// + +#ifndef PALAPPPROFILE_HPP_ +#define PALAPPPROFILE_HPP_ + +#include +#include + +namespace pal { + +class AppProfile : public amd::AppProfile +{ +public: + AppProfile(); + + //! return the value of enableHighPerformanceState_ + bool enableHighPerformanceState() const { return enableHighPerformanceState_; } + bool reportAsOCL12Device() const { return reportAsOCL12Device_; } + +private: + + bool enableHighPerformanceState_; + bool reportAsOCL12Device_; +}; + +} + +#endif // PALAPPPROFILE_HPP_ diff --git a/projects/clr/rocclr/runtime/device/pal/palbinary.cpp b/projects/clr/rocclr/runtime/device/pal/palbinary.cpp new file mode 100644 index 0000000000..0ceca32b3c --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palbinary.cpp @@ -0,0 +1,7 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+// +namespace pal { + + +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palbinary.hpp b/projects/clr/rocclr/runtime/device/pal/palbinary.hpp new file mode 100644 index 0000000000..5026663a8f --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palbinary.hpp @@ -0,0 +1,48 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. +// +#ifndef PALBINARY_HPP_ +#define PALBINARY_HPP_ + +#include "top.hpp" +#include "device/pal/paldevice.hpp" +#include "device/pal/palkernel.hpp" + +namespace pal { + +class ClBinaryHsa : public device::ClBinary +{ +public: + ClBinaryHsa(const Device& dev, BinaryImageFormat bifVer = BIF_VERSION3) + : device::ClBinary(dev, bifVer) + {} + + //! Destructor + ~ClBinaryHsa() {} + + +protected: + bool setElfTarget() { + uint32_t target = static_cast(21);//dev().calTarget()); + assert (((0xFFFF8000 & target) == 0) && "ASIC target ID >= 2^15"); + uint16_t elf_target = (uint16_t)(0x7FFF & target); + return elfOut()->setTarget(elf_target, amd::OclElf::CAL_PLATFORM); + return true; + } + +private: + //! Disable default copy constructor + ClBinaryHsa(const ClBinaryHsa&); + + //! Disable default operator= + ClBinaryHsa& operator=(const ClBinaryHsa&); + + //! Returns the HSA device for this object + const Device& dev() const { return static_cast(dev_); } + +}; + +} // namespace pal + +#endif // PALBINARY_HPP_ + diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp new file mode 100644 index 0000000000..386926d714 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp @@ -0,0 +1,2775 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+// +#include "platform/commandqueue.hpp" +#include "device/pal/paldevice.hpp" +#include "device/pal/palblit.hpp" +#include "device/pal/palmemory.hpp" +#include "device/pal/palvirtual.hpp" +#include "utils/debug.hpp" +#include + +namespace pal { + +DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup) + : HostBlitManager(gpu, setup) + , MinSizeForPinnedTransfer(dev().settings().pinnedMinXferSize_) + , completeOperation_(false) + , context_(NULL) +{ +} + +inline void +DmaBlitManager::synchronize() const +{ + if (syncOperation_) { + gpu().waitAllEngines(); + gpu().releaseMemObjects(); + } +} + +inline Memory& +DmaBlitManager::gpuMem(device::Memory& mem) const +{ + return static_cast(mem); +} + +bool +DmaBlitManager::readMemoryStaged( + Memory& srcMemory, + void* dstHost, + Memory** xferBuf, + size_t origin, + size_t& offset, + size_t& totalSize, + size_t xferSize) const +{ + amd::Coord3D dst(0, 0, 0); + size_t tmpSize; + uint idxWrite = 0; + uint idxRead = 0; + size_t chunkSize; + static const bool CopyRect = false; + // Flush DMA for ASYNC copy + static const bool FlushDMA = true; + + if (dev().xferRead().bufSize() < 128 * Ki) { + chunkSize = dev().xferRead().bufSize(); + } + else { + chunkSize = std::min(amd::alignUp(xferSize / 4, 256), + dev().xferRead().bufSize()); + chunkSize = std::max(chunkSize, 128 * Ki); + } + + // Find the partial transfer size + tmpSize = std::min(chunkSize, xferSize); + + amd::Coord3D srcLast(origin + offset, 0, 0); + amd::Coord3D copySizeLast(tmpSize, 0, 0); + + // Copy data into the temporary surface + if (!srcMemory.partialMemCopyTo(gpu(), srcLast, dst, copySizeLast, + *xferBuf[idxWrite], CopyRect, FlushDMA)) { + return false; + } + + totalSize -= tmpSize; + xferSize -= tmpSize; + offset += tmpSize; + + while (xferSize != 0) { + // Find the partial transfer size + tmpSize = std::min(chunkSize, xferSize); + + amd::Coord3D src(origin + offset, 0, 0); + amd::Coord3D copySize(tmpSize, 0, 0); + + idxWrite = (idxWrite + 1) % 2; + // 
Copy data into the temporary surface + if (!srcMemory.partialMemCopyTo(gpu(), src, dst, copySize, + *xferBuf[idxWrite], CopyRect, FlushDMA)) { + return false; + } + + // Read previous buffer + if (!xferBuf[idxRead]->hostRead(&gpu(), + reinterpret_cast(dstHost) + offset - copySizeLast[0], + dst, copySizeLast)) { + return false; + } + idxRead = (idxRead + 1) % 2; + copySizeLast = copySize; + + totalSize -= tmpSize; + xferSize -= tmpSize; + offset += tmpSize; + } + + // Last read + if (!xferBuf[idxRead]->hostRead(&gpu(), + reinterpret_cast(dstHost) + offset - copySizeLast[0], dst, copySizeLast)) { + return false; + } + return true; +} + +bool +DmaBlitManager::readBuffer( + device::Memory& srcMemory, + void* dstHost, + const amd::Coord3D& origin, + const amd::Coord3D& size, + bool entire) const +{ + // Use host copy if memory has direct access + if (setup_.disableReadBuffer_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + return HostBlitManager::readBuffer( + srcMemory, dstHost, origin, size, entire); + } + else { + size_t srcSize = size[0]; + size_t offset = 0; + size_t pinSize = dev().settings().pinnedXferSize_; + pinSize = std::min(pinSize, srcSize); + + // Check if a pinned transfer can be executed + if (pinSize && (srcSize > MinSizeForPinnedTransfer)) { + // Allign offset to 4K boundary (Vista/Win7 limitation) + char* tmpHost = const_cast( + amd::alignDown(reinterpret_cast(dstHost), + PinnedMemoryAlignment)); + + // Find the partial size for unaligned copy + size_t partial = reinterpret_cast(dstHost) - tmpHost; + + amd::Memory* pinned = NULL; + bool first = true; + size_t tmpSize; + size_t pinAllocSize; + + // Copy memory, using pinning + while (srcSize > 0) { + // If it's the first iterarion, then readjust the copy size + // to include alignment + if (first) { + pinAllocSize = amd::alignUp(pinSize + partial, + PinnedMemoryAlignment); + tmpSize = std::min(pinAllocSize - partial, srcSize); + first = false; + } + else { + 
tmpSize = std::min(pinSize, srcSize); + pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment); + partial = 0; + } + amd::Coord3D dst(partial, 0, 0); + amd::Coord3D srcPin(origin[0] + offset, 0, 0); + amd::Coord3D copySizePin(tmpSize, 0, 0); + size_t partial2; + + // Allocate a GPU resource for pinning + pinned = pinHostMemory(tmpHost, pinAllocSize, partial2); + + if (pinned != NULL) { + // Get device memory for this virtual device + Memory* dstMemory = dev().getGpuMemory(pinned); + + if (!gpuMem(srcMemory).partialMemCopyTo( + gpu(), srcPin, dst, copySizePin, *dstMemory)) { + LogWarning("DmaBlitManager::readBuffer failed a pinned copy!"); + gpu().addPinnedMem(pinned); + break; + } + gpu().addPinnedMem(pinned); + } + else { + LogWarning("DmaBlitManager::readBuffer failed to pin a resource!"); + break; + } + srcSize -= tmpSize; + offset += tmpSize; + tmpHost = reinterpret_cast(tmpHost) + tmpSize + partial; + } + } + + if (0 != srcSize) { + Memory& xferBuf0 = dev().xferRead().acquire(); + Memory& xferBuf1 = dev().xferRead().acquire(); + Memory* xferBuf[2] = { &xferBuf0, &xferBuf1 }; + + // Read memory using a staged resource + if (!readMemoryStaged(gpuMem(srcMemory), dstHost, xferBuf, origin[0], + offset, srcSize, srcSize)) { + LogError("DmaBlitManager::readBuffer failed!"); + return false; + } + + dev().xferRead().release(gpu(), xferBuf1); + dev().xferRead().release(gpu(), xferBuf0); + } + } + + return true; +} + +bool +DmaBlitManager::readBufferRect( + device::Memory& srcMemory, + void* dstHost, + const amd::BufferRect& bufRect, + const amd::BufferRect& hostRect, + const amd::Coord3D& size, + bool entire) const +{ + // Use host copy if memory has direct access + if (setup_.disableReadBufferRect_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + return HostBlitManager::readBufferRect( + srcMemory, dstHost, bufRect, hostRect, size, entire); + } + else { + Memory& xferBuf = dev().xferRead().acquire(); + + amd::Coord3D dst(0, 
0, 0); + size_t tmpSize = 0; + size_t bufOffset; + size_t hostOffset; + size_t srcSize; + + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + srcSize = size[0]; + bufOffset = bufRect.offset(0, y, z); + hostOffset = hostRect.offset(0, y, z); + + while (srcSize != 0) { + // Find the partial transfer size + tmpSize = std::min(dev().xferRead().bufSize(), srcSize); + + amd::Coord3D src(bufOffset, 0, 0); + amd::Coord3D copySize(tmpSize, 0, 0); + + // Copy data into the temporary surface + if (!gpuMem(srcMemory).partialMemCopyTo( + gpu(), src, dst, copySize, xferBuf, true)) { + LogError("DmaBlitManager::readBufferRect failed!"); + return false; + } + + if (!xferBuf.hostRead(&gpu(), + reinterpret_cast(dstHost) + hostOffset, + dst, copySize)) { + LogError("DmaBlitManager::readBufferRect failed!"); + return false; + } + + srcSize -= tmpSize; + bufOffset += tmpSize; + hostOffset += tmpSize; + } + } + } + dev().xferRead().release(gpu(), xferBuf); + } + + return true; +} + +bool +DmaBlitManager::readImage( + device::Memory& srcMemory, + void* dstHost, + const amd::Coord3D& origin, + const amd::Coord3D& size, + size_t rowPitch, + size_t slicePitch, + bool entire) const +{ + if (setup_.disableReadImage_) { + return HostBlitManager::readImage(srcMemory, dstHost, + origin, size, rowPitch, slicePitch, entire); + } + else { + //! 
@todo Add HW accelerated path + return HostBlitManager::readImage(srcMemory, dstHost, + origin, size, rowPitch, slicePitch, entire); + } + + return true; +} + +bool +DmaBlitManager::writeMemoryStaged( + const void* srcHost, + Memory& dstMemory, + Memory& xferBuf, + size_t origin, + size_t& offset, + size_t& totalSize, + size_t xferSize) const +{ + amd::Coord3D src(0, 0, 0); + size_t tmpSize; + size_t chunkSize; + + if (dev().xferRead().bufSize() < 128 * Ki) { + chunkSize = dev().xferRead().bufSize(); + } + else { + chunkSize = std::min(amd::alignUp(xferSize / 4, 256), + dev().xferRead().bufSize()); + chunkSize = std::max(chunkSize, 128 * Ki); + } + + while (xferSize != 0) { + // Find the partial transfer size + tmpSize = std::min(chunkSize, xferSize); + amd::Coord3D dst(origin + offset, 0, 0); + amd::Coord3D copySize(tmpSize, 0, 0); + + // Copy data into the temporary buffer, using CPU + if (!xferBuf.hostWrite(&gpu(), + reinterpret_cast(srcHost) + offset, + src, copySize, Resource::Discard)) { + return false; + } + + // Copy data into the original destination memory + if (!xferBuf.partialMemCopyTo( + gpu(), src, dst, copySize, dstMemory)) { + return false; + } + + totalSize -= tmpSize; + offset += tmpSize; + xferSize -= tmpSize; + } + return true; +} + +bool +DmaBlitManager::writeBuffer( + const void* srcHost, + device::Memory& dstMemory, + const amd::Coord3D& origin, + const amd::Coord3D& size, + bool entire) const +{ + // Use host copy if memory has direct access or it's persistent + if (setup_.disableWriteBuffer_ || + gpuMem(dstMemory).isHostMemDirectAccess() || + gpuMem(dstMemory).isPersistentDirectMap()) { + return HostBlitManager::writeBuffer( + srcHost, dstMemory, origin, size, entire); + } + else { + size_t dstSize = size[0]; + size_t tmpSize = 0; + size_t offset = 0; + size_t pinSize = dev().settings().pinnedXferSize_; + pinSize = std::min(pinSize, dstSize); + + // Check if a pinned transfer can be executed + if (pinSize && (dstSize > 
MinSizeForPinnedTransfer)) { + // Allign offset to 4K boundary (Vista/Win7 limitation) + char* tmpHost = const_cast( + amd::alignDown(reinterpret_cast(srcHost), + PinnedMemoryAlignment)); + + // Find the partial size for unaligned copy + size_t partial = reinterpret_cast(srcHost) - tmpHost; + + amd::Memory* pinned = NULL; + bool first = true; + size_t tmpSize; + size_t pinAllocSize; + + // Copy memory, using pinning + while (dstSize > 0) { + // If it's the first iterarion, then readjust the copy size + // to include alignment + if (first) { + pinAllocSize = amd::alignUp(pinSize + partial, + PinnedMemoryAlignment); + tmpSize = std::min(pinAllocSize - partial, dstSize); + first = false; + } + else { + tmpSize = std::min(pinSize, dstSize); + pinAllocSize = amd::alignUp(tmpSize, PinnedMemoryAlignment); + partial = 0; + } + amd::Coord3D src(partial, 0, 0); + amd::Coord3D dstPin(origin[0] + offset, 0, 0); + amd::Coord3D copySizePin(tmpSize, 0, 0); + size_t partial2; + + // Allocate a GPU resource for pinning + pinned = pinHostMemory(tmpHost, pinAllocSize, partial2); + + if (pinned != NULL) { + // Get device memory for this virtual device + Memory* srcMemory = dev().getGpuMemory(pinned); + + if (!srcMemory->partialMemCopyTo( + gpu(), src, dstPin, copySizePin, gpuMem(dstMemory))) { + LogWarning("DmaBlitManager::writeBuffer failed a pinned copy!"); + gpu().addPinnedMem(pinned); + break; + } + gpu().addPinnedMem(pinned); + } + else { + LogWarning("DmaBlitManager::writeBuffer failed to pin a resource!"); + break; + } + dstSize -= tmpSize; + offset += tmpSize; + tmpHost = reinterpret_cast(tmpHost) + tmpSize + partial; + } + } + + if (dstSize != 0) { + Memory& xferBuf = dev().xferWrite().acquire(); + + // Write memory using a staged resource + if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0], + offset, dstSize, dstSize)) { + LogError("DmaBlitManager::writeBuffer failed!"); + return false; + } + + gpu().addXferWrite(xferBuf); + } + } + + return true; +} + 
+bool +DmaBlitManager::writeBufferRect( + const void* srcHost, + device::Memory& dstMemory, + const amd::BufferRect& hostRect, + const amd::BufferRect& bufRect, + const amd::Coord3D& size, + bool entire) const +{ + // Use host copy if memory has direct access or it's persistent + if (setup_.disableWriteBufferRect_ || + dstMemory.isHostMemDirectAccess() || + gpuMem(dstMemory).isPersistentDirectMap()) { + return HostBlitManager::writeBufferRect( + srcHost, dstMemory, hostRect, bufRect, size, entire); + } + else { + Memory& xferBuf = dev().xferWrite().acquire(); + + amd::Coord3D src(0, 0, 0); + size_t tmpSize = 0; + size_t bufOffset; + size_t hostOffset; + size_t dstSize; + + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + dstSize = size[0]; + bufOffset = bufRect.offset(0, y, z); + hostOffset = hostRect.offset(0, y, z); + + while (dstSize != 0) { + // Find the partial transfer size + tmpSize = std::min(dev().xferWrite().bufSize(), dstSize); + + amd::Coord3D dst(bufOffset, 0, 0); + amd::Coord3D copySize(tmpSize, 0, 0); + + // Copy data into the temporary buffer, using CPU + if (!xferBuf.hostWrite(&gpu(), + reinterpret_cast(srcHost) + hostOffset, + src, copySize, Resource::Discard)) { + LogError("DmaBlitManager::writeBufferRect failed!"); + return false; + } + + // Copy data into the original destination memory + if (!xferBuf.partialMemCopyTo( + gpu(), src, dst, copySize, gpuMem(dstMemory))) { + LogError("DmaBlitManager::writeBufferRect failed!"); + return false; + } + + dstSize -= tmpSize; + bufOffset += tmpSize; + hostOffset += tmpSize; + } + } + } + gpu().addXferWrite(xferBuf); + } + + return true; +} + +bool +DmaBlitManager::writeImage( + const void* srcHost, + device::Memory& dstMemory, + const amd::Coord3D& origin, + const amd::Coord3D& size, + size_t rowPitch, + size_t slicePitch, + bool entire) const +{ + if (setup_.disableWriteImage_) { + return HostBlitManager::writeImage( + srcHost, dstMemory, origin, size, rowPitch, 
slicePitch, entire); + } + else { + //! @todo Add HW accelerated path + return HostBlitManager::writeImage( + srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); + } + + return true; +} + +bool +DmaBlitManager::copyBuffer( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool entire) const +{ + if (setup_.disableCopyBuffer_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() && + !dev().settings().apuSystem_ && + gpuMem(dstMemory).isHostMemDirectAccess())) { + return HostBlitManager::copyBuffer( + srcMemory, dstMemory, srcOrigin, dstOrigin, size); + } + else { + return gpuMem(srcMemory).partialMemCopyTo(gpu(), + srcOrigin, dstOrigin, size, gpuMem(dstMemory)); + } + + return true; +} + +bool +DmaBlitManager::copyBufferRect( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::BufferRect& srcRect, + const amd::BufferRect& dstRect, + const amd::Coord3D& size, + bool entire) const +{ + if (setup_.disableCopyBufferRect_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable() && + gpuMem(dstMemory).isHostMemDirectAccess())) { + return HostBlitManager::copyBufferRect( + srcMemory, dstMemory, srcRect, dstRect, size, entire); + } + else { + size_t srcOffset; + size_t dstOffset; + + uint bytesPerElement = 16; + bool optimalElementSize = false; + bool subWindowRectCopy = true; + + srcOffset = srcRect.offset(0, 0, 0); + dstOffset = dstRect.offset(0, 0, 0); + + while (bytesPerElement >= 1) { + if (((srcOffset % 4) == 0) && + ((dstOffset % 4) == 0) && + ((size[0] % bytesPerElement) == 0) && + ((srcRect.rowPitch_ % bytesPerElement) == 0) && + ((srcRect.slicePitch_ % bytesPerElement) == 0) && + ((dstRect.rowPitch_ % bytesPerElement) == 0) && + ((dstRect.slicePitch_ % bytesPerElement) == 0)) { + optimalElementSize = true; + break; + } + bytesPerElement = bytesPerElement >> 1; + } + + // 19 
bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits) + size_t pitchLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x7FFFF; + size_t sizeLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x3FFF; + + if (!optimalElementSize || + (srcRect.rowPitch_ > pitchLimit) || + (dstRect.rowPitch_ > pitchLimit) || + (size[0] > sizeLimit) || // See above + (size[1] > 0x3fff) || // 14 bits limit in HW + (size[2] > 0x7ff)) { // 11 bits limit in HW + // Restriction with rectLinearDRMDMA packet + subWindowRectCopy = false; + } + + if (subWindowRectCopy) { + // Copy data with subwindow copy packet + if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), + amd::Coord3D(srcOffset, srcRect.rowPitch_, srcRect.slicePitch_), + amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_), + size, gpuMem(dstMemory), true, false, bytesPerElement)) { + LogError("copyBufferRect failed!"); + return false; + } + } + else { + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + srcOffset = srcRect.offset(0, y, z); + dstOffset = dstRect.offset(0, y, z); + + amd::Coord3D src(srcOffset, 0, 0); + amd::Coord3D dst(dstOffset, 0, 0); + amd::Coord3D copySize(size[0], 0, 0); + + // Copy data + if (!gpuMem(srcMemory).partialMemCopyTo( + gpu(), src, dst, copySize, gpuMem(dstMemory))) { + LogError("copyBufferRect failed!"); + return false; + } + } + } + } + } + return true; +} + +bool +DmaBlitManager::copyImageToBuffer( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool entire, + size_t rowPitch, + size_t slicePitch) const +{ + bool result = false; + + if (setup_.disableCopyImageToBuffer_) { + result = HostBlitManager::copyImageToBuffer(srcMemory, dstMemory, + srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); + } + else { + // Use PAL path for a transfer + result = 
gpuMem(srcMemory).partialMemCopyTo( + gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory)); + + // Check if a HostBlit transfer is required + if (completeOperation_ && !result) { + result = HostBlitManager::copyImageToBuffer(srcMemory, + dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); + } + } + + return result; +} + +bool +DmaBlitManager::copyBufferToImage( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool entire, + size_t rowPitch, + size_t slicePitch) const +{ + bool result = false; + + if (setup_.disableCopyBufferToImage_) { + result = HostBlitManager::copyBufferToImage(srcMemory, + dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); + } + else { + // Use PAL path for a transfer + result = gpuMem(srcMemory).partialMemCopyTo( + gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory)); + + // Check if a HostBlit transfer is required + if (completeOperation_ && !result) { + result = HostBlitManager::copyBufferToImage(srcMemory, + dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); + } + } + + return result; +} + +bool +DmaBlitManager::copyImage( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool entire) const +{ + bool result = false; + + if (setup_.disableCopyImage_) { + return HostBlitManager::copyImage(srcMemory, dstMemory, + srcOrigin, dstOrigin, size, entire); + } + else { + //! 
@todo Add HW accelerated path + return HostBlitManager::copyImage(srcMemory, dstMemory, + srcOrigin, dstOrigin, size, entire); + } + + return result; +} + +KernelBlitManager::KernelBlitManager( + VirtualGPU& gpu, Setup setup) + : DmaBlitManager(gpu, setup) + , program_(NULL) + , constantBuffer_(NULL) + , xferBufferSize_(0) + , lockXferOps_(NULL) +{ + for (uint i = 0; i < BlitTotal; ++i) { + kernels_[i] = NULL; + } + + for (uint i = 0; i < MaxXferBuffers; ++i) { + xferBuffers_[i] = NULL; + } + + completeOperation_ = false; +} + +KernelBlitManager::~KernelBlitManager() +{ + for (uint i = 0; i < BlitTotal; ++i) { + if (NULL != kernels_[i]) { + kernels_[i]->release(); + } + } + if (NULL != program_) { + program_->release(); + } + + if (NULL != context_) { + // Release a dummy context + context_->release(); + } + + if (NULL != constantBuffer_) { + constantBuffer_->release(); + } + + for (uint i = 0; i < MaxXferBuffers; ++i) { + if (NULL != xferBuffers_[i]) { + xferBuffers_[i]->release(); + } + } + + delete lockXferOps_; +} + +bool +KernelBlitManager::create(amd::Device& device) +{ + if (!createProgram(static_cast(device))) { + return false; + } + return true; +} + +bool +KernelBlitManager::createProgram(Device& device) +{ + std::vector devices; + devices.push_back(&device); + + // Save context and program for this device + context_ = device.blitProgram()->context_; + context_->retain(); + program_ = device.blitProgram()->program_; + program_->retain(); + + bool result = false; + do { + // Create kernel objects for all blits + for (uint i = 0; i < BlitTotal; ++i) { + const amd::Symbol* symbol = program_->findSymbol(BlitName[i]); + if (symbol == NULL) { + break; + } + kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]); + if (kernels_[i] == NULL) { + break; + } + // Validate blit kernels for the scratch memory usage (pre SI) + if (!device.validateKernel(*kernels_[i], &gpu())) { + break; + } + } + + result = true; + } while(!result); + + // Create an internal 
constant buffer + constantBuffer_ = new (*context_) + amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki); + + if ((constantBuffer_ != NULL) && !constantBuffer_->create(NULL)) { + constantBuffer_->release(); + constantBuffer_ = NULL; + return false; + } + else if (constantBuffer_ == NULL) { + return false; + } + + // Assign the constant buffer to the current virtual GPU + constantBuffer_->setVirtualDevice(&gpu()); + + if (dev().settings().xferBufSize_ > 0) { + xferBufferSize_ = dev().settings().xferBufSize_; + for (uint i = 0; i < MaxXferBuffers; ++i) { + // Create internal xfer buffers for image copy optimization + xferBuffers_[i] = new (*context_) + amd::Buffer(*context_, 0, xferBufferSize_); + + if ((xferBuffers_[i] != NULL) && !xferBuffers_[i]->create(NULL)) { + xferBuffers_[i]->release(); + xferBuffers_[i] = NULL; + return false; + } + else if (xferBuffers_[i] == NULL) { + return false; + } + + // Assign the xfer buffer to the current virtual GPU + xferBuffers_[i]->setVirtualDevice(&gpu()); + //! @note Workaround for conformance allocation test. + //! Force GPU mem alloc. + //! Unaligned images require xfer optimization, + //! but deferred memory allocation can cause + //! virtual heap fragmentation for big allocations and + //! then fail the following test with 32 bit ISA, because + //! runtime runs out of 4GB space. + dev().getGpuMemory(xferBuffers_[i]); + } + } + + lockXferOps_ = new amd::Monitor("Transfer Ops Lock", true); + if (NULL == lockXferOps_) { + return false; + } + + return result; +} + +// The following data structures will be used for the view creations. 
+// Some formats has to be converted before a kernel blit operation +struct FormatConvertion { + cl_uint clOldType_; + cl_uint clNewType_; +}; + +// The list of rejected data formats and corresponding conversion +static const FormatConvertion RejectedData[] = +{ + { CL_UNORM_INT8, CL_UNSIGNED_INT8 }, + { CL_UNORM_INT16, CL_UNSIGNED_INT16 }, + { CL_SNORM_INT8, CL_UNSIGNED_INT8 }, + { CL_SNORM_INT16, CL_UNSIGNED_INT16 }, + { CL_HALF_FLOAT, CL_UNSIGNED_INT16 }, + { CL_FLOAT, CL_UNSIGNED_INT32 }, + { CL_SIGNED_INT8, CL_UNSIGNED_INT8 }, + { CL_SIGNED_INT16, CL_UNSIGNED_INT16 }, + { CL_UNORM_INT_101010, CL_UNSIGNED_INT8 }, + { CL_SIGNED_INT32, CL_UNSIGNED_INT32 } +}; + +// The list of rejected channel's order and corresponding conversion +static const FormatConvertion RejectedOrder[] = +{ + { CL_A, CL_R }, + { CL_RA, CL_RG }, + { CL_LUMINANCE, CL_R }, + { CL_INTENSITY, CL_R }, + { CL_RGB, CL_RGBA }, + { CL_BGRA, CL_RGBA }, + { CL_ARGB, CL_RGBA }, + { CL_sRGB, CL_RGBA }, + { CL_sRGBx, CL_RGBA }, + { CL_sRGBA, CL_RGBA }, + { CL_sBGRA, CL_RGBA } +}; + +const uint RejectedFormatDataTotal = + sizeof(RejectedData) / sizeof(FormatConvertion); +const uint RejectedFormatChannelTotal = + sizeof(RejectedOrder) / sizeof(FormatConvertion); + +bool +KernelBlitManager::copyBufferToImage( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool entire, + size_t rowPitch, + size_t slicePitch) const +{ + amd::ScopedLock k(lockXferOps_); + bool result = false; + static const bool CopyRect = false; + // Flush DMA for ASYNC copy + static const bool FlushDMA = true; + + if (setup_.disableCopyBufferToImage_) { + result = DmaBlitManager::copyBufferToImage( + srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + synchronize(); + return result; + } + // Check if buffer is in system memory with direct access + else if (gpuMem(srcMemory).isHostMemDirectAccess() && + 
(rowPitch == 0) && (slicePitch == 0)) { + // First attempt to do this all with DMA, + // but there are restriciton with older hardware + if (dev().settings().imageDMA_) { + result = DmaBlitManager::copyBufferToImage( + srcMemory, dstMemory, srcOrigin, dstOrigin, size, + entire, rowPitch, slicePitch); + if (result) { + synchronize(); + return result; + } + } + + if (!setup_.disableCopyBufferToImageOpt_) { + // Find the overall copy size + size_t copySize = size[0] * size[1] * size[2] * gpuMem(dstMemory).elementSize(); + + // Check if double copy was requested + if (xferBufferSize_ != 0) { + amd::Coord3D src(srcOrigin); + amd::Coord3D xferSrc(0, 0, 0); + amd::Coord3D dst(dstOrigin); + amd::Coord3D xferRect(size); + // Find transfer size in pixels + size_t xferSizePix = xferBufferSize_ / gpuMem(dstMemory).elementSize(); + bool transfer = true; + + // Find transfer rectangle + if (xferRect[0] > xferSizePix) { + // The algorithm can't break a line. + // It requires multiple rectangles tracking + transfer = false; + } + else { + xferRect.c[1] = xferSizePix / xferRect[0]; + } + // Check if we exceeded the original size boundary in Y + if (xferRect[1] > size[1]) { + xferRect.c[1] = size[1]; + xferRect.c[2] = xferSizePix / (xferRect[0] * xferRect[1]); + } + else { + xferRect.c[2] = 1; + } + // Check if we exceeded the original size boundary in Z + if (xferRect[2] > size[2]) { + xferRect.c[2] = size[2]; + } + // Make sure size in Y dimension is divided by the rectangle size + if (size[2] > 1) { + while ((size[1] % xferRect[1]) != 0) { + xferRect.c[1]--; + } + } + + // Find one step copy size, based on the copy rectange + amd::Coord3D oneStepSize( + xferRect[0] * xferRect[1] * xferRect[2] * gpuMem(dstMemory).elementSize()); + + // Initialize transfer buffer array + Memory* xferBuf[MaxXferBuffers]; + for (uint i = 0; i < MaxXferBuffers; ++i) { + xferBuf[i] = dev().getGpuMemory(xferBuffers_[i]); + if (xferBuf[i] == NULL) { + transfer = false; + break; + } + } + + // Loop until 
we transfer all data + while (transfer && (copySize > 0)) { + size_t copySizeTmp = copySize; + amd::Coord3D srcTmp(src); + amd::Coord3D oneStepSizeTmp(oneStepSize); + // Step 1. Initiate DRM transfer with all staging buffers + for (uint i = 0; i < MaxXferBuffers; ++i) { + // Make sure we don't transfer more than copy size + if (copySizeTmp > 0) { + if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), srcTmp, + xferSrc, oneStepSizeTmp, *xferBuf[i], CopyRect, FlushDMA)) { + transfer = false; + break; + } + + copySizeTmp -= oneStepSizeTmp[0]; + // Change buffer offset + srcTmp.c[0] += oneStepSizeTmp[0]; + + if (copySizeTmp < oneStepSizeTmp[0]) { + oneStepSizeTmp.c[0] = copySizeTmp; + } + } + else { + break; + } + } + + // Step 2. Initiate compute transfer with all staging buffers + for (uint i = 0; i < MaxXferBuffers; ++i) { + if (copySize > 0) { + if (!copyBufferToImageKernel( + *xferBuf[i], dstMemory, + xferSrc, dst, xferRect, false)) { + transfer = false; + break; + } + gpu().flushDMA(MainEngine); + + copySize -= oneStepSize[0]; + // Change buffer offset + src.c[0] += oneStepSize[0]; + // Change image offset, ignore X offset + for (uint j = 1; j < 3; ++j) { + dst.c[j] += xferRect[j]; + if ((dst[j] - dstOrigin[j]) >= size[j]) { + dst.c[j] = dstOrigin[j]; + } + else { + break; + } + } + // Recalculate rectangle size if the remain data is smaller + if (copySize < oneStepSize[0]) { + for (uint j = 0; j < 3; ++j) { + xferRect.c[j] = size[j] - (dst[j] - dstOrigin[j]); + } + oneStepSize.c[0] = copySize; + } + } + else { + break; + } + } + } + + if (copySize == 0) { + result = true; + } + else { + LogWarning("2 step transfer in copyBufferToImage failed"); + } + } + } + } + + if (!result) { + result = copyBufferToImageKernel(srcMemory, + dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); + } + + synchronize(); + + return result; +} + +void +CalcRowSlicePitches( + cl_ulong* pitch, const cl_int* copySize, + size_t rowPitch, size_t slicePitch, const Memory& mem) 
+{ + uint32_t memFmtSize = mem.elementSize(); + bool img1Darray = (mem.desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? true : false; + + if (rowPitch == 0) { + pitch[0] = copySize[0]; + } + else { + pitch[0] = rowPitch / memFmtSize; + } + if (slicePitch == 0) { + pitch[1] = pitch[0] * (img1Darray ? 1 : copySize[1]); + } + else { + pitch[1] = slicePitch / memFmtSize; + } + assert((pitch[0] <= pitch[1]) && "rowPitch must be <= slicePitch"); + + if (img1Darray) { + // For a 1D array the row pitch equals the slice pitch + pitch[0] = pitch[1]; + } +} + +static void +setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value) +{ + const amd::KernelParameterDescriptor& desc = kernel->signature().at(index); + + void* param = kernel->parameters().values() + desc.offset_; + assert((desc.type_ == T_POINTER || value != NULL || desc.size_ == 0) && + "not a valid local mem arg"); + + uint32_t uint32_value = 0; + uint64_t uint64_value = 0; + + if (desc.type_ == T_POINTER && desc.size_ != 0) { + if ((value == NULL) || (static_cast(value) == NULL)) { + LP64_SWITCH(uint32_value, uint64_value) = 0; + } + else { + // value points at a memory object pointer; pass that pointer as the kernel argument + LP64_SWITCH(uint32_value, uint64_value) = + (uintptr_t)(*static_cast(value)); + } + } + else if (desc.type_ == T_SAMPLER) { + assert(false && "No sampler support in blit manager! 
Use internal samplers!"); + } + else switch (desc.size_) { + case 1: uint32_value = *static_cast(value); break; + case 2: uint32_value = *static_cast(value); break; + case 4: uint32_value = *static_cast(value); break; + case 8: uint64_value = *static_cast(value); break; + default: break; + } + + switch (desc.size_) { + case 0 /*local mem*/ : *static_cast(param) = size; break; + case sizeof(uint32_t): *static_cast(param) = uint32_value; break; + case sizeof(uint64_t): *static_cast(param) = uint64_value; break; + default: ::memcpy(param, value, size); break; + } +} + +bool +KernelBlitManager::copyBufferToImageKernel( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool entire, + size_t rowPitch, + size_t slicePitch) const +{ + bool rejected = false; + Memory* dstView = &gpuMem(dstMemory); + bool releaseView = false; + bool result = false; + amd::Image::Format newFormat(gpuMem(dstMemory).desc().format_); + + // Find unsupported formats + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; + } + } + + // Find unsupported channel's order + for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + break; + } + } + + // If the image format was rejected, then attempt to create a view + if (rejected) { + dstView = createView(gpuMem(dstMemory), newFormat); + if (dstView != NULL) { + rejected = false; + releaseView = true; + } + } + + // Fall into the host path if the image format was rejected + if (rejected) { + return HostBlitManager::copyBufferToImage( + srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); + } + + // Use a common blit type with 
three dimensions by default + uint blitType = BlitCopyBufferToImage; + size_t dim = 0; + size_t globalWorkOffset[3] = { 0, 0, 0 }; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + + // Program the kernels workload depending on the blit dimensions + dim = 3; + if (gpuMem(dstMemory).desc().dimSize_ == 1) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } + else if (gpuMem(dstMemory).desc().dimSize_ == 2) { + globalWorkSize[0] = amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } + else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // Program kernels arguments for the blit operation + Memory* mem = &gpuMem(srcMemory); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = dstView; + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + uint32_t memFmtSize = gpuMem(dstMemory).elementSize(); + uint32_t components = gpuMem(dstMemory).numComponents(); + + // 1 element granularity for writes by default + cl_int granularity = 1; + if (memFmtSize == 2) { + granularity = 2; + } + else if (memFmtSize >= 4) { + granularity = 4; + } + CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); + cl_ulong srcOrg[4] = { srcOrigin[0] / granularity, + srcOrigin[1], + srcOrigin[2], 0 }; + setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg); + + cl_int dstOrg[4] = { (cl_int)dstOrigin[0], + (cl_int)dstOrigin[1], + (cl_int)dstOrigin[2], 0 }; + cl_int copySize[4] = { (cl_int)size[0], + (cl_int)size[1], + (cl_int)size[2], 0 }; + + 
setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg); + setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); + + // Program memory format + uint multiplier = memFmtSize / sizeof(uint32_t); + multiplier = (multiplier == 0) ? 1 : multiplier; + cl_uint format[4] = { components, + memFmtSize / components, + multiplier, 0 }; + setArgument(kernels_[blitType], 5, sizeof(format), format); + + // Program row and slice pitches + cl_ulong pitch[4] = { 0 }; + CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(dstMemory)); + setArgument(kernels_[blitType], 6, sizeof(pitch), pitch); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, + globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[blitType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); + if (releaseView) { + delete dstView; + } + + return result; +} + +bool +KernelBlitManager::copyImageToBuffer( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool entire, + size_t rowPitch, + size_t slicePitch) const +{ + amd::ScopedLock k(lockXferOps_); + bool result = false; + static const bool CopyRect = false; + // Flush DMA for ASYNC copy + static const bool FlushDMA = true; + + if (setup_.disableCopyImageToBuffer_) { + result = HostBlitManager::copyImageToBuffer( + srcMemory, dstMemory, srcOrigin, dstOrigin, + size, entire, rowPitch, slicePitch); + synchronize(); + return result; + } + // Check if buffer is in system memory with direct access + else if (gpuMem(dstMemory).isHostMemDirectAccess() && + (rowPitch == 0) && (slicePitch == 0)) { + // First attempt to do this all with DMA, + // but there are restriciton with older hardware + if (dev().settings().imageDMA_) { + result = DmaBlitManager::copyImageToBuffer( + srcMemory, dstMemory, srcOrigin, 
dstOrigin, + size, entire, rowPitch, slicePitch); + if (result) { + synchronize(); + return result; + } + } + + // Find the overall copy size + size_t copySize = size[0] * size[1] * size[2] * gpuMem(srcMemory).elementSize(); + + // Check if double copy was requested + if (xferBufferSize_ != 0) { + amd::Coord3D src(srcOrigin); + amd::Coord3D dst(dstOrigin); + amd::Coord3D xferDst(0, 0, 0); + amd::Coord3D xferRect(size); + // Find transfer size in pixels + size_t xferSizePix = xferBufferSize_ / gpuMem(srcMemory).elementSize(); + bool transfer = true; + + // Find transfer rectangle + if (xferRect[0] > xferSizePix) { + // The algorithm can't break a line. + // It requires multiple rectangles tracking + transfer = false; + } + else { + xferRect.c[1] = xferSizePix / xferRect[0]; + } + // Check if we exceeded the original size boundary in Y + if (xferRect[1] > size[1]) { + xferRect.c[1] = size[1]; + xferRect.c[2] = xferSizePix / (xferRect[0] * xferRect[1]); + } + else { + xferRect.c[2] = 1; + } + // Check if we exceeded the original size boundary in Z + if (xferRect[2] > size[2]) { + xferRect.c[2] = size[2]; + } + // Make sure size in Y dimension is divided by the rectangle size + if (size[2] > 1) { + while ((size[1] % xferRect[1]) != 0) { + xferRect.c[1]--; + } + } + + // Find one step copy size, based on the copy rectange + amd::Coord3D oneStepSize( + xferRect[0] * xferRect[1] * xferRect[2] * gpuMem(srcMemory).elementSize()); + + // Initialize transfer buffer array + Memory* xferBuf[MaxXferBuffers]; + for (uint i = 0; i < MaxXferBuffers; ++i) { + xferBuf[i] = dev().getGpuMemory(xferBuffers_[i]); + if (xferBuf[i] == NULL) { + transfer = false; + break; + } + } + + // Loop until we transfer all data + while (transfer && (copySize > 0)) { + size_t copySizeTmp = copySize; + amd::Coord3D srcTmp(src); + amd::Coord3D oneStepSizeTmp(oneStepSize); + amd::Coord3D xferRectTmp(xferRect); + + // Step 1. 
Initiate compute transfer with all staging buffers + for (uint i = 0; i < MaxXferBuffers; ++i) { + if (copySizeTmp > 0) { + if (!copyImageToBufferKernel( + srcMemory, *xferBuf[i], + srcTmp, xferDst, xferRectTmp, false)) { + transfer = false; + break; + } + gpu().flushDMA(MainEngine); + + copySizeTmp -= oneStepSizeTmp[0]; + // Change image offset, ignore X offset + for (uint j = 1; j < 3; ++j) { + srcTmp.c[j] += xferRectTmp[j]; + if ((srcTmp[j] - srcOrigin[j]) >= size[j]) { + srcTmp.c[j] = srcOrigin[j]; + } + else { + break; + } + } + // Recalculate rectangle size if the remaining data is smaller + if (copySizeTmp < oneStepSizeTmp[0]) { + for (uint j = 0; j < 3; ++j) { + xferRectTmp.c[j] = size[j] - (srcTmp[j] - srcOrigin[j]); + } + } + } + else { + break; + } + } + + // Step 2. Initiate DRM transfer with all staging buffers + for (uint i = 0; i < MaxXferBuffers; ++i) { + // Make sure we don't transfer more than copy size + if (copySize > 0) { + if (!xferBuf[i]->partialMemCopyTo(gpu(), xferDst, dst, + oneStepSize, gpuMem(dstMemory), CopyRect, FlushDMA)) { + transfer = false; + break; + } + + copySize -= oneStepSize[0]; + // Change buffer offset + dst.c[0] += oneStepSize[0]; + // Change image offset, ignore X offset + for (uint j = 1; j < 3; ++j) { + src.c[j] += xferRect[j]; + if ((src[j] - srcOrigin[j]) >= size[j]) { + src.c[j] = srcOrigin[j]; + } + else { + break; + } + } + // Recalculate rectangle size if the remaining data is smaller + if (copySize < oneStepSize[0]) { + for (uint j = 0; j < 3; ++j) { + xferRect.c[j] = size[j] - (src[j] - srcOrigin[j]); + } + oneStepSize.c[0] = copySize; + } + } + else { + break; + } + } + } + + if (copySize == 0) { + result = true; + } + else { + LogWarning("2 step transfer in copyImageToBuffer failed"); + } + } + } + + if (!result) { + result = copyImageToBufferKernel(srcMemory, + dstMemory, srcOrigin, dstOrigin, size, entire, rowPitch, slicePitch); + } + + synchronize(); + + return result; +} + +bool 
+KernelBlitManager::copyImageToBufferKernel( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool entire, + size_t rowPitch, + size_t slicePitch) const +{ + bool rejected = false; + Memory* srcView = &gpuMem(srcMemory); + bool releaseView = false; + bool result = false; + amd::Image::Format newFormat(gpuMem(srcMemory).desc().format_); + + // Find unsupported formats + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; + } + } + + // Find unsupported channel's order + for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + break; + } + } + + // If the image format was rejected, then attempt to create a view + if (rejected) { + srcView = createView(gpuMem(srcMemory), newFormat); + if (srcView != NULL) { + rejected = false; + releaseView = true; + } + } + + // Fall into the host path if the image format was rejected + if (rejected) { + return HostBlitManager::copyImageToBuffer( + srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); + } + + uint blitType = BlitCopyImageToBuffer; + size_t dim = 0; + size_t globalWorkOffset[3] = { 0, 0, 0 }; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + + // Program the kernels workload depending on the blit dimensions + dim = 3; + // Find the current blit type + if (gpuMem(srcMemory).desc().dimSize_ == 1) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } + else if (gpuMem(srcMemory).desc().dimSize_ == 2) { + globalWorkSize[0] = 
amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } + else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // Program the kernel's arguments for the blit operation + Memory* mem = srcView; + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = &gpuMem(dstMemory); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + + // Update extra parameters for USHORT and UBYTE pointers. + // Only then can the compiler optimize the kernel to use + // UAV Raw for other writes + setArgument(kernels_[blitType], 2, sizeof(cl_mem), &mem); + setArgument(kernels_[blitType], 3, sizeof(cl_mem), &mem); + + cl_int srcOrg[4] = { (cl_int)srcOrigin[0], + (cl_int)srcOrigin[1], + (cl_int)srcOrigin[2], 0 }; + cl_int copySize[4] = { (cl_int)size[0], + (cl_int)size[1], + (cl_int)size[2], 0 }; + setArgument(kernels_[blitType], 4, sizeof(srcOrg), srcOrg); + uint32_t memFmtSize = gpuMem(srcMemory).elementSize(); + uint32_t components = gpuMem(srcMemory).numComponents(); + + // 1 element granularity for writes by default + cl_int granularity = 1; + if (memFmtSize == 2) { + granularity = 2; + } + else if (memFmtSize >= 4) { + granularity = 4; + } + CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); + cl_ulong dstOrg[4] = { dstOrigin[0] / granularity, + dstOrigin[1], + dstOrigin[2], 0 }; + setArgument(kernels_[blitType], 5, sizeof(dstOrg), dstOrg); + setArgument(kernels_[blitType], 6, sizeof(copySize), copySize); + + // Program memory format + uint multiplier = memFmtSize / sizeof(uint32_t); + multiplier = (multiplier == 0) ? 
1 : multiplier; + cl_uint format[4] = { components, + memFmtSize / components, + multiplier, 0 }; + setArgument(kernels_[blitType], 7, sizeof(format), format); + + // Program row and slice pitches + cl_ulong pitch[4] = { 0 }; + CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, gpuMem(srcMemory)); + setArgument(kernels_[blitType], 8, sizeof(pitch), pitch); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, + globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[blitType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); + if (releaseView) { + delete srcView; + } + + return result; +} + +bool +KernelBlitManager::copyImage( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + bool entire) const +{ + amd::ScopedLock k(lockXferOps_); + bool rejected = false; + Memory* srcView = &gpuMem(srcMemory); + Memory* dstView = &gpuMem(dstMemory); + bool releaseView = false; + bool result = false; + amd::Image::Format newFormat(gpuMem(srcMemory).desc().format_); + + // Find unsupported formats + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; + } + } + + // Search for the rejected channel's order only if the format was rejected + // Note: Image blit is independent from the channel order + if (rejected) { + for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + break; + } + } + } + + // Attempt to create a view if the format was rejected + if (rejected) { + srcView = createView(gpuMem(srcMemory), 
newFormat); + if (srcView != NULL) { + dstView = createView(gpuMem(dstMemory), newFormat); + if (dstView != NULL) { + rejected = false; + releaseView = true; + } + else { + delete srcView; + } + } + } + + // Fall into the host path for the entire 2D copy or + // if the image format was rejected + if (rejected) { + result = HostBlitManager::copyImage(srcMemory, dstMemory, + srcOrigin, dstOrigin, size, entire); + synchronize(); + return result; + } + + uint blitType = BlitCopyImage; + size_t dim = 0; + size_t globalWorkOffset[3] = { 0, 0, 0 }; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + + // Program the kernels workload depending on the blit dimensions + dim = 3; + // Find the current blit type + if ((gpuMem(srcMemory).desc().dimSize_ == 1) || + (gpuMem(dstMemory).desc().dimSize_ == 1)) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } + else if ((gpuMem(srcMemory).desc().dimSize_ == 2) || + (gpuMem(dstMemory).desc().dimSize_ == 2)) { + globalWorkSize[0] = amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } + else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // The current OpenCL spec allows "copy images from a 1D image + // array object to a 1D image array object" only. 
+ if ((gpuMem(srcMemory).desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) || + (gpuMem(dstMemory).desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY)) { + blitType = BlitCopyImage1DA; + } + + // Program the kernel's arguments for the blit operation + Memory* mem = srcView; + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = dstView; + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + + // Program source origin + cl_int srcOrg[4] = { (cl_int)srcOrigin[0], + (cl_int)srcOrigin[1], + (cl_int)srcOrigin[2], 0 }; + setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg); + + // Program destination origin + cl_int dstOrg[4] = { (cl_int)dstOrigin[0], + (cl_int)dstOrigin[1], + (cl_int)dstOrigin[2], 0 }; + setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg); + + cl_int copySize[4] = { (cl_int)size[0], + (cl_int)size[1], + (cl_int)size[2], 0 }; + setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, + globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[blitType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); + if (releaseView) { + delete srcView; + delete dstView; + } + + synchronize(); + + return result; +} + +void +FindPinSize( + size_t& pinSize, const amd::Coord3D& size, + size_t& rowPitch, size_t& slicePitch, const Memory& mem) +{ + pinSize = size[0] * mem.elementSize(); + if ((rowPitch == 0) || (rowPitch == pinSize)) { + rowPitch = 0; + } + else { + pinSize = rowPitch; + } + + // Calculate the pin size, which should be equal to the copy size + for (uint i = 1; i < mem.desc().dimSize_; ++i) { + pinSize *= size[i]; + if (i == 1) { + if ((slicePitch == 0) || (slicePitch == pinSize)) { + slicePitch = 0; + } + else { + if (mem.desc().topology_ != CL_MEM_OBJECT_IMAGE1D_ARRAY) { + pinSize = slicePitch; + } + else { + pinSize = slicePitch * 
size[i]; + } + } + } + } +} + +bool +KernelBlitManager::readImage( + device::Memory& srcMemory, + void* dstHost, + const amd::Coord3D& origin, + const amd::Coord3D& size, + size_t rowPitch, + size_t slicePitch, + bool entire) const +{ + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access or it's persistent + if (setup_.disableReadImage_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + result = HostBlitManager::readImage(srcMemory, dstHost, + origin, size, rowPitch, slicePitch, entire); + synchronize(); + return result; + } + else { + size_t pinSize; + FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(srcMemory)); + + size_t partial; + amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); + + if (amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::readImage(srcMemory, dstHost, + origin, size, rowPitch, slicePitch, entire); + synchronize(); + return result; + } + + // Readjust destination offset + const amd::Coord3D dstOrigin(partial); + + // Get device memory for this virtual device + Memory* dstMemory = dev().getGpuMemory(amdMemory); + + // Copy image to buffer + result = copyImageToBuffer(srcMemory, *dstMemory, + origin, dstOrigin, size, entire, rowPitch, slicePitch); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + + synchronize(); + + return result; +} + +bool +KernelBlitManager::writeImage( + const void* srcHost, + device::Memory& dstMemory, + const amd::Coord3D& origin, + const amd::Coord3D& size, + size_t rowPitch, + size_t slicePitch, + bool entire) const +{ + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access or it's persistent + if (setup_.disableWriteImage_|| + gpuMem(dstMemory).isHostMemDirectAccess() || + gpuMem(dstMemory).isPersistentDirectMap()) { + result = HostBlitManager::writeImage( + srcHost, dstMemory, origin, size, rowPitch, 
slicePitch, entire); + synchronize(); + return result; + } + else { + size_t pinSize; + FindPinSize(pinSize, size, rowPitch, slicePitch, gpuMem(dstMemory)); + + size_t partial; + amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); + + if (amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::writeImage( + srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); + synchronize(); + return result; + } + + // Readjust source offset inside the pinned host memory + const amd::Coord3D srcOrigin(partial); + + // Get device memory for this virtual device + Memory* srcMemory = dev().getGpuMemory(amdMemory); + + // Copy buffer to image + result = copyBufferToImage(*srcMemory, dstMemory, + srcOrigin, origin, size, entire, rowPitch, slicePitch); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + + synchronize(); + + return result; +} + +bool +KernelBlitManager::copyBufferRect( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::BufferRect& srcRectIn, + const amd::BufferRect& dstRectIn, + const amd::Coord3D& sizeIn, + bool entire) const +{ + amd::ScopedLock k(lockXferOps_); + bool result = false; + bool rejected = false; + + // Try the DMA path first if the kernel copy is disabled or either side is direct-access host memory + if (setup_.disableCopyBufferRect_ || + gpuMem(srcMemory).isHostMemDirectAccess() || gpuMem(dstMemory).isHostMemDirectAccess()) { + result = DmaBlitManager::copyBufferRect(srcMemory, dstMemory, + srcRectIn, dstRectIn, sizeIn, entire); + + if (result) { + synchronize(); + return result; + } + } + + uint blitType = BlitCopyBufferRect; + size_t dim = 3; + size_t globalWorkOffset[3] = { 0, 0, 0 }; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + + const static uint CopyRectAlignment[3] = { 16, 4, 1 }; + + bool aligned; + uint i; + for (i = 0; i < sizeof(CopyRectAlignment) / sizeof(uint); i++) { + // Check source alignments + aligned = ((srcRectIn.rowPitch_ % CopyRectAlignment[i]) == 0); + aligned &= ((srcRectIn.slicePitch_ % 
CopyRectAlignment[i]) == 0); + aligned &= ((srcRectIn.start_ % CopyRectAlignment[i]) == 0); + + // Check destination alignments + aligned &= ((dstRectIn.rowPitch_ % CopyRectAlignment[i]) == 0); + aligned &= ((dstRectIn.slicePitch_ % CopyRectAlignment[i]) == 0); + aligned &= ((dstRectIn.start_ % CopyRectAlignment[i]) == 0); + + // Check copy size alignment in the first dimension + aligned &= ((sizeIn[0] % CopyRectAlignment[i]) == 0); + + if (aligned) { + if (CopyRectAlignment[i] != 1) { + blitType = BlitCopyBufferRectAligned; + } + break; + } + } + + amd::BufferRect srcRect; + amd::BufferRect dstRect; + amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); + + srcRect.rowPitch_ = srcRectIn.rowPitch_ / CopyRectAlignment[i]; + srcRect.slicePitch_ = srcRectIn.slicePitch_ / CopyRectAlignment[i]; + srcRect.start_ = srcRectIn.start_ / CopyRectAlignment[i]; + srcRect.end_ = srcRectIn.end_ / CopyRectAlignment[i]; + + dstRect.rowPitch_ = dstRectIn.rowPitch_ / CopyRectAlignment[i]; + dstRect.slicePitch_ = dstRectIn.slicePitch_ / CopyRectAlignment[i]; + dstRect.start_ = dstRectIn.start_ / CopyRectAlignment[i]; + dstRect.end_ = dstRectIn.end_ / CopyRectAlignment[i]; + + size.c[0] /= CopyRectAlignment[i]; + + // Program the kernel's workload depending on the transfer dimensions + if ((size[1] == 1) && (size[2] == 1)) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = 1; + globalWorkSize[2] = 1; + localWorkSize[0] = 256; + localWorkSize[1] = 1; + localWorkSize[2] = 1; + } + else if (size[2] == 1) { + globalWorkSize[0] = amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = 1; + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } + else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + + // Program kernels arguments for the blit 
operation + Memory* mem = &gpuMem(srcMemory); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = &gpuMem(dstMemory); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + cl_ulong src[4] = { srcRect.rowPitch_, + srcRect.slicePitch_, + srcRect.start_, 0 }; + setArgument(kernels_[blitType], 2, sizeof(src), src); + cl_ulong dst[4] = { dstRect.rowPitch_, + dstRect.slicePitch_, + dstRect.start_, 0 }; + setArgument(kernels_[blitType], 3, sizeof(dst), dst); + cl_ulong copySize[4] = { size[0], size[1], size[2], CopyRectAlignment[i] }; + setArgument(kernels_[blitType], 4, sizeof(copySize), copySize); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, + globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[blitType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); + + synchronize(); + + return result; +} + +bool +KernelBlitManager::readBuffer( + device::Memory& srcMemory, + void* dstHost, + const amd::Coord3D& origin, + const amd::Coord3D& size, + bool entire) const +{ + amd::ScopedLock k(lockXferOps_); + bool result = false; + // Use host copy if memory has direct access + if (setup_.disableReadBuffer_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + result = HostBlitManager::readBuffer( + srcMemory, dstHost, origin, size, entire); + synchronize(); + return result; + } + else { + size_t pinSize = size[0]; + // Check if a pinned transfer can be executed with a single pin + if ((pinSize <= dev().settings().pinnedXferSize_) && + (pinSize > MinSizeForPinnedTransfer)) { + size_t partial; + amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); + + if (amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::readBuffer( + srcMemory, dstHost, origin, size, entire); + synchronize(); + return result; + } + + // Readjust host mem offset + 
amd::Coord3D dstOrigin(partial); + + // Get device memory for this virtual device + Memory* dstMemory = dev().getGpuMemory(amdMemory); + + // Copy buffer to the pinned host memory + result = copyBuffer(srcMemory, *dstMemory, + origin, dstOrigin, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + else { + result = DmaBlitManager::readBuffer( + srcMemory, dstHost, origin, size, entire); + } + } + + synchronize(); + + return result; +} + +bool +KernelBlitManager::readBufferRect( + device::Memory& srcMemory, + void* dstHost, + const amd::BufferRect& bufRect, + const amd::BufferRect& hostRect, + const amd::Coord3D& size, + bool entire) const +{ + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if it's disabled in setup or memory has cacheable direct access + if (setup_.disableReadBufferRect_ || + (gpuMem(srcMemory).isHostMemDirectAccess() && gpuMem(srcMemory).isCacheable())) { + result = HostBlitManager::readBufferRect( + srcMemory, dstHost, bufRect, hostRect, size, entire); + synchronize(); + return result; + } + else { + size_t pinSize = hostRect.start_ + hostRect.end_; + size_t partial; + amd::Memory* amdMemory = pinHostMemory(dstHost, pinSize, partial); + + if (amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::readBufferRect( + srcMemory, dstHost, bufRect, hostRect, size, entire); + synchronize(); + return result; + } + + // Readjust host mem offset + amd::BufferRect rect; + rect.rowPitch_ = hostRect.rowPitch_; + rect.slicePitch_ = hostRect.slicePitch_; + rect.start_ = hostRect.start_ + partial; + rect.end_ = hostRect.end_; + + // Get device memory for this virtual device + Memory* dstMemory = dev().getGpuMemory(amdMemory); + + // Copy buffer rect to the pinned host memory + result = copyBufferRect(srcMemory, *dstMemory, + bufRect, rect, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + + synchronize(); + + return result; +} + +bool +KernelBlitManager::writeBuffer( + const void* srcHost, + 
device::Memory& dstMemory, + const amd::Coord3D& origin, + const amd::Coord3D& size, + bool entire) const +{ + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access or it's persistent + if (setup_.disableWriteBuffer_ || + gpuMem(dstMemory).isHostMemDirectAccess() || + (gpuMem(dstMemory).memoryType() == Resource::Persistent)) { + result = HostBlitManager::writeBuffer( + srcHost, dstMemory, origin, size, entire); + synchronize(); + return result; + } + else { + size_t pinSize = size[0]; + + // Check if a pinned transfer can be executed with a single pin + if ((pinSize <= dev().settings().pinnedXferSize_) && + (pinSize > MinSizeForPinnedTransfer)) { + size_t partial; + amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); + + if (amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::writeBuffer( + srcHost, dstMemory, origin, size, entire); + synchronize(); + return result; + } + + // Readjust source offset inside the pinned host memory + const amd::Coord3D srcOrigin(partial); + + // Get device memory for this virtual device + Memory* srcMemory = dev().getGpuMemory(amdMemory); + + // Copy buffer from the pinned host memory + result = copyBuffer(*srcMemory, dstMemory, + srcOrigin, origin, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + else { + result = DmaBlitManager::writeBuffer( + srcHost, dstMemory, origin, size, entire); + } + } + + synchronize(); + + return result; +} + +bool +KernelBlitManager::writeBufferRect( + const void* srcHost, + device::Memory& dstMemory, + const amd::BufferRect& hostRect, + const amd::BufferRect& bufRect, + const amd::Coord3D& size, + bool entire) const +{ + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host copy if memory has direct access or it's persistent + if (setup_.disableWriteBufferRect_ || + gpuMem(dstMemory).isHostMemDirectAccess() || + gpuMem(dstMemory).isPersistentDirectMap()) { + result = HostBlitManager::writeBufferRect( + 
srcHost, dstMemory, hostRect, bufRect, size, entire); + synchronize(); + return result; + } + else { + size_t pinSize = hostRect.start_ + hostRect.end_; + size_t partial; + amd::Memory* amdMemory = pinHostMemory(srcHost, pinSize, partial); + + if (amdMemory == NULL) { + // Force SW copy + result = HostBlitManager::writeBufferRect( + srcHost, dstMemory, hostRect, bufRect, size, entire); + synchronize(); + return result; + } + + // Readjust destination offset + const amd::Coord3D srcOrigin(partial); + + // Get device memory for this virtual device + Memory* srcMemory = dev().getGpuMemory(amdMemory); + + // Readjust host mem offset + amd::BufferRect rect; + rect.rowPitch_ = hostRect.rowPitch_; + rect.slicePitch_ = hostRect.slicePitch_; + rect.start_ = hostRect.start_ + partial; + rect.end_ = hostRect.end_; + + // Copy buffer rect + result = copyBufferRect(*srcMemory, dstMemory, + rect, bufRect, size, entire); + + // Add pinned memory for a later release + gpu().addPinnedMem(amdMemory); + } + + synchronize(); + + return result; +} + +bool +KernelBlitManager::fillBuffer( + device::Memory& memory, + const void* pattern, + size_t patternSize, + const amd::Coord3D& origin, + const amd::Coord3D& size, + bool entire + ) const +{ + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host fill if memory has direct access + if (setup_.disableFillBuffer_ || + gpuMem(memory).isHostMemDirectAccess()) { + result = HostBlitManager::fillBuffer( + memory, pattern, patternSize, origin, size, entire); + synchronize(); + return result; + } + else { + uint fillType = FillBuffer; + size_t globalWorkOffset[3] = { 0, 0, 0 }; + cl_ulong fillSize = size[0] / patternSize; + size_t globalWorkSize = amd::alignUp(fillSize, 256); + size_t localWorkSize = 256; + bool dwordAligned = + ((patternSize % sizeof(uint32_t)) == 0) ? 
true : false; + + // Program kernels arguments for the fill operation + Memory* mem = &gpuMem(memory); + if (dwordAligned) { + setArgument(kernels_[fillType], 0, sizeof(cl_mem), NULL); + setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem); + } + else { + setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); + setArgument(kernels_[fillType], 1, sizeof(cl_mem), NULL); + } + Memory* gpuCB = dev().getGpuMemory(constantBuffer_); + if (gpuCB == NULL) { + return false; + } + void* constBuf = gpuCB->map(&gpu(), Resource::WriteOnly); + memcpy(constBuf, pattern, patternSize); + gpuCB->unmap(&gpu()); + setArgument(kernels_[fillType], 2, sizeof(cl_mem), &gpuCB); + cl_ulong offset = origin[0]; + if (dwordAligned) { + patternSize /= sizeof(uint32_t); + offset /= sizeof(uint32_t); + } + setArgument(kernels_[fillType], 3, sizeof(cl_uint), &patternSize); + setArgument(kernels_[fillType], 4, sizeof(offset), &offset); + setArgument(kernels_[fillType], 5, sizeof(fillSize), &fillSize); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(1, + globalWorkOffset, &globalWorkSize, &localWorkSize); + + // Execute the blit + address parameters = kernels_[fillType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters); + } + + synchronize(); + + return result; +} + +bool +KernelBlitManager::copyBuffer( + device::Memory& srcMemory, + device::Memory& dstMemory, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& sizeIn, + bool entire) const +{ + amd::ScopedLock k(lockXferOps_); + bool result = false; + + if (!gpuMem(srcMemory).isHostMemDirectAccess() && + !gpuMem(dstMemory).isHostMemDirectAccess()) { + uint blitType = BlitCopyBuffer; + size_t dim = 1; + size_t globalWorkOffset[3] = { 0, 0, 0 }; + size_t globalWorkSize = 0; + size_t localWorkSize = 0; + + const static uint CopyBuffAlignment[3] = { 16, 4, 1 }; + amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); + + 
bool aligned; + uint i; + for (i = 0; i < sizeof(CopyBuffAlignment) / sizeof(uint); i++) { + // Check source alignments + aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0); + // Check destination alignments + aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0); + // Check copy size alignment in the first dimension + aligned &= ((sizeIn[0] % CopyBuffAlignment[i]) == 0); + + if (aligned) { + if (CopyBuffAlignment[i] != 1) { + blitType = BlitCopyBufferAligned; + } + break; + } + } + + cl_uint remain; + if (blitType == BlitCopyBufferAligned) { + size.c[0] /= CopyBuffAlignment[i]; + } + else { + if (dev().settings().ciPlus_) { + remain = size[0] % 4; + size.c[0] /= 4; + size.c[0] += 1; + } + else { + // Check if offsets are aligned + aligned = ((srcOrigin[0] % sizeof(uint32_t)) == 0); + aligned &= ((dstOrigin[0] % sizeof(uint32_t)) == 0); + if (aligned) { + remain = size[0] % 4; + size.c[0] /= 4; + size.c[0] += 1; + } + else { + remain = 8; + } + } + } + + // Program the dispatch dimensions + localWorkSize = 256; + globalWorkSize = amd::alignUp(size[0] , 256); + + // Program kernels arguments for the blit operation + Memory* mem = &gpuMem(srcMemory); + setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem); + mem = &gpuMem(dstMemory); + setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem); + // Program source origin + cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i];; + setArgument(kernels_[blitType], 2, sizeof(srcOffset), &srcOffset); + + // Program destinaiton origin + cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i];; + setArgument(kernels_[blitType], 3, sizeof(dstOffset), &dstOffset); + + cl_ulong copySize = size[0]; + setArgument(kernels_[blitType], 4, sizeof(copySize), ©Size); + + if (blitType == BlitCopyBufferAligned) { + cl_int alignment = CopyBuffAlignment[i]; + setArgument(kernels_[blitType], 5, sizeof(alignment), &alignment); + } + else { + setArgument(kernels_[blitType], 5, sizeof(remain), &remain); + } + + // Create ND range 
object for the kernel's execution + amd::NDRangeContainer ndrange(1, + globalWorkOffset, &globalWorkSize, &localWorkSize); + + // Execute the blit + address parameters = kernels_[blitType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters); + } + else { + result = DmaBlitManager::copyBuffer( + srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire); + } + + synchronize(); + + return result; +} + +bool +KernelBlitManager::fillImage( + device::Memory& memory, + const void* pattern, + const amd::Coord3D& origin, + const amd::Coord3D& size, + bool entire + ) const +{ + amd::ScopedLock k(lockXferOps_); + bool result = false; + + // Use host fill if memory has direct access + if (setup_.disableFillImage_ || + gpuMem(memory).isHostMemDirectAccess()) { + result = HostBlitManager::fillImage( + memory, pattern, origin, size, entire); + synchronize(); + return result; + } + + uint fillType; + size_t dim = 0; + size_t globalWorkOffset[3] = { 0, 0, 0 }; + size_t globalWorkSize[3]; + size_t localWorkSize[3]; + Memory* memView = &gpuMem(memory); + amd::Image::Format newFormat(gpuMem(memory).owner()->asImage()->getImageFormat()); + + // Program the kernels workload depending on the fill dimensions + fillType = FillImage; + dim = 3; + + void *newpattern = const_cast(pattern); + cl_uint4 iFillColor; + + bool rejected = false; + bool releaseView = false; + // For depth, we need to create a view + if ((memView->desc().format_.image_channel_order == CL_DEPTH) || + (memView->desc().format_.image_channel_order == CL_sRGBA)) { + // Find unsupported data type + for (uint i = 0; i < RejectedFormatDataTotal; ++i) { + if (RejectedData[i].clOldType_ == newFormat.image_channel_data_type) { + newFormat.image_channel_data_type = RejectedData[i].clNewType_; + rejected = true; + break; + } + } + + // Below may not be correct. We need to find why unsigned int view doesn't work for DEPTH16. 
+ if ((gpuMem(memory).desc().format_.image_channel_order == CL_DEPTH) && + (gpuMem(memory).desc().format_.image_channel_data_type == CL_UNSIGNED_INT16)) { + newFormat.image_channel_data_type = CL_UNORM_INT16; + } + + if (gpuMem(memory).desc().format_.image_channel_order == CL_sRGBA) { + // Converting a linear RGB floating-point color value to a 8-bit unsigned integer sRGB value because hw is not support write_imagef for sRGB. + float *fColor = static_cast(newpattern); + iFillColor.s[0] = sRGBmap(fColor[0]); + iFillColor.s[1] = sRGBmap(fColor[1]); + iFillColor.s[2] = sRGBmap(fColor[2]); + iFillColor.s[3] = (cl_uint)(fColor[3]*255.0f); + newpattern = static_cast(&iFillColor); + for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { + if (RejectedOrder[i].clOldType_ == newFormat.image_channel_order) { + newFormat.image_channel_order = RejectedOrder[i].clNewType_; + rejected = true; + break; + } + } + } + } + // If the image format was rejected, then attempt to create a view + if (rejected) { + memView = createView(gpuMem(memory), newFormat); + if (memView != NULL) { + rejected = false; + releaseView = true; + } + } + + // Find the current blit type + if (memView->desc().dimSize_ == 1) { + globalWorkSize[0] = amd::alignUp(size[0], 256); + globalWorkSize[1] = amd::alignUp(size[1], 1); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = 256; + localWorkSize[1] = localWorkSize[2] = 1; + } + else if (memView->desc().dimSize_ == 2) { + globalWorkSize[0] = amd::alignUp(size[0], 16); + globalWorkSize[1] = amd::alignUp(size[1], 16); + globalWorkSize[2] = amd::alignUp(size[2], 1); + localWorkSize[0] = localWorkSize[1] = 16; + localWorkSize[2] = 1; + } + else { + globalWorkSize[0] = amd::alignUp(size[0], 8); + globalWorkSize[1] = amd::alignUp(size[1], 8); + globalWorkSize[2] = amd::alignUp(size[2], 4); + localWorkSize[0] = localWorkSize[1] = 8; + localWorkSize[2] = 4; + } + + // Program kernels arguments for the blit operation + Memory* mem = memView; + 
setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem); + setArgument(kernels_[fillType], 1, sizeof(cl_float4), newpattern); + setArgument(kernels_[fillType], 2, sizeof(cl_int4), newpattern); + setArgument(kernels_[fillType], 3, sizeof(cl_uint4), newpattern); + + cl_int fillOrigin[4] = { (cl_int)origin[0], + (cl_int)origin[1], + (cl_int)origin[2], 0 }; + cl_int fillSize[4] = { (cl_int)size[0], + (cl_int)size[1], + (cl_int)size[2], 0 }; + setArgument(kernels_[fillType], 4, sizeof(fillOrigin), fillOrigin); + setArgument(kernels_[fillType], 5, sizeof(fillSize), fillSize); + + // Find the type of image + uint32_t type = 0; + switch (newFormat.image_channel_data_type) { + case CL_SNORM_INT8: + case CL_SNORM_INT16: + case CL_UNORM_INT8: + case CL_UNORM_INT16: + case CL_UNORM_SHORT_565: + case CL_UNORM_SHORT_555: + case CL_UNORM_INT_101010: + case CL_HALF_FLOAT: + case CL_FLOAT: + type = 0; + break; + case CL_SIGNED_INT8: + case CL_SIGNED_INT16: + case CL_SIGNED_INT32: + type = 1; + break; + case CL_UNSIGNED_INT8: + case CL_UNSIGNED_INT16: + case CL_UNSIGNED_INT32: + type = 2; + break; + } + setArgument(kernels_[fillType], 6, sizeof(type), &type); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(dim, + globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[fillType]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters); + if (releaseView) { + delete memView; + } + + synchronize(); + + return result; +} + +bool +KernelBlitManager::runScheduler( + device::Memory& vqueue, + device::Memory& params, + uint paramIdx, + uint threads + ) const +{ + amd::ScopedLock k(lockXferOps_); + bool result = false; + + size_t dim = 1; + size_t globalWorkOffset[1] = { 0 }; + size_t globalWorkSize[1] = { threads }; + size_t localWorkSize[1] = { 1 }; + + // Program kernels arguments + Memory* q = &gpuMem(vqueue); + Memory* p = &gpuMem(params); + 
setArgument(kernels_[Scheduler], 0, sizeof(cl_mem), &q); + setArgument(kernels_[Scheduler], 1, sizeof(cl_mem), &p); + setArgument(kernels_[Scheduler], 2, sizeof(uint), ¶mIdx); + + // Create ND range object for the kernel's execution + amd::NDRangeContainer ndrange(1, + globalWorkOffset, globalWorkSize, localWorkSize); + + // Execute the blit + address parameters = kernels_[Scheduler]->parameters().values(); + result = gpu().submitKernelInternal(ndrange, *kernels_[Scheduler], parameters); + + synchronize(); + + return result; +} + +amd::Memory* +DmaBlitManager::pinHostMemory( + const void* hostMem, + size_t pinSize, + size_t& partial) const +{ + size_t pinAllocSize; + const static bool SysMem = true; + amd::Memory* amdMemory; + + // Allign offset to 4K boundary (Vista/Win7 limitation) + char* tmpHost = const_cast( + amd::alignDown(reinterpret_cast(hostMem), + PinnedMemoryAlignment)); + + // Find the partial size for unaligned copy + partial = reinterpret_cast(hostMem) - tmpHost; + + // Recalculate pin memory size + pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment); + + amdMemory = gpu().findPinnedMem(tmpHost, pinAllocSize); + + if (NULL != amdMemory) { + return amdMemory; + } + + amdMemory = new(*context_) + amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, pinAllocSize); + + if ((amdMemory != NULL) && !amdMemory->create(tmpHost, SysMem)) { + amdMemory->release(); + return NULL; + } + + // Get device memory for this virtual device + // @note: This will force real memory pinning + amdMemory->setVirtualDevice(&gpu()); + Memory* srcMemory = dev().getGpuMemory(amdMemory); + + if (srcMemory == NULL) { + // Release all pinned memory and attempt pinning again + gpu().releasePinnedMem(); + srcMemory = dev().getGpuMemory(amdMemory); + if (srcMemory == NULL) { + // Release memory + amdMemory->release(); + amdMemory = NULL; + } + } + + return amdMemory; +} + +Memory* +KernelBlitManager::createView( + const Memory& parent, + const cl_image_format format +) const 
+{ + assert(!parent.desc().buffer_ && "View supports images only"); + Memory* gpuImage = NULL; + + gpuImage = new Image(dev(), parent.size(), + parent.desc().width_, + parent.desc().height_, + parent.desc().depth_, + format, + parent.desc().topology_, + 1); + + // Create resource + if (NULL != gpuImage) { + bool result = false; + Resource::ImageViewParams params; + const Memory& gpuMem = static_cast(parent); + + params.owner_ = parent.owner(); + params.level_ = 0; + params.layer_ = 0; + params.resource_ = &gpuMem; + params.memory_ = &gpuMem; + params.gpu_ = &gpu(); + + // Create memory object + result = gpuImage->create(Resource::ImageView, ¶ms); + if (!result) { + delete gpuImage; + return NULL; + } + } + + return gpuImage; +} + +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.hpp b/projects/clr/rocclr/runtime/device/pal/palblit.hpp new file mode 100644 index 0000000000..2a2915f753 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palblit.hpp @@ -0,0 +1,451 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. +// +#ifndef PALBLIT_HPP_ +#define PALBLIT_HPP_ + +#include "top.hpp" +#include "platform/command.hpp" +#include "device/pal/paldefs.hpp" +#include "device/device.hpp" +#include "device/blit.hpp" + +/*! \addtogroup PAL Blit Implementation + * @{ + */ + +//! PAL Blit Manager Implementation +namespace pal { + +class Device; +class Kernel; +class Memory; +class VirtualGPU; + +//! DMA Blit Manager +class DmaBlitManager : public device::HostBlitManager +{ +public: + //! Constructor + DmaBlitManager( + VirtualGPU& gpu, //!< Virtual GPU to be used for blits + Setup setup = Setup() //!< Specifies HW accelerated blits + ); + + //! Destructor + virtual ~DmaBlitManager() {} + + //! Creates DmaBlitManager object + virtual bool create(amd::Device& device) { return true; } + + //! 
Copies a buffer object to system memory + virtual bool readBuffer( + device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies a buffer object to system memory + virtual bool readBufferRect( + device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destinaiton host memory + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies an image object to system memory + virtual bool readImage( + device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies system memory to a buffer object + virtual bool writeBuffer( + const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! 
Copies system memory to a buffer object + virtual bool writeBufferRect( + const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies system memory to an image object + virtual bool writeImage( + const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies a buffer object to another buffer object + virtual bool copyBuffer( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies a buffer object to another buffer object + virtual bool copyBufferRect( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& srcRect, //!< Source rectangle + const amd::BufferRect& dstRect, //!< Destination rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! 
Copies an image object to a buffer object + virtual bool copyImageToBuffer( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; + + //! Copies a buffer object to an image object + virtual bool copyBufferToImage( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; + + //! Copies an image object to another image object + virtual bool copyImage( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + +protected: + const static uint MaxPinnedBuffers = 4; + + //! Synchronizes the blit operations if necessary + inline void synchronize() const; + + //! Returns the virtual GPU object + VirtualGPU& gpu() const { return static_cast(vDev_); } + + //! Returns the GPU device object + const Device& dev() const { return static_cast(dev_); }; + + inline Memory& gpuMem(device::Memory& mem) const; + + //! 
Pins host memory for GPU access + amd::Memory* pinHostMemory( + const void* hostMem, //!< Host memory pointer + size_t pinSize, //!< Host memory size + size_t& partial //!< Extra offset for memory alignment + ) const; + + const size_t MinSizeForPinnedTransfer; + bool completeOperation_; //!< DMA blit manager must complete operation + amd::Context* context_; //!< A dummy context + +private: + + //! Disable copy constructor + DmaBlitManager(const DmaBlitManager&); + + //! Disable operator= + DmaBlitManager& operator=(const DmaBlitManager&); + + //! Reads video memory, using a staged buffer + bool readMemoryStaged( + Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + Memory** xferBuf, //!< Staged buffer for read + size_t origin, //!< Original offset in the source memory + size_t& offset, //!< Offset for the current copy pointer + size_t& totalSize, //!< Total size for copy region + size_t xferSize //!< Transfer size + ) const; + + //! Write into video memory, using a staged buffer + bool writeMemoryStaged( + const void* srcHost, //!< Source host memory + Memory& dstMemory, //!< Destination memory object + Memory& xferBuf, //!< Staged buffer for write + size_t origin, //!< Original offset in the destination memory + size_t& offset, //!< Offset for the current copy pointer + size_t& totalSize, //!< Total size for the copy region + size_t xferSize //!< Transfer size + ) const; +}; + +//! Kernel Blit Manager +class KernelBlitManager : public DmaBlitManager +{ +public: + enum { + BlitCopyImage = 0, + BlitCopyImage1DA, + BlitCopyImageToBuffer, + BlitCopyBufferToImage, + BlitCopyBufferRect, + BlitCopyBufferRectAligned, + BlitCopyBuffer, + BlitCopyBufferAligned, + FillBuffer, + FillImage, + Scheduler, + BlitTotal + }; + + //! Constructor + KernelBlitManager( + VirtualGPU& gpu, //!< Virtual GPU to be used for blits + Setup setup = Setup() //!< Specifies HW accelerated blits + ); + + //! 
Destructor + virtual ~KernelBlitManager(); + + //! Creates DmaBlitManager object + virtual bool create(amd::Device& device); + + //! Copies a buffer object to another buffer object + virtual bool copyBufferRect( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& srcRectIn, //!< Source rectangle + const amd::BufferRect& dstRectIn, //!< Destination rectangle + const amd::Coord3D& sizeIn, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies a buffer object to system memory + virtual bool readBuffer( + device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies a buffer object to system memory + virtual bool readBufferRect( + device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destinaiton host memory + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies system memory to a buffer object + virtual bool writeBuffer( + const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! 
Copies system memory to a buffer object + virtual bool writeBufferRect( + const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::BufferRect& hostRect, //!< Destination rectangle + const amd::BufferRect& bufRect, //!< Source rectangle + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies a buffer object to an image object + virtual bool copyBuffer( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies a buffer object to an image object + virtual bool copyBufferToImage( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; + + //! Copies an image object to a buffer object + virtual bool copyImageToBuffer( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; + + //! 
Copies an image object to another image object + virtual bool copyImage( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies an image object to system memory + virtual bool readImage( + device::Memory& srcMemory, //!< Source memory object + void* dstHost, //!< Destination host memory + const amd::Coord3D& origin, //!< Source origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Copies system memory to an image object + virtual bool writeImage( + const void* srcHost, //!< Source host memory + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + size_t rowPitch, //!< Row pitch for host memory + size_t slicePitch, //!< Slice pitch for host memory + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Fills a buffer memory with a pattern data + virtual bool fillBuffer( + device::Memory& memory, //!< Memory object to fill with pattern + const void* pattern, //!< Pattern data + size_t patternSize, //!< Pattern size + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! 
Fills an image memory with a pattern data + virtual bool fillImage( + device::Memory& dstMemory, //!< Memory object to fill with pattern + const void* pattern, //!< Pattern data + const amd::Coord3D& origin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false //!< Entire buffer will be updated + ) const; + + //! Fills an image memory with a pattern data + virtual bool runScheduler( + device::Memory& vqueue, //!< Memory object for virtual queue + device::Memory& params, //!< Extra arguments for the scheduler + uint paramIdx, //!< Parameter index + uint threads //!< Number of scheduling threads + ) const; + +private: + static const size_t MaxXferBuffers = 2; + + //! Copies a buffer object to an image object + bool copyBufferToImageKernel( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; + + //! Copies an image object to a buffer object + bool copyImageToBufferKernel( + device::Memory& srcMemory, //!< Source memory object + device::Memory& dstMemory, //!< Destination memory object + const amd::Coord3D& srcOrigin, //!< Source origin + const amd::Coord3D& dstOrigin, //!< Destination origin + const amd::Coord3D& size, //!< Size of the copy region + bool entire = false, //!< Entire buffer will be updated + size_t rowPitch = 0, //!< Pitch for buffer + size_t slicePitch = 0 //!< Slice for buffer + ) const; + + //! Creates a program for all blit operations + bool createProgram( + Device& device //!< Device object + ); + + //! 
Creates a view memory object + Memory* createView( + const Memory& parent, //!< Parent memory object + const cl_image_format format //!< The new format for a view + ) const; + + //! Disable copy constructor + KernelBlitManager(const KernelBlitManager&); + + //! Disable operator= + KernelBlitManager& operator=(const KernelBlitManager&); + + amd::Program* program_; //!< GPU program obejct + amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit + amd::Memory* constantBuffer_; //!< An internal CB for blits + amd::Memory* xferBuffers_[MaxXferBuffers]; //!< Transfer buffers for images + size_t xferBufferSize_; //!< Transfer buffer size + amd::Monitor* lockXferOps_; //!< Lock transfer operation +}; + +static const char* BlitName[KernelBlitManager::BlitTotal] = { + "copyImage", + "copyImage1DA", + "copyImageToBuffer", + "copyBufferToImage", + "copyBufferRect", + "copyBufferRectAligned", + "copyBuffer", + "copyBufferAligned", + "fillBuffer", + "fillImage", + "scheduler", + }; + +/*@}*/} // namespace pal + +#endif /*PALBLIT_HPP_*/ diff --git a/projects/clr/rocclr/runtime/device/pal/palcompiler.cpp b/projects/clr/rocclr/runtime/device/pal/palcompiler.cpp new file mode 100644 index 0000000000..c7320ed45e --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palcompiler.cpp @@ -0,0 +1,147 @@ +// +// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. 
+//
+
+// NOTE(review): the angle-bracket header names of the five standard includes
+// in this file were lost in extraction. The headers below are restored from
+// the names this file uses (std::string, std::vector, std::fstream,
+// std::stringstream, strlen) -- confirm against upstream.
+#include <string>
+#include <vector>
+#include <fstream>
+#include <sstream>
+
+#include "os/os.hpp"
+#include "device/pal/paldevice.hpp"
+#include "device/pal/palprogram.hpp"
+#include "device/pal/palkernel.hpp"
+#include "utils/options.hpp"
+#include <cstring>
+
+//CLC_IN_PROCESS_CHANGE
+extern int openclFrontEnd(const char* cmdline, std::string*, std::string* typeInfo = nullptr);
+
+namespace pal {
+
+//! Compiles OpenCL C source to LLVM IR and stores the result in this program.
+//!
+//! Every entry of \p headers is written to a file named after the matching
+//! \p headerIncludeNames entry under the OS temp folder, and that folder is
+//! appended to the compile options as an include path.
+//!
+//! \return true on success; false on any failure (buildLog_ is updated for
+//!         the failures detected here)
+//
+// NOTE(review): the element type of \p headers was lost in extraction;
+// "const std::string*" is inferred from the headers[i]->c_str() use below and
+// must match the declaration in palprogram.hpp.
+bool
+HSAILProgram::compileImpl(
+    const std::string& sourceCode,
+    const std::vector<const std::string*>& headers,
+    const char** headerIncludeNames,
+    amd::option::Options* options)
+{
+    acl_error errorCode;
+    aclTargetInfo target;
+
+    std::string arch = "hsail";
+    if (dev().settings().use64BitPtr_) {
+        arch += "64";
+    }
+    target = aclGetTargetInfo(arch.c_str(),
+        dev().info().name_, &errorCode);
+    // The target description feeds aclBinaryInit() below, so do not continue
+    // with a target the compiler library rejected
+    if (errorCode != ACL_SUCCESS) {
+        buildLog_ += "Error: aclGetTargetInfo failure\n";
+        LogWarning("aclGetTargetInfo failed");
+        return false;
+    }
+
+    // We dump the source code for each program (param: headers)
+    // into their filenames (headerIncludeNames) into the TEMP
+    // folder specific to the OS and add the include path while
+    // compiling
+
+    // Find the temp folder for the OS
+    std::string tempFolder = amd::Os::getTempPath();
+    std::string tempFileName = amd::Os::getTempFileName();
+
+    // Iterate through each source code and dump it into tmp
+    std::fstream f;
+    std::vector<std::string> headerFileNames(headers.size());
+    std::vector<std::string> newDirs;
+    for (size_t i = 0; i < headers.size(); ++i) {
+        std::string headerPath = tempFolder;
+        std::string headerIncludeName(headerIncludeNames[i]);
+        // replace / in path with current os's file separator
+        if (amd::Os::fileSeparator() != '/') {
+            for (std::string::iterator it = headerIncludeName.begin(),
+                 end = headerIncludeName.end(); it != end; ++it) {
+                if (*it == '/') *it = amd::Os::fileSeparator();
+            }
+        }
+        size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator());
+        if (pos != std::string::npos) {
+            headerPath += amd::Os::fileSeparator();
+            headerPath += headerIncludeName.substr(0, pos);
+            headerIncludeName = headerIncludeName.substr(pos+1);
+        }
+        if (!amd::Os::pathExists(headerPath)) {
+            // Report the failure at run time: an assert() alone disappears
+            // in release builds and the compile would continue without the
+            // header
+            if (!amd::Os::createPath(headerPath)) {
+                buildLog_ += "Error: failed creating path for a header file\n";
+                LogWarning("failed creating path!");
+                return false;
+            }
+            newDirs.push_back(headerPath);
+        }
+        std::string headerFullName =
+            headerPath + amd::Os::fileSeparator() + headerIncludeName;
+        headerFileNames[i] = headerFullName;
+        f.open(headerFullName.c_str(), std::fstream::out);
+        if (f.fail()) {
+            buildLog_ += "Error: failed creating a header file\n";
+            LogWarning("failed creating header file!");
+            return false;
+        }
+        f.write(headers[i]->c_str(), headers[i]->length());
+        f.close();
+    }
+
+    // Create Binary
+    binaryElf_ = aclBinaryInit(sizeof(aclBinary),
+        &target, &binOpts_, &errorCode);
+    if (errorCode != ACL_SUCCESS) {
+        buildLog_ += "Error: aclBinary init failure\n";
+        LogWarning("aclBinaryInit failed");
+        return false;
+    }
+
+    // Insert opencl into binary
+    errorCode = aclInsertSection(dev().compiler(), binaryElf_,
+        sourceCode.c_str(), strlen(sourceCode.c_str()), aclSOURCE);
+    if (errorCode != ACL_SUCCESS) {
+        // Without the source section there is nothing to compile
+        buildLog_ += "Error: Inserting openCl Source to binary\n";
+        LogWarning("aclInsertSection failed");
+        return false;
+    }
+
+    // Set the options for the compiler
+    // Set the include path for the temp folder that contains the includes
+    if (!headers.empty()) {
+        compileOptions_.append(" -I");
+        compileOptions_.append(tempFolder);
+    }
+
+    //Add only for CL2.0 and above
+    if (options->oVariables->CLStd[2] >= '2') {
+        std::stringstream opts;
+        opts << " -D" << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE="
+            << device().info().maxGlobalVariableSize_;
+        compileOptions_.append(opts.str());
+    }
+
+#if !defined(_LP64) && defined(ATI_OS_LINUX)
+    if (options->origOptionStr.find("-cl-std=CL2.0") != std::string::npos && !dev().settings().force32BitOcl20_) {
+        errorCode = ACL_UNSUPPORTED;
+        LogWarning("aclCompile failed");
+        return false;
+    }
+#endif
+
+    // Compile source to IR
+    compileOptions_.append(hsailOptions());
+    errorCode = aclCompile(dev().compiler(), binaryElf_, compileOptions_.c_str(),
+        ACL_TYPE_OPENCL, ACL_TYPE_LLVMIR_BINARY, nullptr);
+    buildLog_ += aclGetCompilerLog(dev().compiler());
+    if (errorCode != ACL_SUCCESS) {
+        LogWarning("aclCompile failed");
+        buildLog_ += "Error: Compiling CL to IR\n";
+        return false;
+    }
+
+    clBinary()->storeCompileOptions(compileOptions_);
+    // Save the binary in the interface class
+    size_t size = 0;
+    void* mem = nullptr;
+    aclWriteToMem(binaryElf_, &mem, &size);
+    // mem stays nullptr if nothing was written; do not publish an empty binary
+    if (mem == nullptr) {
+        buildLog_ += "Error: Writing the binary to memory\n";
+        LogWarning("aclWriteToMem failed");
+        return false;
+    }
+    setBinary(static_cast<char*>(mem), size);
+
+    // Save the binary inside the program
+    // The HSAILProgram will be responsible to free it during destruction
+    rawBinary_ = mem;
+    return true;
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
new file mode 100644
index 0000000000..6e3ed49c10
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
@@ -0,0 +1,89 @@
+//
+// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "device/pal/palconstbuf.hpp"
+#include "device/pal/palvirtual.hpp"
+#include "device/pal/paldevice.hpp"
+#include "device/pal/palsettings.hpp"
+
+namespace pal {
+
+//! Records the owning queue and the buffer size in bytes (\p size is given in
+//! vectors of VectorSize bytes). No memory is allocated until create().
+//
+// NOTE(review): the const_cast target was lost in extraction; "Device&" is
+// inferred from the Memory base constructor call -- confirm.
+ConstBuffer::ConstBuffer(
+    VirtualGPU& gpu,
+    size_t      size)
+    : Memory(const_cast<Device&>(gpu.dev()), size * VectorSize)
+    , gpu_(gpu)
+    , sysMemCopy_(nullptr)  // the destructor releases it even without create()
+    , size_(size * VectorSize)
+    , wrtOffset_(0)
+    , lastWrtSize_(0)
+    , wrtAddress_(nullptr)
+{
+}
+
+//! Unmaps the HW buffer if it is still mapped and frees the sysmem copy
+ConstBuffer::~ConstBuffer()
+{
+    if (wrtAddress_ != nullptr) {
+        unmap(&gpu_);
+    }
+
+    if (sysMemCopy_ != nullptr) {
+        amd::AlignedMemory::deallocate(sysMemCopy_);
+    }
+}
+
+//! Allocates the zero-filled sysmem copy, creates the HW buffer and maps it
+//! for writing.
+//!
+//! \return false if the allocation, the HW buffer creation or the map fails
+bool
+ConstBuffer::create()
+{
+    // Create sysmem copy for the constant buffer
+    sysMemCopy_ = reinterpret_cast<address>
+        (amd::AlignedMemory::allocate(size_, 256));
+    if (sysMemCopy_ == nullptr) {
+        // size_ is a size_t; cast it to match the %d conversion
+        LogPrintfError("We couldn't allocate sysmem copy for constant buffer, "
+            "size(%d)!", static_cast<int>(size_));
+        return false;
+    }
+    memset(sysMemCopy_, 0, size_);
+
+    if (!Memory::create(Resource::RemoteUSWC)) {
+        LogPrintfError("We couldn't create HW constant buffer, size(%d)!",
+            static_cast<int>(size_));
+        return false;
+    }
+
+    // Constant buffer warm-up
+    warmUpRenames(gpu_);
+
+    wrtAddress_ = map(&gpu_, Resource::Discard);
+    if (wrtAddress_ == nullptr) {
+        LogPrintfError("We couldn't map HW constant buffer, size(%d)!",
+            static_cast<int>(size_));
+        return false;
+    }
+
+    return true;
+}
+
+//! Copies the first \p size bytes (rounded up to VectorSize) of the sysmem
+//! copy into the mapped HW buffer at the next write offset. When the copy
+//! does not fit behind the previous one, the buffer is remapped with
+//! Resource::Discard and writing restarts at offset 0.
+//!
+//! \return false if the copy is larger than the buffer or no mapping exists
+bool
+ConstBuffer::uploadDataToHw(size_t size)
+{
+    static const size_t HwCbAlignment = 256;
+
+    // Align copy size on the vector's boundary
+    size_t count = amd::alignUp(size, VectorSize);
+    // A copy larger than the whole buffer would overrun both the sysmem copy
+    // and the HW buffer, even after the write offset is reset
+    if (count > size_) {
+        LogPrintfError("Constant buffer upload(%d) exceeds the buffer size(%d)!",
+            static_cast<int>(count), static_cast<int>(size_));
+        return false;
+    }
+    wrtOffset_ += lastWrtSize_;
+
+    // Check if CB has enough space for copy
+    if ((wrtOffset_ + count) > size_) {
+        if (wrtAddress_ != nullptr) {
+            unmap(&gpu_);
+        }
+        wrtAddress_ = map(&gpu_, Resource::Discard);
+        wrtOffset_ = 0;
+        lastWrtSize_ = 0;
+    }
+
+    // The (re)map can fail; never copy through a null mapping
+    if (wrtAddress_ == nullptr) {
+        LogPrintfError("We couldn't map HW constant buffer, size(%d)!",
+            static_cast<int>(size_));
+        return false;
+    }
+
+    // Update memory with new CB data
+    memcpy((reinterpret_cast<char*>(wrtAddress_) + wrtOffset_), sysMemCopy_, count);
+
+    // Adjust the size by the HW CB buffer alignment
+    lastWrtSize_ = amd::alignUp(size, HwCbAlignment);
+    return true;
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp
new file mode 100644
index 0000000000..4d447b084d
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp
@@ -0,0 +1,70 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#ifndef PALCONSTBUF_HPP_
+#define PALCONSTBUF_HPP_
+
+#include "device/pal/palmemory.hpp"
+
+//! \namespace pal PAL Resource Implementation
+namespace pal {
+
+//! Constant buffer: a HW buffer paired with a system memory staging copy
+class ConstBuffer : public Memory
+{
+public:
+    //! Vector size of the constant buffer
+    static const size_t VectorSize = 16;
+
+    //! Constructor for the ConstBuffer class
+    ConstBuffer(
+        VirtualGPU& gpu,    //!< Virtual GPU device object
+        size_t      size    //!< size of the constant buffer in vectors
+        );
+
+    //! Destructor for the ConstBuffer class
+    ~ConstBuffer();
+
+    //! Creates the real HW constant buffer
+    bool create();
+
+    /*! \brief Uploads current constant buffer data from sysMemCopy_ to HW
+     *
+     *  \return True if the data upload was successful
+     */
+    bool uploadDataToHw(
+        size_t size     //!< real data size for upload
+        );
+
+    //! Returns a pointer to the system memory copy for CB
+    address sysMemCopy() const { return sysMemCopy_; }
+
+    //! Returns CB size
+    size_t size() const { return size_; }
+
+    //! Returns current write offset for the constant buffer
+    size_t wrtOffset() const { return wrtOffset_; }
+
+    //! Returns last write size for the constant buffer
+    size_t lastWrtSize() const { return lastWrtSize_; }
+
+private:
+    //! Disable copy constructor
+    ConstBuffer(const ConstBuffer&);
+
+    //! Disable operator=
+    ConstBuffer& operator=(const ConstBuffer&);
+
+    VirtualGPU& gpu_;           //!< Virtual GPU object
+    address     sysMemCopy_;    //!< System memory copy
+    size_t      size_;          //!< Constant buffer size
+    size_t      wrtOffset_;     //!< Current write offset
+    size_t      lastWrtSize_;   //!< Last write size
+    void*       wrtAddress_;    //!< Write address in CB
+};
+
+
+/*@}*/} // namespace pal
+
+#endif /*PALCONSTBUF_HPP_*/
diff --git a/projects/clr/rocclr/runtime/device/pal/palcounters.cpp b/projects/clr/rocclr/runtime/device/pal/palcounters.cpp
new file mode 100644
index 0000000000..4835f1a16e
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palcounters.cpp
@@ -0,0 +1,119 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "device/pal/paldefs.hpp"
+#include "device/pal/palcounters.hpp"
+#include "device/pal/palvirtual.hpp"
+
+namespace pal {
+
+//! Allocates a counter reference together with the PAL perf experiment it
+//! wraps.
+//!
+//! \return the new reference, or nullptr if PAL reports an error
+PalCounterReference*
+PalCounterReference::Create(
+    VirtualGPU&     gpu,
+    const Pal::PerfExperimentCreateInfo& createInfo)
+{
+    Pal::Result result;
+    size_t palExperSize = gpu.dev().iDev()->GetPerfExperimentSize(
+        createInfo, &result);
+    if (result != Pal::Result::Success) {
+        return nullptr;
+    }
+
+    // NOTE(review): presumably the placement form of operator new reserves
+    // palExperSize extra bytes behind the object, which &memRef[1] then hands
+    // to PAL as storage -- confirm against amd::ReferenceCountedObject.
+    PalCounterReference* memRef = new (palExperSize) PalCounterReference(gpu);
+    if (memRef != nullptr) {
+        result = gpu.dev().iDev()->CreatePerfExperiment(createInfo,
+            &memRef[1], &memRef->perfExp_);
+        if (result != Pal::Result::Success) {
+            memRef->release();
+            return nullptr;
+        }
+    }
+
+    return memRef;
+}
+
+PalCounterReference::~PalCounterReference() {
+    // The counter object is always associated with a particular queue,
+    // so we have to lock just this queue
+    amd::ScopedLock lock(gpu_.execution());
+    if (nullptr != iPerf()) {
+        iPerf()->Destroy();
+    }
+}
+
+//! Replaces the results array with a zero-filled one of (index + 1) entries.
+//! Values stored in the previous array are discarded.
+bool
+PalCounterReference::growResultArray(uint index) {
+    // delete[] accepts nullptr, so no guard is needed
+    delete [] results_;
+    // Value-initialize: PerfCounter::getInfo() reads an entry of this array,
+    // so it must not hold indeterminate values
+    results_ = new uint64_t [index + 1]();
+    if (results_ == nullptr) {
+        return false;
+    }
+    return true;
+}
+
+PerfCounter::~PerfCounter()
+{
+    if (calRef_ == nullptr) {
+        return;
+    }
+
+    // Release the counter reference object
+    calRef_->release();
+}
+
+//! Attaches this counter to \p calRef (taking a reference on it) and adds the
+//! configured block/instance/event to the PAL perf experiment.
+//!
+//! \return false if \p calRef is null, the results array cannot be grown or
+//!         PAL rejects the counter
+bool
+PerfCounter::create(
+    PalCounterReference*    calRef)
+{
+    if (calRef == nullptr) {
+        return false;
+    }
+    assert(&gpu() == &calRef->gpu());
+
+    calRef_ = calRef;
+    counter_ = calRef->iPerf();
+    index_ = calRef->retain() - 2;
+    // The reference taken above is dropped by the destructor through calRef_
+    if (!calRef->growResultArray(index_)) {
+        return false;
+    }
+
+    // Initialize the counter
+    Pal::PerfCounterInfo counterInfo = {};
+    counterInfo.counterType = Pal::PerfCounterType::Global;
+    // NOTE(review): the cast target was lost in extraction; Pal::GpuBlock is
+    // the expected type of PerfCounterInfo::block -- confirm.
+    counterInfo.block       = static_cast<Pal::GpuBlock>(info_.blockIndex_);
+    counterInfo.instance    = info_.counterIndex_;
+    counterInfo.eventId     = info_.eventIndex_;
+    Pal::Result result = counter_->AddCounter(counterInfo);
+    if (result != Pal::Result::Success) {
+        return false;
+    }
+
+    return true;
+}
+
+//! Returns the requested piece of counter information; 0 (after logging an
+//! error) for an unknown \p infoType.
+uint64_t
+PerfCounter::getInfo(uint64_t infoType) const
+{
+    switch (infoType) {
+    case CL_PERFCOUNTER_GPU_BLOCK_INDEX: {
+        // Return the GPU block index
+        return info()->blockIndex_;
+    }
+    case CL_PERFCOUNTER_GPU_COUNTER_INDEX: {
+        // Return the GPU counter index
+        return info()->counterIndex_;
+    }
+    case CL_PERFCOUNTER_GPU_EVENT_INDEX: {
+        // Return the GPU event index
+        return info()->eventIndex_;
+    }
+    case CL_PERFCOUNTER_DATA: {
+        // Reading the result back from PAL is not implemented yet; the value
+        // returned is whatever the results array currently holds
+        Unimplemented();
+        //gslCounter()->GetResult(gpu().cs(), reinterpret_cast(calRef_->results()));
+        return calRef_->results()[index_];
+    }
+    default:
+        LogError("Wrong PerfCounter::getInfo parameter");
+    }
+    return 0;
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palcounters.hpp b/projects/clr/rocclr/runtime/device/pal/palcounters.hpp
new file mode 100644
index 0000000000..9dc727f069
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palcounters.hpp
@@ -0,0 +1,152 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALCOUNTERS_HPP_
+#define PALCOUNTERS_HPP_
+
+#include "top.hpp"
+#include "device/device.hpp"
+#include "device/pal/paldevice.hpp"
+#include "palPerfExperiment.h"
+
+namespace pal {
+
+class VirtualGPU;
+
+//! Reference-counted owner of one PAL perf experiment and its results array
+class PalCounterReference : public amd::ReferenceCountedObject
+{
+public:
+    //! Creates the reference and its PAL perf experiment; nullptr on failure
+    static PalCounterReference* Create(
+        VirtualGPU&     gpu,
+        const Pal::PerfExperimentCreateInfo& createInfo);
+
+    //! Default constructor
+    PalCounterReference(
+        VirtualGPU&     gpu //!< Virtual GPU device object
+        )
+        : perfExp_(nullptr)
+        , gpu_(gpu)
+        , results_(nullptr) {}
+
+    //! Get PAL counter
+    Pal::IPerfExperiment* iPerf() const { return perfExp_; }
+
+    //! Returns the virtual GPU device
+    const VirtualGPU& gpu() const { return gpu_; }
+
+    //! Replaces the results array for this PAL counter(container)
+    bool growResultArray(
+        uint maxIndex           //!< the maximum HW counter index in the PAL counter
+        );
+
+    //! Finalizes the PAL experiment and queries its global counter layout
+    void finalize() {
+        iPerf()->Finalize();
+        Pal::GlobalCounterLayout layout = {};
+        layout.sampleCount = referenceCount() - 1;
+        iPerf()->GetGlobalCounterLayout(&layout); }
+
+    //! Returns the PAL counter results
+    uint64_t* results() const { return results_; }
+
+    Pal::IPerfExperiment*   perfExp_;   //!< PAL performance experiment object
+
+protected:
+    //! Default destructor
+    ~PalCounterReference();
+
+private:
+    //! Disable copy constructor
+    PalCounterReference(const PalCounterReference&);
+
+    //! Disable operator=
+    PalCounterReference& operator=(const PalCounterReference&);
+
+    VirtualGPU&     gpu_;       //!< The virtual GPU device object
+    uint64_t*       results_;   //!< Counter results
+};
+
+//! Performance counter implementation on GPU
+class PerfCounter : public device::PerfCounter
+{
+public:
+    //! The performance counter info
+    struct Info : public amd::EmbeddedObject
+    {
+        uint        blockIndex_;    //!< Index of the block to configure
+        uint        counterIndex_;  //!< Index of the hardware counter
+        uint        eventIndex_;    //!< Event you wish to count with the counter
+    };
+
+    //! The PerfCounter flags
+    enum Flags
+    {
+        BeginIssued     = 0x00000001,
+        EndIssued       = 0x00000002,
+        ResultReady     = 0x00000004
+    };
+
+    //! Constructor for the GPU PerfCounter object
+    PerfCounter(
+        const Device&       device,         //!< A GPU device object
+        const VirtualGPU&   gpu,            //!< Virtual GPU device object
+        cl_uint             blockIndex,     //!< HW block index
+        cl_uint             counterIndex,   //!< Counter index within the block
+        cl_uint             eventIndex)     //!< Event index for profiling
+        : gpuDevice_(device)
+        , gpu_(gpu)
+        , calRef_(nullptr)
+        , flags_(0)
+        , counter_(nullptr)
+        , index_(0)
+    {
+        info_.blockIndex_   = blockIndex;
+        info_.counterIndex_ = counterIndex;
+        info_.eventIndex_   = eventIndex;
+    }
+
+    //! Destructor for the GPU PerfCounter object
+    virtual ~PerfCounter();
+
+    //! Creates the current object
+    bool create(
+        PalCounterReference*    calRef      //!< Reference counter
+        );
+
+    //! Returns the specific information about the counter
+    uint64_t getInfo(
+        uint64_t    infoType    //!< The type of returned information
+        ) const;
+
+    //! Returns the GPU device, associated with the current object
+    const Device& dev() const { return gpuDevice_; }
+
+    //! Returns the virtual GPU device
+    const VirtualGPU& gpu() const { return gpu_; }
+
+    //! Returns the Info structure for performance counter
+    const Info* info() const { return &info_; }
+
+    //! Returns the PAL performance experiment this counter was added to
+    Pal::IPerfExperiment* iPerf() const { return counter_; }
+
+private:
+    //! Disable default copy constructor
+    PerfCounter(const PerfCounter&);
+
+    //! Disable default operator=
+    PerfCounter& operator=(const PerfCounter&);
+
+    const Device&       gpuDevice_; //!< The backend device
+    const VirtualGPU&   gpu_;       //!< The virtual GPU device object
+
+    PalCounterReference*    calRef_;    //!< Reference counter
+    uint                    flags_;     //!< The perfcounter object state
+    Info                    info_;      //!< The info structure for perfcounter
+    Pal::IPerfExperiment*   counter_;   //!< PAL performance experiment object
+    uint                    index_;     //!< Index of this counter in the results array
+};
+
+} // namespace pal
+
+#endif // PALCOUNTERS_HPP_
+
diff --git a/projects/clr/rocclr/runtime/device/pal/paldebugger.hpp b/projects/clr/rocclr/runtime/device/pal/paldebugger.hpp
new file mode 100644
index 0000000000..5a96b3f552
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/paldebugger.hpp
@@ -0,0 +1,121 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALDEBGGER_H_
+#define PALDEBGGER_H_
+
+// NOTE(review): the header names of the next two includes were lost in
+// extraction; <cstddef>/<cstdint> are restored from the size_t and uintN_t
+// uses in this header -- confirm against upstream.
+#include <cstddef>
+#include <cstdint>
+#include "hsa.h"
+#include "amd_hsa_kernel_code.h"
+#include "device/device.hpp"
+#include "device/hwdebug.hpp"
+#include "acl.h"
+
+static const int NumberReserveVgprs = 4;
+
+namespace pal {
+
+/**
+ * \defgroup Services_API OCL Runtime Services API
+ * @{
+ */
+
+/*! 
\brief Dispatch packet information
+ *
+ * This structure contains the packet information for kernel dispatch
+ */
+struct PacketAmdInfo
+{
+    uint32_t    trapReservedVgprIndex_;     //!< reserved VGPR index, -1 when they are not valid
+    uint32_t    scratchBufferWaveOffset_;   //!< scratch buffer wave offset, -1 when no scratch buffer
+    void*       pointerToIsaBuffer_;        //!< pointer to the buffer containing ISA
+    size_t      sizeOfIsaBuffer_;           //!< size of the ISA buffer
+    uint32_t    numberOfVgprs_;             //!< number of VGPRs used by the kernel
+    uint32_t    numberOfSgprs_;             //!< number of SGPRs used by the kernel
+    size_t      sizeOfStaticGroupMemory_;   //!< Static local memory used by the kernel
+};
+
+/*! \brief Cache mask for invalidation
+ *
+ *  The individual cache bits and ui32All_ share storage through the union,
+ *  so the whole mask can be set or read as one 32-bit value.
+ */
+struct HwDbgGpuCacheMask
+{
+    HwDbgGpuCacheMask() :ui32All_(0) {}
+
+    HwDbgGpuCacheMask(uint32_t mask) :ui32All_(mask) {}
+
+    union {
+        struct {
+            uint32_t sqICache_  : 1;    //!< Instruction cache
+            uint32_t sqKCache_  : 1;    //!< Data cache
+            uint32_t tcL1_      : 1;    //!< tcL1 cache
+            uint32_t tcL2_      : 1;    //!< tcL2 cache
+            uint32_t reserved_  : 28;
+        };
+        uint32_t ui32All_;
+    };
+};
+
+/*! \brief Address watch information
+ *
+ *  Information about each watch point - address, mask, mode and event
+ */
+struct HwDbgAddressWatch
+{
+    void*                           watchAddress_;  //!< The address of watch point
+    uint64_t                        watchMask_;     //!< The mask for watch point (lower 24 bits)
+    cl_dbg_address_watch_mode_amd   watchMode_;     //!< The watch mode for this watch
+    DebugEvent                      event_;         //!< Event of the watch point (not used for now)
+};
+
+/*! \brief Runtime structure used to communicate debug information
+ *  between Ocl services and core for a kernel dispatch.
+ */
+struct DebugToolInfo
+{
+    uint64_t        scratchAddress_;        //!< Scratch memory address
+    size_t          scratchSize_;           //!< Scratch memory size
+    uint64_t        globalAddress_;         //!< Global memory address
+    uint32_t        cacheDisableMask_;      //!< Cache mask, indicating caches disabled
+    uint32_t        exceptionMask_;         //!< Exception mask
+    uint32_t        reservedCuNum_;         //!< Number of reserved CUs for display,
+                                            //!< which ranges from 0 to 7 in the current implementation.
+    bool            monitorMode_;           //!< Debug or profiler mode
+    bool            gpuSingleStepMode_;     //!< SQ debug mode
+    amd::Memory*    trapHandler_;           //!< Trap handler address
+    amd::Memory*    trapBuffer_;            //!< Trap buffer address
+    bool            sqPerfcounterEnable_;   //!< whether SQ perf counters are enabled
+    aclBinary*      aclBinary_;             //!< pointer of the kernel ACL binary
+    amd::Event*     event_;                 //!< pointer of the kernel event in the enqueue command
+};
+
+/*! \brief Message used by the KFD wave control for CI
+ *
+ *  Structure indicates the various information used by the wave control function.
+ */
+struct HwDebugWaveAddr
+{
+    uint32_t VMID_  : 4;    //!< Virtual memory id
+    uint32_t wave_  : 4;    //!< Wave id
+    uint32_t SIMD_  : 2;    //!< SIMD id
+    uint32_t CU_    : 4;    //!< Compute unit
+    uint32_t SH_    : 1;    //!< Shader array
+    uint32_t SE_    : 1;    //!< Shader engine
+};
+
+/*! \brief Kernel code information
+*
+*  This structure contains the pointer of mapped kernel code for host access
+*  and its size (in bytes)
+*/
+struct AqlCodeInfo
+{
+    amd_kernel_code_t * aqlCode_;       //!< pointer of AQL code to allow host access
+    uint32_t            aqlCodeSize_;   //!< size of AQL code
+};
+
+/**@}*/
+
+} // namespace pal
+
+#endif // PALDEBGGER_H_
diff --git a/projects/clr/rocclr/runtime/device/pal/paldebugmanager.cpp b/projects/clr/rocclr/runtime/device/pal/paldebugmanager.cpp
new file mode 100644
index 0000000000..55438b881f
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/paldebugmanager.cpp
@@ -0,0 +1,412 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#include "platform/commandqueue.hpp"
+#include "device/device.hpp"
+#include "device/pal/paldevice.hpp"
+#include "device/pal/palmemory.hpp"
+#include "device/pal/paltrap.hpp"
+#include "device/pal/paldebugmanager.hpp"
+// NOTE(review): the header names of the next three includes were lost in
+// extraction; they are restored from the names this file uses (assert,
+// uintN_t, memset/memcpy) -- confirm against upstream.
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+namespace pal {
+
+class VirtualGPU;
+class Device;
+class Memory;
+
+/*
+ ***************************************************************************
+ *                  Implementation of GPU Debug Manager class
+ ***************************************************************************
+ */
+
+//! Starts with no debugger registered, no address watches and no AQL packet
+GpuDebugManager::GpuDebugManager(amd::Device* device)
+    : HwDebugManager(device)
+    , vGpu_(nullptr)
+    , debugMessages_(0)
+    , addressWatch_(nullptr)
+    , addressWatchSize_(0)
+    , oclEventHandle_(nullptr)
+    , aqlPacket_(nullptr)
+{
+    // Initialize the exception info and the kernel execution mode
+    excpPolicy_.exceptionMask = 0x0;
+    excpPolicy_.waveAction = CL_DBG_WAVES_RESUME;
+    excpPolicy_.hostAction = CL_DBG_HOST_IGNORE;
+    excpPolicy_.waveMode = CL_DBG_WAVEMODE_BROADCAST;
+
+    execMode_.ui32All = 0;
+
+    rtTrapHandlerInfo_.trap_.trapHandler_ = nullptr;
+    rtTrapHandlerInfo_.trap_.trapBuffer_ = nullptr;
+}
+
+GpuDebugManager::~GpuDebugManager()
+{
+    // delete[] accepts nullptr, so no guard is needed
+    delete [] addressWatch_;
+}
+
+//! Records the AQL packet of the dispatch, runs the registered pre-dispatch
+//! callback (if any) and, when a debugger is registered, fills \p toolInfo
+//! with the trap set-up.
+//
+// NOTE(review): every reinterpret_cast target in this function was lost in
+// extraction. DebugToolInfo* and the packet type follow from how the results
+// are used; the void* target for aclBinary_ assumes that base-class member is
+// a void* -- confirm against amd::HwDebugManager.
+void
+GpuDebugManager::executePreDispatchCallBack(void* aqlPacket,
+                                            void* toolInfo)
+{
+    DebugToolInfo* info = reinterpret_cast<DebugToolInfo*>(toolInfo);
+
+    aqlPacket_ = reinterpret_cast<const hsa_kernel_dispatch_packet_t*>(aqlPacket);
+    Unimplemented();
+    //  Only if the pre-dispatch callback is set, will we update cache
+    //  flush configuration and build the memory descriptor.
+    if (nullptr != preDispatchCallBackFunc_) {
+/*
+        // Build the scratch memory descriptor
+        device()->gslCtx()->BuildScratchBufferResource(debugInfo_.scratchMemoryDescriptor_,
+                                                       info->scratchAddress_,
+                                                       info->scratchSize_);
+
+        // Build the global memory descriptor
+        device()->gslCtx()->BuildHeapBufferResource(debugInfo_.globalMemoryDescriptor_,
+                                                    info->globalAddress_);
+*/
+//        // for invalidate cache (BuildEndOfKernelNotifyCommands)
+//        aqlPacket->release_fence_scope = 2;
+
+        aclBinary_ = reinterpret_cast<void*>(info->aclBinary_);
+        oclEventHandle_ = reinterpret_cast<void*>(as_cl(info->event_));
+
+        cl_device_id clDeviceId = as_cl(device_);
+        preDispatchCallBackFunc_(clDeviceId,
+                                 oclEventHandle_,
+                                 aqlPacket_,
+                                 aclBinary_,
+                                 preDispatchCallBackArgs_);
+    }
+
+    // setup the trap handler information only if the debugger has been registered
+    if (isRegistered()) {
+        // Copy the various info set by the debugger/profiler to the tool info structure
+        setupTrapInformation(info);
+    }
+}
+
+//! Runs the registered post-dispatch callback with the completion signal of
+//! the packet recorded by executePreDispatchCallBack()
+void
+GpuDebugManager::executePostDispatchCallBack()
+{
+    // aqlPacket_ stays nullptr until a pre-dispatch call has recorded a
+    // packet; there is no completion signal to report before that
+    if ((nullptr != postDispatchCallBackFunc_) && (nullptr != aqlPacket_)) {
+        cl_device_id clDeviceId = as_cl(device_);
+        postDispatchCallBackFunc_(clDeviceId,
+                                  aqlPacket_->completion_signal.handle,
+                                  postDispatchCallBackArgs_);
+    }
+}
+
+//! 
Map the kernel code for host access
+//
+// NOTE(review): the reinterpret_cast/static_cast targets throughout this file
+// were lost in extraction; each one below is restored from the declared type
+// of the variable or member that receives the result -- confirm against
+// upstream.
+void
+GpuDebugManager::mapKernelCode(void* aqlCodeInfo) const
+{
+    AqlCodeInfo* codeInfo = reinterpret_cast<AqlCodeInfo*>(aqlCodeInfo);
+
+    codeInfo->aqlCode_ = reinterpret_cast<amd_kernel_code_t*>(aqlCodeAddr_);
+    codeInfo->aqlCodeSize_ = aqlCodeSize_;
+}
+
+//! Registers the debugger for \p context. The first successful call also
+//! stores \p messageStorage and creates the runtime trap handler.
+//!
+//! \return CL_SUCCESS, CL_DEBUGGER_REGISTER_FAILURE_AMD when HW debug is
+//!         disabled in the settings, or CL_OUT_OF_RESOURCES when the trap
+//!         handler cannot be created
+cl_int
+GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorage)
+{
+    if (!device()->settings().enableHwDebug_) {
+        LogError("debugmanager: Register debugger error - HW DEBUG is not enable");
+        return CL_DEBUGGER_REGISTER_FAILURE_AMD;
+    }
+
+    // first time register - set the message storage, flush queue and enable hw debug
+    if (!isRegistered()) {
+        debugMessages_ = messageStorage;
+        Unimplemented();
+/*
+        if (!device()->gslCtx()->registerHwDebugger(debugMessages_)) {
+            LogError("debugmanager: Register debugger failed");
+            return CL_OUT_OF_RESOURCES;
+        }
+*/
+        isRegistered_ = true;
+
+        if (CL_SUCCESS != createRuntimeTrapHandler()) {
+            LogError("debugmanager: Create runtime trap handler failed");
+            // Roll back, so that a failed registration is not reported as
+            // registered and a later attempt retries the trap handler set-up
+            isRegistered_ = false;
+            debugMessages_ = 0;
+            return CL_OUT_OF_RESOURCES;
+        }
+    }
+
+    context_ = context;
+
+    return CL_SUCCESS;
+}
+
+//! Clears the registration flag and the context; no-op when not registered
+void
+GpuDebugManager::unregisterDebugger()
+{
+    if (isRegistered()) {
+        // reset the debugger registration flag
+        isRegistered_ = false;
+        context_ = nullptr;
+    }
+}
+
+//! Flushes the caches selected by \p mask on the device transfer queue
+void
+GpuDebugManager::flushCache(uint32_t mask)
+{
+    HwDbgGpuCacheMask cacheMask(mask);
+    device()->xferQueue()->flushCuCaches(cacheMask);
+}
+
+
+//! Fills \p toolInfo from the exception policy, the execution mode and the
+//! runtime trap handler/buffer held by this manager
+void
+GpuDebugManager::setupTrapInformation(DebugToolInfo* toolInfo)
+{
+    toolInfo->scratchAddress_ = 0;
+    toolInfo->scratchSize_ = 0;
+    toolInfo->globalAddress_ = 0;
+    toolInfo->sqPerfcounterEnable_ = false;
+
+    // Set up trap related info in the kernel info structure to be
+    // used in the kernel dispatch.
+    toolInfo->exceptionMask_ = excpPolicy_.exceptionMask;
+    toolInfo->gpuSingleStepMode_ = execMode_.gpuSingleStepMode;
+    toolInfo->monitorMode_ = execMode_.monitorMode;
+
+    // The order of these three bits is determined by the definition
+    // of the register COMPUTE_DISPATCH_INITIATOR
+    toolInfo->cacheDisableMask_ = ((execMode_.disableL1Scalar << 2)
+                                 | (execMode_.disableL2Cache << 1)
+                                 | (execMode_.disableL1Vector));
+
+    toolInfo->reservedCuNum_ = execMode_.reservedCuNum;
+
+    toolInfo->trapHandler_ = rtTrapInfo_[kDebugTrapHandlerLocation];
+    toolInfo->trapBuffer_ = rtTrapInfo_[kDebugTrapBufferLocation];
+}
+
+//! Fills the PacketAmdInfo at \p packetInfo from the host-mapped kernel code
+//! described by the AqlCodeInfo at \p aqlCodeInfo
+void
+GpuDebugManager::getPacketAmdInfo(
+    const void* aqlCodeInfo,
+    void* packetInfo) const
+
+{
+    const AqlCodeInfo* codeInfo =
+        reinterpret_cast<const AqlCodeInfo*>(aqlCodeInfo);
+
+    const amd_kernel_code_t* hostAqlCode = codeInfo->aqlCode_;
+
+    PacketAmdInfo* packet =
+        reinterpret_cast<PacketAmdInfo*>(packetInfo);
+
+    const amd_kernel_code_t* akc = hostAqlCode;
+
+    packet->numberOfSgprs_ = akc->wavefront_sgpr_count;
+    packet->numberOfVgprs_ = akc->workitem_vgpr_count;
+
+    // use mapped kernel_object_address for host accessing of ISA buffer
+    packet->pointerToIsaBuffer_ = (char*) (hostAqlCode)
+                                + akc->kernel_code_entry_byte_offset;
+
+    packet->scratchBufferWaveOffset_ =
+        akc->debug_wavefront_private_segment_offset_sgpr;
+
+    packet->sizeOfIsaBuffer_ = codeInfo->aqlCodeSize_;
+
+    packet->sizeOfStaticGroupMemory_ = akc->workgroup_group_segment_byte_size;
+
+    // The trap_reserved_vgpr_index will be 4 less the original
+    // This value must be used only by the debugger
+    packet->trapReservedVgprIndex_ = akc->workitem_vgpr_count - NumberReserveVgprs;
+}
+
+//! Not implemented for PAL yet: always returns 0
+DebugEvent
+GpuDebugManager::createDebugEvent(
+    const bool autoReset)
+{
+    Unimplemented();
+/*
+    // create the event object
+    osEventHandle shaderEvent = osEventCreate(!autoReset);
+
+    // event object has been created, set the initial state
+    if (shaderEvent != 0) {
+
+        osEventReset(shaderEvent);  // initial state is non-signaled
+
+        if (device()->gslCtx()->exceptionNotification(shaderEvent)) {
+            return shaderEvent;
+        }
+    }
+*/
+    return 0;
+}
+
+//! Not implemented for PAL yet: always returns CL_SUCCESS without waiting
+cl_int
+GpuDebugManager::waitDebugEvent(
+    DebugEvent pEvent,
+    uint32_t timeOut) const
+{
+    Unimplemented();
+/*
+    if (osEventTimedWait(pEvent, timeOut)) {
+        return CL_SUCCESS;
+    }
+    else {
+        return CL_EVENT_TIMEOUT_AMD;
+    }
+*/
+    return CL_SUCCESS;
+}
+
+//! Not implemented for PAL yet: \p pEvent is left untouched
+void
+GpuDebugManager::destroyDebugEvent(DebugEvent* pEvent)
+{
+    Unimplemented();
+/*
+    osEventDestroy(*pEvent);
+    *pEvent = 0;
+
+    device()->gslCtx()->exceptionNotification(0);
+*/
+}
+
+//! Not implemented for PAL yet
+void
+GpuDebugManager::wavefrontControl(
+    uint32_t waveAction,
+    uint32_t waveMode,
+    uint32_t trapId,
+    void* waveAddr) const
+{
+    Unimplemented();
+    //device()->gslCtx()->executeSqCommand(waveAction, waveMode, trapId, waveAddr);
+}
+
+//! Rebuilds the address watch table from the \p numWatchPoints entries of the
+//! parallel input arrays. The table is only prepared here; programming it
+//! into the HW is not implemented for PAL yet.
+void
+GpuDebugManager::setAddressWatch(
+    uint32_t    numWatchPoints,
+    void**      watchAddress,
+    uint64_t*   watchMask,
+    uint64_t*   watchMode,
+    DebugEvent* event)
+{
+    size_t  requiredSize = numWatchPoints * sizeof(HwDbgAddressWatch);
+
+    // previously allocated size is not big enough, allocate new memory
+    if (addressWatchSize_ < requiredSize) {
+        // free the smaller address watch storage (delete[] accepts nullptr)
+        delete [] addressWatch_;
+        addressWatch_ = new HwDbgAddressWatch[numWatchPoints];
+        addressWatchSize_ = requiredSize;
+    }
+
+    // fill in the address watch structure; nothing has been allocated yet
+    // when the first call passes zero watch points
+    if (nullptr != addressWatch_) {
+        memset(addressWatch_, 0, addressWatchSize_);
+    }
+
+    for (uint32_t i = 0; i < numWatchPoints; i++)
+    {
+        amd::Memory* watchMem = as_amd(reinterpret_cast<cl_mem>(watchAddress[i]));
+        Memory* watchMemAddress = device()->getGpuMemory(watchMem);
+        if (nullptr == watchMemAddress) {
+            LogError("debugmanager: No GPU memory for the watch address");
+            return;
+        }
+
+        addressWatch_[i].watchAddress_ = reinterpret_cast<void*>(watchMemAddress->vmAddress());
+        addressWatch_[i].watchMask_ = watchMask[i];
+        addressWatch_[i].watchMode_ = (cl_dbg_address_watch_mode_amd) watchMode[i];
+        addressWatch_[i].event_ = (0 != event) ? event[i] : 0;
+    }
+
+    Unimplemented();
+    // setup the watch addresses
+    //device()->gslCtx()->setAddressWatch(numWatchPoints, (void*) addressWatch_);
+
+}
+
+//! Copies \p size bytes from \p srcPtr into \p memObj at byte \p offset
+//! through a temporary host mapping
+void
+GpuDebugManager::setGlobalMemory(
+    amd::Memory* memObj,
+    uint32_t offset,
+    void* srcPtr,
+    uint32_t size)
+{
+    Memory* globalMem = device()->getGpuMemory(memObj);
+    if (nullptr == globalMem) {
+        LogError("debugmanager: No GPU memory for the global memory object");
+        return;
+    }
+
+    address mappedMem = static_cast<address>(globalMem->map(nullptr,0));
+    assert(mappedMem != 0);
+    // assert() is compiled out of release builds; never copy through a
+    // failed mapping
+    if (nullptr == mappedMem) {
+        LogError("debugmanager: Mapping the global memory failed");
+        return;
+    }
+
+    void* dest_ptr = reinterpret_cast<void*>(mappedMem + offset);
+    memcpy(dest_ptr, srcPtr, size);
+
+    globalMem->unmap(nullptr);
+}
+
+//! Allocates the runtime trap handler (TBA) and trap buffer (TMA) and copies
+//! the trap handler code for the current ASIC family into the TBA.
+//!
+//! \return CL_SUCCESS, CL_OUT_OF_RESOURCES if an allocation or the mapping
+//!         fails, or CL_INVALID_VALUE if a buffer is not 256-byte aligned
+cl_int
+GpuDebugManager::createRuntimeTrapHandler()
+{
+    size_t codeSize = 0;
+    const uint32_t* rtTrapCode = nullptr;
+
+    if (device()->settings().viPlus_) {
+        codeSize = sizeof(RuntimeTrapCodeVi);
+        rtTrapCode = RuntimeTrapCodeVi;
+    }
+    else {
+        codeSize = sizeof(RuntimeTrapCode);
+        rtTrapCode = RuntimeTrapCode;
+    }
+
+    uint32_t numCodes = codeSize / sizeof(uint32_t);
+
+    // Handle TMA corruption hw bug workaround -
+    // The trap handler buffer has extra 256 bytes allocated, the TMA address
+    // is stored in the first two DWORDs and the actual trap handler code
+    // is stored starting at the location of 256 bytes (TbaStartOffset).
+    //
+    // allocate memory for the runtime trap handler (TBA) + TMA address
+    uint32_t allocSize = codeSize + TbaStartOffset;
+
+    Memory* rtTBA = new Memory(*device(), allocSize);
+    runtimeTBA_ = rtTBA;
+
+    if ((rtTBA == nullptr) || !rtTBA->create(Resource::RemoteUSWC)) {
+        return CL_OUT_OF_RESOURCES;
+    }
+    address tbaAddress = reinterpret_cast<address>(rtTBA->map(nullptr));
+    if (tbaAddress == nullptr) {
+        LogError("debugmanager: Mapping the trap handler buffer failed");
+        return CL_OUT_OF_RESOURCES;
+    }
+
+    // allocate buffer for the runtime trap handler buffer (TMA)
+    uint32_t tmaSize = 0x100;
+    Memory* rtTMA = new Memory(*device(), tmaSize);
+    runtimeTMA_ = rtTMA;
+
+    if ((rtTMA == nullptr) || !rtTMA->create(Resource::RemoteUSWC)) {
+        // do not leave the TBA mapped on the error path
+        rtTBA->unmap(nullptr);
+        return CL_OUT_OF_RESOURCES;
+    }
+
+    uint64_t rtTmaAddress = rtTMA->vmAddress();
+    if ((rtTBA->vmAddress() & 0xFF) != 0 || (rtTmaAddress & 0xFF) != 0) {
+        LogError("debugmanager: Trap handler/buffer is not 256-byte aligned");
+        rtTBA->unmap(nullptr);
+        return CL_INVALID_VALUE;
+    }
+
+    // store the TMA address at the beginning of trap handler buffer
+    uint64_t* tbaStorage = reinterpret_cast<uint64_t*>(tbaAddress);
+    tbaStorage[0] = rtTmaAddress;
+
+    // save the trap handler code
+    uint32_t* trapHandlerPtr = (uint32_t*)(tbaAddress + TbaStartOffset);
+    for (uint32_t i = 0; i < numCodes; i++) {
+        trapHandlerPtr[i] = rtTrapCode[i];
+    }
+
+    rtTBA->unmap(nullptr);
+
+    return CL_SUCCESS;
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/paldebugmanager.hpp b/projects/clr/rocclr/runtime/device/pal/paldebugmanager.hpp
new file mode 100644
index 0000000000..dc39d21e33
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/paldebugmanager.hpp
@@ -0,0 +1,117 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALDEBUGMANAGER_H__
+#define PALDEBUGMANAGER_H__
+
+#include "device/pal/palvirtual.hpp"
+#include "device/pal/paldebugger.hpp"
+
+namespace pal {
+
+class GpuDebugManager;
+class Device;
+class Memory;
+
+
+/*! \brief Debug Manager Class
+ *
+ *  The debug manager class is used to pass all the trap info to the
+ *  kernel dispatch and then the kernel execution can use such trap information
+ *  for kernel execution. This class contains the trap handler and shader event
+ *  objects. The trap handler is setup by users and passed to the kernel dispatch.
+ *  The shader event is to receive interrupts from the GPU and then users can
+ *  perform various operations.
+ *
+ *  This class also provides the interface for setting up the pre-dispatch
+ *  callback functions used by the profiler and debugger. It also provides
+ *  a way to retrieve various debug information for the kernel execution.
+ *
+ */
+class GpuDebugManager : public amd::HwDebugManager {
+public:
+
+    //! Constructor of the debug manager class
+    GpuDebugManager(amd::Device* device);
+
+    //! Destructor of the debug manager class
+    ~GpuDebugManager();
+
+    //! Get the single instance of the GpuDebugManager class
+    static GpuDebugManager* getDefaultInstance();
+
+    //! Destroy the GpuDebugManager class object
+    static void destroyInstances();
+
+    //! Flush cache
+    void flushCache(uint32_t mask);
+
+    //! Create the debug event
+    DebugEvent createDebugEvent(const bool autoReset);
+
+    //! Wait for the debug event
+    cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const;
+
+    //! Destroy the debug event
+    void destroyDebugEvent(DebugEvent* pEvent);
+
+    //! Register the debugger
+    cl_int registerDebugger(amd::Context*context, uintptr_t messageStorage);
+
+    //! Unregister the debugger
+    void unregisterDebugger();
+
+    //! Send the wavefront control command
+    void wavefrontControl(uint32_t waveAction,
+                          uint32_t waveMode,
+                          uint32_t trapId,
+                          void* waveAddr) const;
+
+    //! Set address watching point
+    void setAddressWatch(uint32_t numWatchPoints,
+                         void** watchAddress,
+                         uint64_t* watchMask,
+                         uint64_t* watchMode,
+                         DebugEvent* pEvent);
+
+    //! Map the kernel code for host access
+    void mapKernelCode(void* aqlCodeInfo) const;
+
+    //! Get the packet information for dispatch
+    void getPacketAmdInfo(const void* aqlCodeInfo, void* packetInfo) const;
+
+    //! Set global memory values
+    void setGlobalMemory(amd::Memory* memObj, uint32_t offset, void* srcPtr, uint32_t size);
+
+    //! Execute the post-dispatch callback function
+    void executePostDispatchCallBack();
+
+    //! Execute the pre-dispatch callback function
+    void executePreDispatchCallBack(void* aqlPacket,
+                                    void* toolInfo);
+
+protected:
+    //! Returns the virtual GPU held by this manager
+    const VirtualGPU* vGpu() const { return vGpu_; }
+
+private:
+    //! Setup trap handler info for kernel execution
+    void setupTrapInformation(DebugToolInfo* toolInfo);
+
+    //! Create runtime trap handler
+    cl_int createRuntimeTrapHandler();
+
+    //! Returns the base-class device pointer viewed as the PAL device
+    //
+    // NOTE(review): the cast target was lost in extraction; it is restored
+    // from the declared return type.
+    const pal::Device* device() const {
+        return reinterpret_cast<const pal::Device*>(device_); }
+
+    VirtualGPU*             vGpu_;              //!< the virtual GPU
+    uintptr_t               debugMessages_;     //!< Pointer to a SHARED_DEBUG_MESSAGES pass to the KMD
+    HwDbgAddressWatch*      addressWatch_;      //!< Address watch data
+    size_t                  addressWatchSize_;  //!< Size of address watch data
+    //! Arguments used by the callback function
+    void*                   oclEventHandle_;    //!< event handler
+    const hsa_kernel_dispatch_packet_t* aqlPacket_; //!< AQL packet
+};
+
+} // namespace pal
+
+#endif  // PALDEBUGMANAGER_H__
diff --git a/projects/clr/rocclr/runtime/device/pal/paldefs.hpp b/projects/clr/rocclr/runtime/device/pal/paldefs.hpp
new file mode 100644
index 0000000000..9fe2dec2b5
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/paldefs.hpp
@@ -0,0 +1,584 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+// +#ifndef PALDEFS_HPP_ +#define PALDEFS_HPP_ + +#include "top.hpp" +#include "pal.h" +#include "palGpuMemory.h" +#include "palImage.h" +#include "palFormatInfo.h" + +// +/// Memory Object Type +// +enum PalGpuMemoryType { + PAL_DEPTH_BUFFER = 0, ///< Depth Buffer + PAL_BUFFER, ///< Pure buffer + PAL_TEXTURE_3D, ///< 3D texture + PAL_TEXTURE_2D, ///< 2D texture + PAL_TEXTURE_1D, ///< 1D texture + PAL_TEXTURE_1D_ARRAY, ///< 1D Array texture + PAL_TEXTURE_2D_ARRAY, ///< 2D Array texture + PAL_TEXTURE_BUFFER, ///< "buffer" texture inside VBO +}; + +struct HwDbgKernelInfo +{ + uint64_t scratchBufAddr; ///< Handle of GPU local memory for kernel private scratch space + size_t scratchBufferSizeInBytes; ///< size in bytes of the kernel scratch buffer (presumably the one at scratchBufAddr - confirm) + uint64_t heapBufAddr; ///< Address of the global heap base + const void* pAqlDispatchPacket; ///< Pointer to the dispatch packet + const void* pAqlQueuePtr; ///< pointer to the AQL Queue + void* trapHandler; ///< address of the trap handler (TBA) + void* trapHandlerBuffer; ///< address of the trap handler buffer (TMA) + uint32_t excpEn; ///< exception mask + bool trapPresent; ///< trap present flag + bool sqDebugMode; ///< debug mode flag (GPU single step mode) + uint32_t mgmtSe0Mask; ///< mask for SE0 (reserving CU for display) + uint32_t mgmtSe1Mask; ///< mask for SE1 (reserving CU for display) + uint32_t cacheDisableMask; ///< cache disable mask +}; + +//! Engine types +enum EngineType +{ + MainEngine = 0, + SdmaEngine, + AllEngines +}; + +struct GpuEvent +{ + static const unsigned int InvalidID = ((1<<30) - 1); + + EngineType engineId_; ///< engine type the event id belongs to + unsigned int id; ///< actual event id + + //! GPU event default constructor: main engine, invalid id + GpuEvent(): engineId_(MainEngine), id(InvalidID) {} + + //! Returns true if the current event is valid, i.e. its id differs from InvalidID + bool isValid() const { return (id != InvalidID) ? true : false; } + + //! Mark the event as invalid by resetting its id to InvalidID + void invalidate() { id = InvalidID; } +}; + +/*! \addtogroup PAL + * @{ + */ + +//! 
PAL Device Implementation + +namespace pal { + +//! Maximum number of the supported global atomic counters +const static uint MaxAtomicCounters = 8; +//! Maximum number of the supported samplers +const static uint MaxSamplers = 16; +//! Maximum number of supported read images +const static uint MaxReadImage = 128; +//! Maximum number of supported write images +const static uint MaxWriteImage = 8; +//! Maximum number of supported read/write images for OCL20 +const static uint MaxReadWriteImage = 64; +//! Maximum number of supported constant arguments +const static uint MaxConstArguments = 8; +//! Maximum number of supported kernel UAV arguments +const static uint MaxUavArguments = 1024; +//! Maximum number of pixels for a 1D image created from a buffer +const static size_t MaxImageBufferSize = 65536; +//! Maximum image array size (presumably the number of slices in a 1D/2D image array - confirm) +const static size_t MaxImageArraySize = 2048; + +//! Maximum number of supported constant buffers +const static uint MaxConstBuffers = MaxConstArguments + 8; + +//! Maximum number of constant buffers for arguments +const static uint MaxConstBuffersArguments = 2; + +//! Alignment restriction for the pinned memory +const static size_t PinnedMemoryAlignment = 4 * Ki; + +//! HSA path specific defines for images and samplers +const static uint HsaImageObjectSize = 48; +const static uint HsaImageObjectAlignment = 16; +const static uint HsaSamplerObjectSize = 32; +const static uint HsaSamplerObjectAlignment = 16; + +//! 
HSA path specific defines for images +const static uint DeviceQueueMaskSize = 32; + +struct AMDDeviceInfo { + const char* targetName_; //!< Target name + const char* machineTarget_; //!< Machine target + uint simdPerCU_; //!< Number of SIMDs per CU + uint simdWidth_; //!< Number of workitems processed per SIMD + uint simdInstructionWidth_; //!< Number of instructions processed per SIMD + uint memChannelBankWidth_; //!< Memory channel bank width + uint localMemSizePerCU_; //!< Local memory size per CU + uint localMemBanks_; //!< Number of banks of local memory + uint gfxipVersion_; //!< The core engine GFXIP version +}; + +static const AMDDeviceInfo DeviceInfo[] = { +/* Unknown */ { "", "unknown", 4, 16, 1, 256, 64 * Ki, 32, 702 }, +/* Tahiti */ { "", "tahiti", 4, 16, 1, 256, 64 * Ki, 32, 702 }, +/* Pitcairn */ { "", "pitcairn", 4, 16, 1, 256, 64 * Ki, 32, 702 }, +/* Capeverde - NOTE(review): machine target below is "bonaire"; confirm this is intended */ { "", "bonaire", 4, 16, 1, 256, 64 * Ki, 32, 702 }, +/* Oland */ { "", "oland", 4, 16, 1, 256, 64 * Ki, 32, 702 }, +/* Hainan */ { "", "hainan", 4, 16, 1, 256, 64 * Ki, 32, 702 }, + +/* Bonaire */ { "Bonaire", "bonaire", 4, 16, 1, 256, 64 * Ki, 32, 702 }, +/* Hawaii */ { "Hawaii", "hawaii", 4, 16, 1, 256, 64 * Ki, 32, 702 }, +/* Kalindi */ { "Kalindi", "kalindi", 4, 16, 1, 256, 64 * Ki, 32, 702 }, +/* Spectre */ { "Spectre", "spectre", 4, 16, 1, 256, 64 * Ki, 32, 701 }, + +/* Carrizo */ { "Carrizo" , "carrizo", 4, 16, 1, 256, 64 * Ki, 32, 800 }, +/* Stoney */ { "Stoney", "stoney", 4, 16, 1, 256, 64 * Ki, 32, 800 }, + +/* Iceland */ { "Iceland", "iceland", 4, 16, 1, 256, 64 * Ki, 32, 800 }, +/* Tonga */ { "Tonga", "tonga", 4, 16, 1, 256, 64 * Ki, 32, 800 }, +/* Fiji */ { "Fiji", "fiji", 4, 16, 1, 256, 64 * Ki, 32, 800 }, +/* Ellesmere */ { "Horse", "horse", 4, 16, 1, 256, 64 * Ki, 32, 800 }, +/* Baffin */ { "Goose", "goose", 4, 16, 1, 256, 64 * Ki, 32, 800 }, +}; + +static const char* Gfx700 = "AMD:AMDGPU:7:0:0"; +static const char* Gfx701 = "AMD:AMDGPU:7:0:1"; +static const char* Gfx800 = 
"AMD:AMDGPU:8:0:0"; +static const char* Gfx801 = "AMD:AMDGPU:8:0:1"; +static const char* Gfx804 = "AMD:AMDGPU:8:0:4"; +static const char* Gfx810 = "AMD:AMDGPU:8:1:0"; +static const char* Gfx900 = "AMD:AMDGPU:9:0:0"; +static const char* Gfx901 = "AMD:AMDGPU:9:0:1"; + +// Supported OpenCL versions +enum OclVersion { + OpenCL10, + OpenCL11, + OpenCL12, + OpenCL20 +}; + +struct MemoryFormat { + cl_image_format clFormat_; //!< CL image format + Pal::Format palFormat_; //!< PAL image format + Pal::ChannelMapping palChannel_;//!< PAL channel mapping +}; + +static const MemoryFormat +MemoryFormatMap[] = { + // R + { { CL_R, CL_UNORM_INT8 }, + { Pal::ChFmt::R8, Pal::NumFmt::Unorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + { { CL_R, CL_UNORM_INT16 }, + { Pal::ChFmt::R16, Pal::NumFmt::Unorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + + { { CL_R, CL_SNORM_INT8 }, + { Pal::ChFmt::R8, Pal::NumFmt::Snorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + { { CL_R, CL_SNORM_INT16 }, + { Pal::ChFmt::R16, Pal::NumFmt::Snorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + + { { CL_R, CL_SIGNED_INT8 }, + { Pal::ChFmt::R8, Pal::NumFmt::Sint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + { { CL_R, CL_SIGNED_INT16 }, + { Pal::ChFmt::R16, Pal::NumFmt::Sint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + { { CL_R, CL_SIGNED_INT32 }, + { Pal::ChFmt::R32, Pal::NumFmt::Sint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + { { CL_R, CL_UNSIGNED_INT8 }, + { Pal::ChFmt::R8, Pal::NumFmt::Uint }, + { 
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + { { CL_R, CL_UNSIGNED_INT16 }, + { Pal::ChFmt::R16, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + { { CL_R, CL_UNSIGNED_INT32 }, + { Pal::ChFmt::R32, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + + { { CL_R, CL_HALF_FLOAT }, + { Pal::ChFmt::R16, Pal::NumFmt::Float }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + { { CL_R, CL_FLOAT }, + { Pal::ChFmt::R32, Pal::NumFmt::Float }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + + // A + { { CL_A, CL_UNORM_INT8 }, + { Pal::ChFmt::R8, Pal::NumFmt::Unorm }, + { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } }, + { { CL_A, CL_UNORM_INT16 }, + { Pal::ChFmt::R16, Pal::NumFmt::Unorm }, + { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } }, + + { { CL_A, CL_SNORM_INT8 }, + { Pal::ChFmt::R8, Pal::NumFmt::Snorm }, + { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } }, + { { CL_A, CL_SNORM_INT16 }, + { Pal::ChFmt::R16, Pal::NumFmt::Snorm }, + { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } }, + + { { CL_A, CL_SIGNED_INT8 }, + { Pal::ChFmt::R8, Pal::NumFmt::Sint }, + { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } }, + { { CL_A, CL_SIGNED_INT16 }, + { Pal::ChFmt::R16, Pal::NumFmt::Sint }, + { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } }, + { { 
CL_A, CL_SIGNED_INT32}, + { Pal::ChFmt::R32, Pal::NumFmt::Sint }, + { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } }, + { { CL_A, CL_UNSIGNED_INT8 }, + { Pal::ChFmt::R8, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } }, + { { CL_A, CL_UNSIGNED_INT16 }, + { Pal::ChFmt::R16, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } }, + { { CL_A, CL_UNSIGNED_INT32}, + { Pal::ChFmt::R32 , Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } }, + + { { CL_A, CL_HALF_FLOAT }, + { Pal::ChFmt::R16, Pal::NumFmt::Float }, + { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } }, + { { CL_A, CL_FLOAT }, + { Pal::ChFmt::R32, Pal::NumFmt::Float }, + { Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } }, + + // RG + { { CL_RG, CL_UNORM_INT8 }, + { Pal::ChFmt::R8G8, Pal::NumFmt::Unorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + { { CL_RG, CL_UNORM_INT16 }, + { Pal::ChFmt::R16G16, Pal::NumFmt::Unorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + + { { CL_RG, CL_SNORM_INT8 }, + { Pal::ChFmt::R8G8, Pal::NumFmt::Snorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + { { CL_RG, CL_SNORM_INT16 }, + { Pal::ChFmt::R16G16, Pal::NumFmt::Snorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + + { { CL_RG, CL_SIGNED_INT8 }, + { Pal::ChFmt::R8G8, Pal::NumFmt::Sint }, + { Pal::ChannelSwizzle::R, 
Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + { { CL_RG, CL_SIGNED_INT16 }, + { Pal::ChFmt::R16G16, Pal::NumFmt::Sint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + { { CL_RG, CL_SIGNED_INT32}, + { Pal::ChFmt::R32G32, Pal::NumFmt::Sint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + { { CL_RG, CL_UNSIGNED_INT8 }, + { Pal::ChFmt::R8G8, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + { { CL_RG, CL_UNSIGNED_INT16 }, + { Pal::ChFmt::R16G16, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + { { CL_RG, CL_UNSIGNED_INT32}, + { Pal::ChFmt::R32G32, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + + { { CL_RG, CL_HALF_FLOAT }, + { Pal::ChFmt::R16G16, Pal::NumFmt::Float }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, + { { CL_RG, CL_FLOAT }, + { Pal::ChFmt::R32G32, Pal::NumFmt::Float }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } }, +/* + // RA + { { CL_RA, CL_UNORM_INT8 }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG8 } }, + { { CL_RA, CL_UNORM_INT16 }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16 } }, + + { { CL_RA, CL_SNORM_INT8 }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG8 } }, + { { CL_RA, CL_SNORM_INT16 }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sUV16 } }, + + { { CL_RA, CL_SIGNED_INT8 }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG8I } }, + { { CL_RA, CL_SIGNED_INT16 }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG16I } }, + { { CL_RA, CL_SIGNED_INT32}, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG32I } }, + { { CL_RA, CL_UNSIGNED_INT8 
}, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG8I } }, + { { CL_RA, CL_UNSIGNED_INT16 }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16I } }, + { { CL_RA, CL_UNSIGNED_INT32}, + { GSL_CHANNEL_ORDER_RA , CM_SURF_FMT_RG32I } }, + + { { CL_RA, CL_HALF_FLOAT }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16F } }, + { { CL_RA, CL_FLOAT }, + { GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG32F } }, +*/ + // RGB + { { CL_RGB, CL_UNORM_INT_101010 }, + { Pal::ChFmt::R10G10B10A2, Pal::NumFmt::Unorm }, + { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } }, + { { CL_RGB, CL_UNSIGNED_INT8 }, // This is used only by blit kernel + { Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } }, + + // RGBA + { { CL_RGBA, CL_UNORM_INT8 }, + { Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Unorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } }, + { { CL_RGBA, CL_UNORM_INT16 }, + { Pal::ChFmt::R16G16B16A16, Pal::NumFmt::Unorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } }, + + { { CL_RGBA, CL_SNORM_INT8 }, + { Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Snorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } }, + { { CL_RGBA, CL_SNORM_INT16 }, + { Pal::ChFmt::R16G16B16A16, Pal::NumFmt::Snorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } }, + + { { CL_RGBA, CL_SIGNED_INT8 }, + { Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Sint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } }, + { { CL_RGBA, CL_SIGNED_INT16 }, + { Pal::ChFmt::R16G16B16A16, Pal::NumFmt::Sint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } }, + { { CL_RGBA, CL_SIGNED_INT32 }, + { 
Pal::ChFmt::R32G32B32A32, Pal::NumFmt::Sint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } }, + { { CL_RGBA, CL_UNSIGNED_INT8 }, + { Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } }, + { { CL_RGBA, CL_UNSIGNED_INT16 }, + { Pal::ChFmt::R16G16B16A16, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } }, + { { CL_RGBA, CL_UNSIGNED_INT32}, + { Pal::ChFmt::R32G32B32A32, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } }, + + { { CL_RGBA, CL_HALF_FLOAT }, + { Pal::ChFmt::R16G16B16A16, Pal::NumFmt::Float }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } }, + { { CL_RGBA, CL_FLOAT }, + { Pal::ChFmt::R32G32B32A32, Pal::NumFmt::Float }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } }, + + // ARGB + { { CL_ARGB, CL_UNORM_INT8 }, + { Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Unorm }, + { Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B, + Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } }, + { { CL_ARGB, CL_SNORM_INT8 }, + { Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Snorm }, + { Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B, + Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } }, + { { CL_ARGB, CL_SIGNED_INT8 }, + { Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Sint }, + { Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B, + Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } }, + { { CL_ARGB, CL_UNSIGNED_INT8 }, + { Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B, + Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } }, + + // BGRA + { { CL_BGRA, CL_UNORM_INT8 }, + { Pal::ChFmt::B8G8R8A8, Pal::NumFmt::Unorm }, + { Pal::ChannelSwizzle::B, 
Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } }, + { { CL_BGRA, CL_SNORM_INT8 }, + { Pal::ChFmt::B8G8R8A8, Pal::NumFmt::Snorm }, + { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } }, + { { CL_BGRA, CL_SIGNED_INT8 }, + { Pal::ChFmt::B8G8R8A8, Pal::NumFmt::Sint }, + { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } }, + { { CL_BGRA, CL_UNSIGNED_INT8 }, + { Pal::ChFmt::B8G8R8A8, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } }, + + // LUMINANCE + { { CL_LUMINANCE, CL_SNORM_INT8 }, + { Pal::ChFmt::R8, Pal::NumFmt::Snorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } }, + { { CL_LUMINANCE, CL_SNORM_INT16 }, + { Pal::ChFmt::R16, Pal::NumFmt::Snorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } }, + { { CL_LUMINANCE, CL_UNORM_INT8 }, + { Pal::ChFmt::R8, Pal::NumFmt::Unorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } }, + { { CL_LUMINANCE, CL_UNORM_INT16 }, + { Pal::ChFmt::R16, Pal::NumFmt::Unorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } }, + { { CL_LUMINANCE, CL_HALF_FLOAT }, + { Pal::ChFmt::R16, Pal::NumFmt::Float }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } }, + { { CL_LUMINANCE, CL_FLOAT }, + { Pal::ChFmt::R32, Pal::NumFmt::Float }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } }, + + // INTENSITY + { { CL_INTENSITY, CL_SNORM_INT8 }, + { Pal::ChFmt::R8, Pal::NumFmt::Snorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } }, + { { CL_INTENSITY, 
CL_SNORM_INT16 }, + { Pal::ChFmt::R16, Pal::NumFmt::Snorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } }, + { { CL_INTENSITY, CL_UNORM_INT8 }, + { Pal::ChFmt::R8, Pal::NumFmt::Unorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } }, + { { CL_INTENSITY, CL_UNORM_INT16 }, + { Pal::ChFmt::R16, Pal::NumFmt::Unorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } }, + { { CL_INTENSITY, CL_HALF_FLOAT }, + { Pal::ChFmt::R16, Pal::NumFmt::Float }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } }, + { { CL_INTENSITY, CL_FLOAT }, + { Pal::ChFmt::R32, Pal::NumFmt::Float }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } }, + + // sRGBA + { { CL_sRGBA, CL_UNORM_INT8 }, + { Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Srgb }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } }, + { { CL_sRGBA, CL_UNSIGNED_INT8 }, // This is used only by blit kernel + { Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } }, + + // sRGB + { { CL_sRGB, CL_UNORM_INT8 }, + { Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Srgb }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } }, + { { CL_sRGB, CL_UNSIGNED_INT8 }, // This is used only by blit kernel + { Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } }, + + // sRGBx + { { CL_sRGBx, CL_UNORM_INT8 }, + { Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Srgb }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } }, + { { CL_sRGBx, CL_UNSIGNED_INT8 }, // This 
is used only by blit kernel + { Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } }, + + // sBGRA + { { CL_sBGRA, CL_UNORM_INT8 }, + { Pal::ChFmt::B8G8R8A8, Pal::NumFmt::Srgb }, + { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } }, + { { CL_sBGRA, CL_UNSIGNED_INT8 }, // This is used only by blit kernel + { Pal::ChFmt::B8G8R8A8, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } }, + + // DEPTH + { { CL_DEPTH, CL_FLOAT }, + { Pal::ChFmt::R32, Pal::NumFmt::Float }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } }, + { { CL_DEPTH, CL_UNSIGNED_INT32 }, // This is used only by blit kernel + { Pal::ChFmt::R32, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } }, + + { { CL_DEPTH, CL_UNORM_INT16 }, + { Pal::ChFmt::R16, Pal::NumFmt::Unorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } }, + { { CL_DEPTH, CL_UNSIGNED_INT16 }, // This is used only by blit kernel + { Pal::ChFmt::R16, Pal::NumFmt::Uint }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } }, + + { { CL_DEPTH_STENCIL, CL_UNORM_INT24 }, + { Pal::ChFmt::R32, Pal::NumFmt::Unorm }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } }, + { { CL_DEPTH_STENCIL, CL_FLOAT }, + { Pal::ChFmt::R32, Pal::NumFmt::Float }, + { Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R, + Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } } +}; + +} // namespace pal + +#endif // PALDEFS_HPP_ diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp new file mode 100644 
index 0000000000..0937ed086a --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp @@ -0,0 +1,2207 @@ +// +// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. +// +#include "platform/program.hpp" +#include "platform/kernel.hpp" +#include "os/os.hpp" +#include "device/device.hpp" +#include "device/pal/paldefs.hpp" +#include "device/pal/palmemory.hpp" +#include "device/pal/paldevice.hpp" +#include "utils/flags.hpp" +#include "utils/versions.hpp" +#include "thread/monitor.hpp" +#include "device/pal/palprogram.hpp" +#include "device/pal/palsettings.hpp" +#include "device/pal/palblit.hpp" +#include "device/pal/paldebugmanager.hpp" +#include "palLib.h" +#include "palPlatform.h" +#include "palDevice.h" + +#include "acl.h" + +#include "amdocl/cl_common.hpp" +//#include "CL/cl_gl.h" + +#ifdef _WIN32 +#include +#include +#include "CL/cl_d3d10.h" +#include "CL/cl_d3d11.h" +#include "CL/cl_dx9_media_sharing.h" +#endif // _WIN32 + +#include +#include +#include +#include +#include +#include + +bool +PalDeviceLoad() +{ + bool ret = false; + + // Create online devices + ret |= pal::Device::init(); + // Create offline GPU devices + ret |= pal::NullDevice::init(); + + return ret; +} + +void +PalDeviceUnload() +{ + pal::Device::tearDown(); +} + +namespace pal { + +aclCompiler* NullDevice::compiler_; +AppProfile Device::appProfile_; + +NullDevice::NullDevice() + : amd::Device(nullptr) + , ipLevel_(Pal::GfxIpLevel::None) + , hwInfo_(nullptr) +{ +} + +bool +NullDevice::init() +{ + std::vector devices; + + devices = getDevices(CL_DEVICE_TYPE_GPU, false); + + // Loop through all supported devices and create each of them + for (uint id = static_cast(Pal::GfxIpLevel::GfxIp7); + id <= static_cast(Pal::GfxIpLevel::GfxIp9); ++id) { + bool foundActive = false; + Pal::GfxIpLevel ipLevel = static_cast(id); + + if (pal::DeviceInfo[id].targetName_[0] == '\0') { + continue; + } + + // Loop through all active devices and see if we match one + for (uint i = 0; i 
< devices.size(); ++i) { + if (static_cast(devices[i])->ipLevel() == ipLevel) { + foundActive = true; + break; + } + } + + // Don't report an offline device if it's active + if (foundActive) { + continue; + } + + NullDevice* dev = new NullDevice(); + if (nullptr != dev) { + if (!dev->create(ipLevel)) { + delete dev; + } + else { + dev->registerDevice(); + } + } + } + + return true; +} + +bool +NullDevice::create(Pal::GfxIpLevel ipLevel) +{ + online_ = false; + Pal::DeviceProperties properties = {}; + + // Use fake GFX IP for the device init + ipLevel_ = ipLevel; + properties.gfxLevel = ipLevel; + hwInfo_ = &DeviceInfo[static_cast(ipLevel)]; + + settings_ = new pal::Settings(); + pal::Settings* palSettings = reinterpret_cast(settings_); + + // Report 512MB of local memory for all offline devices; every other heap is zeroed because fillDeviceInfo() reads those entries too + Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount] = {}; + heaps[Pal::GpuHeapLocal].heapSize = 512 * Mi; + + // Create setting for the offline target + if ((palSettings == nullptr) || !palSettings->create(properties, heaps)) { + return false; + } + + // Fill the device info structure + fillDeviceInfo(properties, heaps, 4096, 1); + + // Runtime doesn't know what local size could be on the real board + info_.maxGlobalVariableSize_ = static_cast(512 * Mi); + + return true; +} + +device::Program* +NullDevice::createProgram(amd::option::Options* options) +{ + device::Program* nullProgram = nullptr; + if (settings().hsail_) { + nullProgram = new HSAILProgram(*this); + } + else { + // AMDIL path is not supported: nullProgram stays nullptr + ShouldNotReachHere(); + } + if (nullProgram == nullptr) { + LogError("Memory allocation has failed!"); + } + + return nullProgram; +} + +void NullDevice::fillDeviceInfo( + const Pal::DeviceProperties& palProp, + const Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount], + size_t maxTextureSize, + uint numComputeRings) +{ + info_.type_ = CL_DEVICE_TYPE_GPU; + info_.vendorId_ = palProp.vendorId; + + info_.maxWorkItemDimensions_ = 3; + info_.maxComputeUnits_ = + palProp.gfxipProperties.engineCore.numOfShaderEngines 
* + palProp.gfxipProperties.engineCore.numOfShaderArrays * + palProp.gfxipProperties.engineCore.numOfCUsPerShaderArray; + info_.numberOfShaderEngines = palProp.gfxipProperties.engineCore.numOfShaderEngines; + + // SI parts are scalar. Also, reads don't need to be 128-bits to get peak rates. + // For example, float4 is not faster than float as long as all threads fetch the same + // amount of data and the reads are coalesced. This is from the H/W team and confirmed + // through experimentation. May also be true on EG/NI, but no point in confusing + // developers now. + info_.nativeVectorWidthChar_ = info_.preferredVectorWidthChar_ = 4; + info_.nativeVectorWidthShort_ = info_.preferredVectorWidthShort_ = 2; + info_.nativeVectorWidthInt_ = info_.preferredVectorWidthInt_ = 1; + info_.nativeVectorWidthLong_ = info_.preferredVectorWidthLong_ = 1; + info_.nativeVectorWidthFloat_ = info_.preferredVectorWidthFloat_ = 1; + info_.nativeVectorWidthDouble_ = info_.preferredVectorWidthDouble_ = + (settings().checkExtension(ClKhrFp64)) ? 1 : 0; + info_.nativeVectorWidthHalf_ = info_.preferredVectorWidthHalf_ = 0; // no half support + + info_.maxClockFrequency_ = (palProp.gfxipProperties.performance.maxGpuClock != 0) ? 
+ palProp.gfxipProperties.performance.maxGpuClock : 555; + info_.maxParameterSize_ = 1024; + info_.minDataTypeAlignSize_ = sizeof(cl_long16); + info_.singleFPConfig_ = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO + | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_FMA; + + if (settings().singleFpDenorm_) { + info_.singleFPConfig_ |= CL_FP_DENORM; + } + + if (settings().checkExtension(ClKhrFp64)) { + info_.doubleFPConfig_ = info_.singleFPConfig_ | CL_FP_DENORM; + } + + if (settings().reportFMA_) { + info_.singleFPConfig_ |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; + } + + info_.globalMemCacheLineSize_ = settings().cacheLineSize_; + info_.globalMemCacheSize_ = settings().cacheSize_; + if ((settings().cacheLineSize_ != 0) || (settings().cacheSize_ != 0)) { + info_.globalMemCacheType_ = CL_READ_WRITE_CACHE; + } + else { + info_.globalMemCacheType_ = CL_NONE; + } + + uint64_t localRAM = heaps[Pal::GpuHeapLocal].heapSize + + heaps[Pal::GpuHeapInvisible].heapSize; +#if defined(ATI_OS_LINUX) + info_.globalMemSize_ = + (static_cast(std::min(GPU_MAX_HEAP_SIZE, 100u)) * + // globalMemSize is the actual available size for app on Linux + // Because Linux base driver doesn't support paging + static_cast(memInfo.cardMemAvailableBytes + memInfo.cardExtMemAvailableBytes) / 100u); +#else + info_.globalMemSize_ = + (static_cast(std::min(GPU_MAX_HEAP_SIZE, 100u)) * + static_cast(localRAM) / 100u); +#endif + if (settings().apuSystem_) { + info_.globalMemSize_ += + (static_cast(heaps[Pal::GpuHeapGartUswc].heapSize) * Mi * 75)/100; + } + + // Find the largest heap form FB memory + info_.maxMemAllocSize_ = std::max( + cl_ulong(heaps[Pal::GpuHeapLocal].heapSize), + cl_ulong(heaps[Pal::GpuHeapInvisible].heapSize)); + +#if defined(ATI_OS_WIN) + if (settings().apuSystem_) { + info_.maxMemAllocSize_ = std::max( + (static_cast(heaps[Pal::GpuHeapGartUswc].heapSize) * Mi * 75)/100, + info_.maxMemAllocSize_); + } +#endif + info_.maxMemAllocSize_ = cl_ulong(info_.maxMemAllocSize_ * + 
std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u); + + //! \note Force max single allocation size. + //! 4GB limit for the blit kernels and 64 bit optimizations. + info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_, + static_cast(settings().maxAllocSize_)); + + if (info_.maxMemAllocSize_ < cl_ulong(128 * Mi)) { + LogError("We are unable to get a heap large enough to support the OpenCL minimum "\ + "requirement for FULL_PROFILE"); + } + + info_.maxMemAllocSize_ = std::max(cl_ulong(128 * Mi), info_.maxMemAllocSize_); + + // Clamp max single alloc size to the globalMemSize since it's + // reduced by default + info_.maxMemAllocSize_ = std::min(info_.maxMemAllocSize_, info_.globalMemSize_); + + // We need to verify that we are not reporting more global memory + // that 4x single alloc + info_.globalMemSize_ = std::min( 4 * info_.maxMemAllocSize_, info_.globalMemSize_); + + // Use 64 bit pointers + if (settings().use64BitPtr_) { + info_.addressBits_ = 64; + } + else { + info_.addressBits_ = 32; + // Limit total size with 3GB for 32 bit + info_.globalMemSize_ = std::min(info_.globalMemSize_, cl_ulong(3 * Gi)); + } + + // Alignment in BITS of the base address of any allocated memory object + static const size_t MemBaseAlignment = 256; + //! @note Force 256 bytes alignment, since currently + //! calAttr.surface_alignment returns 4KB. For pinned memory runtime + //! 
should be able to create a view with 256 bytes alignement + info_.memBaseAddrAlign_ = 8 * MemBaseAlignment; + + info_.maxConstantBufferSize_ = 64 * Ki; + info_.maxConstantArgs_ = MaxConstArguments; + + // Image support fields + if (settings().imageSupport_) { + info_.imageSupport_ = CL_TRUE; + info_.maxSamplers_ = MaxSamplers; + info_.maxReadImageArgs_ = MaxReadImage; + info_.maxWriteImageArgs_ = MaxWriteImage; + info_.image2DMaxWidth_ = maxTextureSize; + info_.image2DMaxHeight_ = maxTextureSize; + info_.image3DMaxWidth_ = std::min(2 * Ki, maxTextureSize); + info_.image3DMaxHeight_ = std::min(2 * Ki, maxTextureSize); + info_.image3DMaxDepth_ = std::min(2 * Ki, maxTextureSize); + + info_.imagePitchAlignment_ = 1; // PAL uses LINEAR_GENERAL + info_.imageBaseAddressAlignment_ = 256; // XXX: 256 byte base address alignment for now + + info_.bufferFromImageSupport_ = CL_TRUE; + } + + info_.errorCorrectionSupport_ = CL_FALSE; + + if (settings().apuSystem_) { + info_.hostUnifiedMemory_ = CL_TRUE; + } + + info_.profilingTimerResolution_ = 1; + info_.profilingTimerOffset_ = amd::Os::offsetToEpochNanos(); + info_.littleEndian_ = CL_TRUE; + info_.available_ = CL_TRUE; + info_.compilerAvailable_ = CL_TRUE; + info_.linkerAvailable_ = CL_TRUE; + + info_.executionCapabilities_ = CL_EXEC_KERNEL; + info_.preferredPlatformAtomicAlignment_ = 0; + info_.preferredGlobalAtomicAlignment_ = 0; + info_.preferredLocalAtomicAlignment_ = 0; + info_.queueProperties_ = CL_QUEUE_PROFILING_ENABLE; + + info_.platform_ = AMD_PLATFORM; + + ::strcpy(info_.name_, hwInfo()->targetName_); + ::strcpy(info_.vendor_, "Advanced Micro Devices, Inc."); + ::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1, + AMD_BUILD_STRING "%s", " (VM)"); + + info_.profile_ = "FULL_PROFILE"; + if (settings().oclVersion_ == OpenCL20) { + info_.version_ = "OpenCL 2.0 " AMD_PLATFORM_INFO; + info_.oclcVersion_ = "OpenCL C 2.0 "; + info_.spirVersions_ = "1.2"; + } + else if (settings().oclVersion_ == OpenCL12) { + 
info_.version_ = "OpenCL 1.2 " AMD_PLATFORM_INFO; + info_.oclcVersion_ = "OpenCL C 1.2 "; + info_.spirVersions_ = "1.2"; + } + else { + info_.version_ = "OpenCL 1.0 " AMD_PLATFORM_INFO; + info_.oclcVersion_ = "OpenCL C 1.0 "; + info_.spirVersions_ = ""; + LogError("Unknown version for support"); + } + + // Fill workgroup info size + info_.maxWorkGroupSize_ = settings().maxWorkGroupSize_; + info_.maxWorkItemSizes_[0] = info_.maxWorkGroupSize_; + info_.maxWorkItemSizes_[1] = info_.maxWorkGroupSize_; + info_.maxWorkItemSizes_[2] = info_.maxWorkGroupSize_; + + info_.localMemType_ = CL_LOCAL; + info_.localMemSize_ = settings().hwLDSSize_; + info_.extensions_ = getExtensionString(); + +/* + info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD; + info_.deviceTopology_.pcie.bus = (calAttr.pciTopologyInformation&(0xFF<<8))>>8; + info_.deviceTopology_.pcie.device = (calAttr.pciTopologyInformation&(0x1F<<3))>>3; + info_.deviceTopology_.pcie.function = (calAttr.pciTopologyInformation&0x07); + + ::strncpy(info_.boardName_, calAttr.boardName, sizeof(info_.boardName_)); +*/ + // OpenCL1.2 device info fields + info_.builtInKernels_ = ""; + info_.imageMaxBufferSize_ = MaxImageBufferSize; + info_.imageMaxArraySize_ = MaxImageArraySize; + info_.preferredInteropUserSync_ = true; + info_.printfBufferSize_ = PrintfDbg::WorkitemDebugSize * info().maxWorkGroupSize_; + + if (settings().oclVersion_ >= OpenCL20) { + info_.svmCapabilities_ = + (CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_BUFFER); + if (settings().svmAtomics_) { + info_.svmCapabilities_ |= CL_DEVICE_SVM_ATOMICS; + } + if (settings().svmFineGrainSystem_) { + info_.svmCapabilities_ |= CL_DEVICE_SVM_FINE_GRAIN_SYSTEM; + } + // OpenCL2.0 device info fields + info_.maxWriteImageArgs_ = MaxReadWriteImage; //!< For compatibility + info_.maxReadWriteImageArgs_ = MaxReadWriteImage; + + info_.maxPipePacketSize_ = info_.maxMemAllocSize_; + info_.maxPipeActiveReservations_ = 16; + info_.maxPipeArgs_ = 16; + 
+ info_.queueOnDeviceProperties_ = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE; + info_.queueOnDevicePreferredSize_ = 256 * Ki; + info_.queueOnDeviceMaxSize_ = 8 * Mi; + info_.maxOnDeviceQueues_ = 1; + info_.maxOnDeviceEvents_ = settings().numDeviceEvents_; + info_.globalVariablePreferredTotalSize_ = static_cast(info_.globalMemSize_); + //! \todo Remove % calculation. + //! Use 90% of max single alloc size. + //! Boards with max single alloc size around 4GB will fail allocations + info_.maxGlobalVariableSize_ = static_cast( + amd::alignDown(info_.maxMemAllocSize_ * 9 / 10, 256)); + } + + if (settings().checkExtension(ClAmdDeviceAttributeQuery)) { + info_.simdPerCU_ = hwInfo()->simdPerCU_; + info_.simdWidth_ = hwInfo()->simdWidth_; + info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_; + info_.wavefrontWidth_ = palProp.gfxipProperties.engineCore.wavefrontSize; + //info_.globalMemChannels_ = calAttr.memBusWidth / 32; + //info_.globalMemChannelBanks_ = calAttr.numMemBanks; + info_.globalMemChannelBankWidth_ = hwInfo()->memChannelBankWidth_; + info_.localMemSizePerCU_ = hwInfo()->localMemSizePerCU_; + info_.localMemBanks_ = hwInfo()->localMemBanks_; + info_.gfxipVersion_ = hwInfo()->gfxipVersion_; + info_.numAsyncQueues_ = numComputeRings; + info_.numRTQueues_ = 2; + info_.numRTCUs_ = 4; + info_.threadTraceEnable_ = settings().threadTraceEnable_; + } +} + +Device::XferBuffers::~XferBuffers() +{ + // Destroy temporary buffer for reads + for (const auto& buf : freeBuffers_) { + // CPU optimization: unmap staging buffer just once + if (!buf->desc().cardMemory_) { + buf->unmap(nullptr); + } + delete buf; + } + freeBuffers_.clear(); +} + +bool +Device::XferBuffers::create() +{ + Memory* xferBuf = nullptr; + bool result = false; + // Create a buffer object + xferBuf = new Memory(dev(), bufSize_); + + // Try to allocate memory for the transfer buffer + if ((nullptr == xferBuf) || !xferBuf->create(type_)) { + delete xferBuf; + xferBuf = 
nullptr; + LogError("Couldn't allocate a transfer buffer!"); + } + else { + result = true; + freeBuffers_.push_back(xferBuf); + // CPU optimization: map staging buffer just once + if (!xferBuf->desc().cardMemory_) { + xferBuf->map(nullptr); + } + } + + return result; +} + +Memory& +Device::XferBuffers::acquire() +{ + Memory* xferBuf = nullptr; + size_t listSize; + + // Lock the operations with the staged buffer list + amd::ScopedLock l(lock_); + listSize = freeBuffers_.size(); + + // If the list is empty, then attempt to allocate a staged buffer + if (listSize == 0) { + // Allocate memory + xferBuf = new Memory(dev(), bufSize_); + + // Allocate memory for the transfer buffer + if ((nullptr == xferBuf) || !xferBuf->create(type_)) { + delete xferBuf; + xferBuf = nullptr; + LogError("Couldn't allocate a transfer buffer!"); + } + else { + ++acquiredCnt_; + // CPU optimization: map staging buffer just once + if (!xferBuf->desc().cardMemory_) { + xferBuf->map(nullptr); + } + } + } + + if (xferBuf == nullptr) { + xferBuf = *(freeBuffers_.begin()); + freeBuffers_.erase(freeBuffers_.begin()); + ++acquiredCnt_; + } + + return *xferBuf; +} + +void +Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer) +{ + // Make sure buffer isn't busy on the current VirtualGPU, because + // the next aquire can come from different queue + buffer.wait(gpu); + // Lock the operations with the staged buffer list + amd::ScopedLock l(lock_); + freeBuffers_.push_back(&buffer); + --acquiredCnt_; +} + + +Device::ScopedLockVgpus::ScopedLockVgpus(const Device& dev) + : dev_(dev) +{ + // Lock the virtual GPU list + dev_.vgpusAccess()->lock(); + + // Find all available virtual GPUs and lock them + // from the execution of commands + for (uint idx = 0; idx < dev_.vgpus().size(); ++idx) { + dev_.vgpus()[idx]->execution().lock(); + } +} + +Device::ScopedLockVgpus::~ScopedLockVgpus() +{ + // Find all available virtual GPUs and unlock them + // for the execution of commands + for (uint idx = 0; idx < 
dev_.vgpus().size(); ++idx) { + dev_.vgpus()[idx]->execution().unlock(); + } + + // Unock the virtual GPU list + dev_.vgpusAccess()->unlock(); +} + +Device::Device() + : NullDevice() + , numOfVgpus_(0) + , context_(nullptr) + , lockAsyncOps_(nullptr) + , lockForInitHeap_(nullptr) + , lockPAL_(nullptr) + , vgpusAccess_(nullptr) + , scratchAlloc_(nullptr) + , mapCacheOps_(nullptr) + , xferRead_(nullptr) + , xferWrite_(nullptr) + , vaCacheAccess_(nullptr) + , vaCacheList_(nullptr) + , mapCache_(nullptr) + , resourceCache_(nullptr) + , numComputeEngines_(0) + , numDmaEngines_(0) + , heapInitComplete_(false) + , xferQueue_(nullptr) + , globalScratchBuf_(nullptr) + , srdManager_(nullptr) +{ +} + +Device::~Device() +{ + // remove the HW debug manager + delete hwDebugMgr_; + hwDebugMgr_ = nullptr; + + CondLog(vaCacheList_ == nullptr || + (vaCacheList_->size() != 0), "Application didn't unmap all host memory!"); + + delete srdManager_; + + for (uint s = 0; s < scratch_.size(); ++s) { + delete scratch_[s]; + scratch_[s] = nullptr; + } + + delete globalScratchBuf_; + globalScratchBuf_ = nullptr; + + // Destroy transfer queue + delete xferQueue_; + + // Destroy blit program + delete blitProgram_; + + // Release cached map targets + for (uint i = 0; mapCache_ != nullptr && i < mapCache_->size(); ++i) { + if ((*mapCache_)[i] != nullptr) { + (*mapCache_)[i]->release(); + } + } + delete mapCache_; + + // Destroy temporary buffers for read/write + delete xferRead_; + delete xferWrite_; + + // Destroy resource cache + delete resourceCache_; + + delete lockAsyncOps_; + delete lockForInitHeap_; + delete lockPAL_; + delete vgpusAccess_; + delete scratchAlloc_; + delete mapCacheOps_; + delete vaCacheAccess_; + delete vaCacheList_; + + if (context_ != nullptr) { + context_->release(); + } + + device_ = nullptr; +} + +extern const char* SchedulerSourceCode; + +bool +Device::create(Pal::IDevice* device) +{ + appProfile_.init(); + device_ = device; + Pal::Result result; + + // Retrive 
device properties + result = iDev()->GetProperties(&properties_); + + // Save the IP level for the offline detection + ipLevel_ = properties().gfxLevel; + + // Update HW info for the device + hwInfo_ = &DeviceInfo[static_cast(properties().revision)]; + + Pal::PalPublicSettings*const palSettings = iDev()->GetPublicSettings(); + // Modify settings here + // palSettings ... + palSettings->textureOptLevel = Pal::TextureFilterOptimizationsDisabled; + // Commit the new settings for the device + result = iDev()->CommitSettingsAndInit(); + if (result == Pal::Result::Success) { + Pal::DeviceFinalizeInfo finalizeInfo = {}; + + // Request 2 compute engines + finalizeInfo.engineCounts[Pal::QueueTypeCompute] = 2; + // Request 2 SDMA engines + finalizeInfo.engineCounts[Pal::QueueTypeDma] = 2; + + result = iDev()->Finalize(finalizeInfo); + } + + Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount]; + iDev()->GetGpuMemoryHeapProperties(heaps); + + // Creates device settings + settings_ = new pal::Settings(); + pal::Settings* gpuSettings = reinterpret_cast(settings_); + if ((gpuSettings == nullptr) || !gpuSettings->create(properties(), heaps, + appProfile_.reportAsOCL12Device())) { + return false; + } + + // Find the number of available engines + numComputeEngines_ = + properties().engineProperties[Pal::QueueTypeCompute].engineCount; + numDmaEngines_ = + properties().engineProperties[Pal::QueueTypeDma].engineCount; + numComputeEngines_ = std::min(numComputeEngines_, settings().numComputeRings_); + + amd::Context::Info info = {0}; + std::vector devices; + devices.push_back(this); + + // Create a dummy context + context_ = new amd::Context(devices, info); + if (context_ == nullptr) { + return false; + } + + // Create the locks + lockAsyncOps_ = new amd::Monitor("Device Async Ops Lock", true); + if (nullptr == lockAsyncOps_) { + return false; + } + lockPAL_ = new amd::Monitor("PAL Ops Lock", true); + if (nullptr == lockPAL_) { + return false; + } + + lockForInitHeap_ = new 
amd::Monitor("Async Ops Lock For Initialization of Heap Resource", true); + if (nullptr == lockForInitHeap_) { + return false; + } + + vgpusAccess_ = new amd::Monitor("Virtual GPU List Ops Lock", true); + if (nullptr == vgpusAccess_) { + return false; + } + + scratchAlloc_ = new amd::Monitor("Scratch Allocation Lock", true); + if (nullptr == scratchAlloc_) { + return false; + } + + mapCacheOps_ = new amd::Monitor("Map Cache Lock", true); + if (nullptr == mapCacheOps_) { + return false; + } + + vaCacheAccess_ = new amd::Monitor("VA Cache Ops Lock", true); + if (nullptr == vaCacheAccess_) { + return false; + } + vaCacheList_ = new std::list(); + if (nullptr == vaCacheList_) { + return false; + } + + mapCache_ = new std::vector(); + if (mapCache_ == nullptr) { + return false; + } + // Use just 1 entry by default for the map cache + mapCache_->push_back(nullptr); + + size_t resourceCacheSize = settings().resourceCacheSize_; + +#ifdef DEBUG + std::stringstream message; + if (settings().remoteAlloc_) { + message << "Using *Remote* memory"; + } + else { + message << "Using *Local* memory"; + } + + message << std::endl; + LogInfo(message.str().c_str()); +#endif // DEBUG + + // Create resource cache. 
+ // \note Cache must be created before any resource creation to avoid nullptr check + resourceCache_ = new ResourceCache(resourceCacheSize); + if (nullptr == resourceCache_) { + return false; + } + + // Fill the device info structure + fillDeviceInfo(properties(), heaps, 16*Ki, numComputeEngines()); + + for (uint i = 0; i < Pal::GpuHeap::GpuHeapCount; ++i) { + freeMem[i] = heaps[i].heapSize; + } + + // Allocate SRD manager + srdManager_ = new SrdManager(*this, + std::max(HsaImageObjectSize, HsaSamplerObjectSize), 64 * Ki); + if (srdManager_ == nullptr) { + return false; + } + + // create the HW debug manager if needed + if (settings().enableHwDebug_) { + hwDebugMgr_ = new GpuDebugManager(this); + } + + return true; +} + +bool +Device::initializeHeapResources() +{ + amd::ScopedLock k(lockForInitHeap_); + if (!heapInitComplete_) { + heapInitComplete_ = true; + + scratch_.resize((settings().useSingleScratch_) ? + 1 : (numComputeEngines() ? numComputeEngines() : 1)); + + // Initialize the number of mem object for the scratch buffer + for (uint s = 0; s < scratch_.size(); ++s) { + scratch_[s] = new ScratchBuffer(); + if (nullptr == scratch_[s]) { + return false; + } + } + + if (settings().stagedXferSize_ != 0) { + // Initialize staged write buffers + if (settings().stagedXferWrite_) { + Resource::MemoryType type; + if (settings().stagingWritePersistent_ && !settings().disablePersistent_) { + type = Resource::Persistent; + } else { + type = Resource::RemoteUSWC; + } + xferWrite_ = new XferBuffers(*this, type, + amd::alignUp(settings().stagedXferSize_, 4 * Ki)); + if ((xferWrite_ == nullptr) || !xferWrite_->create()) { + LogError("Couldn't allocate transfer buffer objects for read"); + return false; + } + } + + // Initialize staged read buffers + if (settings().stagedXferRead_) { + xferRead_ = new XferBuffers(*this, Resource::Remote, + amd::alignUp(settings().stagedXferSize_, 4 * Ki)); + if ((xferRead_ == nullptr) || !xferRead_->create()) { + LogError("Couldn't allocate 
transfer buffer objects for write"); + return false; + } + } + } + + // Delay compilation due to brig_loader memory allocation + if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) { + const char* scheduler = nullptr; + const char* ocl20 = nullptr; + if (settings().oclVersion_ == OpenCL20) { + scheduler = SchedulerSourceCode; + ocl20 = "-cl-std=CL2.0"; + } + blitProgram_ = new BlitProgram(context_); + // Create blit programs + if (blitProgram_ == nullptr || + !blitProgram_->create(this, scheduler, ocl20)) { + delete blitProgram_; + blitProgram_ = nullptr; + LogError("Couldn't create blit kernels!"); + return false; + } + } + + // Create a synchronized transfer queue + xferQueue_ = new VirtualGPU(*this); + if (!(xferQueue_ && xferQueue_->create( + false + ))) { + delete xferQueue_; + xferQueue_ = nullptr; + } + if (nullptr == xferQueue_) { + LogError("Couldn't create the device transfer manager!"); + return false; + } + xferQueue_->enableSyncedBlit(); + } + return true; +} + +device::VirtualDevice* +Device::createVirtualDevice( + amd::CommandQueue* queue + ) +{ + bool profiling = false; + bool interopQueue = false; + uint rtCUs = 0; + uint deviceQueueSize = 0; + + if (queue != nullptr) { + profiling = queue->properties().test(CL_QUEUE_PROFILING_ENABLE); + if (queue->asHostQueue() != nullptr) { + interopQueue = (0 != (queue->context().info().flags_ & + (amd::Context::GLDeviceKhr | + amd::Context::D3D10DeviceKhr | + amd::Context::D3D11DeviceKhr))); + rtCUs = queue->rtCUs(); + } + else if (queue->asDeviceQueue() != nullptr) { + deviceQueueSize = queue->asDeviceQueue()->size(); + } + } + + // Not safe to add a queue. So lock the device + amd::ScopedLock k(lockAsyncOps()); + amd::ScopedLock lock(vgpusAccess()); + + // Initialization of heap and other resources occur during the command queue creation time. 
+ if (!initializeHeapResources()) { + LogError("Heap initializaiton fails!"); + return nullptr; + } + + VirtualGPU* vgpu = new VirtualGPU(*this); + if (vgpu && vgpu->create( + profiling + , deviceQueueSize + )) { + return vgpu; + } else { + delete vgpu; + return nullptr; + } +} + +device::Program* +Device::createProgram(amd::option::Options* options) +{ + device::Program* gpuProgram; + if (settings().hsail_) { + gpuProgram = new HSAILProgram(*this); + } + else { + ShouldNotReachHere(); + //AMDIL + //gpuProgram = new Program(*this); + } + if (gpuProgram == nullptr) { + LogError("We failed memory allocation for program!"); + } + + return gpuProgram; +} + +//! Requested devices list as configured by the GPU_DEVICE_ORDINAL +typedef std::map requestedDevices_t; + +//! Parses the requested list of devices to be exposed to the user. +static void +parseRequestedDeviceList(requestedDevices_t &requestedDevices) { + char *pch = nullptr; + int requestedDeviceCount = 0; + const char* requestedDeviceList = GPU_DEVICE_ORDINAL; + + pch = strtok(const_cast(requestedDeviceList), ","); + while (pch != nullptr) { + bool deviceIdValid = true; + int currentDeviceIndex = atoi(pch); + // Validate device index. + for (size_t i = 0; i < strlen(pch); i++) { + if (!isdigit(pch[i])) { + deviceIdValid = false; + break; + } + } + if (currentDeviceIndex < 0) { + deviceIdValid = false; + } + // Get next token. + pch = strtok(nullptr, ","); + if (!deviceIdValid) { + continue; + } + + // Requested device is valid. 
+ requestedDevices[currentDeviceIndex] = true; + } +} + +#if defined(_WIN32) && defined (DEBUG) +#include +#include +static int reportHook(int reportType, char *message, int *returnValue) +{ + fprintf(stderr, "%s", message); + ::exit(3); + return 1; +} +#endif // _WIN32 & DEBUG + +static char* platformObj; +static Pal::IPlatform* platform; + +bool +Device::init() +{ + uint32_t numDevices = 0; + bool useDeviceList = false; + requestedDevices_t requestedDevices; + + const char* library = getenv("HSA_COMPILER_LIBRARY"); + aclCompilerOptions opts = { + sizeof(aclCompilerOptions_0_8), + library, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + AMD_OCL_SC_LIB + }; + // Initialize the compiler handle + acl_error error; + compiler_ = aclCompilerInit(&opts, &error); + if (error != ACL_SUCCESS) { + LogError("Error initializing the compiler"); + return false; + } + + size_t size = Pal::GetPlatformSize(); + platformObj = new char[size]; + Pal::PlatformCreateInfo info = {}; + info.pSettingsPath = "OCL"; + + // PAL init + if (Pal::Result::Success != + Pal::CreatePlatform(info, platformObj, &platform)) { + return false; + } + + // Get the total number of active devices + // Count up all the devices in the system. 
+ Pal::IDevice* deviceList[Pal::MaxDevices] = {}; + platform->EnumerateDevices(&numDevices, &deviceList[0]); + + uint ordinal = 0; + const char* selectDeviceByName = nullptr; + if (!flagIsDefault(GPU_DEVICE_ORDINAL)) { + useDeviceList = true; + parseRequestedDeviceList(requestedDevices); + } + else if (!flagIsDefault(GPU_DEVICE_NAME)) { + selectDeviceByName = GPU_DEVICE_NAME; + } + + // Loop through all active devices and initialize the device info structure + for (; ordinal < numDevices; ++ordinal) { + // Create the GPU device object + Device *d = new Device(); + bool result = (nullptr != d) && d->create(deviceList[ordinal]); + if (useDeviceList) { + result &= (requestedDevices.find(ordinal) != requestedDevices.end()); + } + if (result && + ((nullptr == selectDeviceByName) || ('\0' == selectDeviceByName[0]) || + (strstr(selectDeviceByName, d->info().name_) != nullptr))) { + d->registerDevice(); + } + else { + delete d; + } + } + return true; +} + +void +Device::tearDown() +{ + platform->Destroy(); + delete platformObj; + + if (compiler_ != nullptr) { + aclCompilerFini(compiler_); + } +} + +Memory* +Device::getGpuMemory(amd::Memory* mem) const +{ + return static_cast(mem->getDeviceMemory(*this)); +} + +const device::BlitManager& +Device::xferMgr() const +{ + return xferQueue_->blitMgr(); +} + +Pal::Format +Device::getPalFormat(const amd::Image::Format& format, Pal::ChannelMapping* channel) const +{ + // Find PAL format + for (uint i = 0; i < sizeof(MemoryFormatMap) / sizeof(MemoryFormat); ++i) { + if ((format.image_channel_data_type == + MemoryFormatMap[i].clFormat_.image_channel_data_type) && + (format.image_channel_order == + MemoryFormatMap[i].clFormat_.image_channel_order)) { + *channel = MemoryFormatMap[i].palChannel_; + return MemoryFormatMap[i].palFormat_; + } + } + assert(!"We didn't find PAL resource format!"); + *channel = MemoryFormatMap[0].palChannel_; + return MemoryFormatMap[0].palFormat_; +} + +// Create buffer without an owner (merge common code 
with createBuffer() ?) +pal::Memory* +Device::createScratchBuffer(size_t size) const +{ + Memory* gpuMemory = nullptr; + + // Create a memory object + gpuMemory = new pal::Memory(*this, size); + if (nullptr == gpuMemory || !gpuMemory->create(Resource::Local)) { + delete gpuMemory; + gpuMemory = nullptr; + } + + return gpuMemory; +} + +pal::Memory* +Device::createBuffer( + amd::Memory& owner, + bool directAccess) const +{ + size_t size = owner.getSize(); + pal::Memory* gpuMemory; + + // Create resource + bool result = false; + + if (owner.getType() == CL_MEM_OBJECT_PIPE) { + // directAccess isnt needed as Pipes shouldnt be host accessible for GPU + directAccess = false; + } + + if (nullptr != owner.parent()) { + pal::Memory* gpuParent = getGpuMemory(owner.parent()); + if (nullptr == gpuParent) { + LogError("Can't get the owner object for subbuffer allocation"); + return nullptr; + } + + return gpuParent->createBufferView(owner); + } + + Resource::MemoryType type = (owner.forceSysMemAlloc() || (owner.getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) ? + Resource::Remote : Resource::Local; + + if (owner.getMemFlags() & CL_MEM_BUS_ADDRESSABLE_AMD) { + type = Resource::BusAddressable; + } + else if (owner.getMemFlags() & CL_MEM_EXTERNAL_PHYSICAL_AMD) { + type = Resource::ExternalPhysical; + } + + // Use direct access if it's possible + bool remoteAlloc = false; + // Internal means VirtualDevice!=nullptr + bool internalAlloc = ((owner.getMemFlags() & CL_MEM_USE_HOST_PTR) && + (owner.getVirtualDevice() != nullptr)) ? 
true : false; + + // Create a memory object + gpuMemory = new pal::Buffer(*this, owner, owner.getSize()); + if (nullptr == gpuMemory) { + return nullptr; + } + + // Check if owner is interop memory + if (owner.isInterop()) { + result = gpuMemory->createInterop(Memory::InteropDirectAccess); + } + else if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { + // Attempt to allocate from persistent heap + result = gpuMemory->create(Resource::Persistent); + } + else if (directAccess || (type == Resource::Remote)) { + // Check for system memory allocations + if ((owner.getMemFlags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_HOST_PTR)) + || (settings().remoteAlloc_)) { + // Allocate remote memory if AHP allocation and context has just 1 device + if ((owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR) && + (owner.getContext().devices().size() == 1)) { + if (owner.getMemFlags() & (CL_MEM_READ_ONLY | + CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) { + // GPU will be reading from this host memory buffer, + // so assume Host write into it + type = Resource::RemoteUSWC; + remoteAlloc = true; + } + } + // Make sure owner has a valid hostmem pointer and it's not COPY + if (!remoteAlloc && (owner.getHostMem() != nullptr)) { + Resource::PinnedParams params; + params.owner_ = &owner; + params.gpu_ = + reinterpret_cast(owner.getVirtualDevice()); + + params.hostMemRef_ = owner.getHostMemRef(); + params.size_ = owner.getHostMemRef()->size(); + if (0 == params.size_) { + params.size_ = owner.getSize(); + } + // Create memory object + result = gpuMemory->create(Resource::Pinned, ¶ms); + + // If direct access failed + if (!result) { + // Don't use cached allocation + // if size is biger than max single alloc + if (owner.getSize() > info().maxMemAllocSize_) { + delete gpuMemory; + return nullptr; + } + } + } + } + } + + if (!result && + // Make sure it's not internal alloc + !internalAlloc) { + Resource::CreateParams params; + params.owner_ = &owner; + params.gpu_ = 
static_cast(owner.getVirtualDevice()); + + // Create memory object + result = gpuMemory->create(type, ¶ms); + + // If allocation was successful + if (result) { + // Initialize if the memory is a pipe object + if (owner.getType() == CL_MEM_OBJECT_PIPE) { + // Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure. + // Init with 3 DWORDS for 32bit addressing and 6 DWORDS for 64bit + size_t pipeInit[3] = {0 , 0, owner.asPipe()->getMaxNumPackets()}; + gpuMemory->writeRawData(*xferQueue_, sizeof(pipeInit), pipeInit, true); + } + // If memory has direct access from host, then get CPU address + if (gpuMemory->isHostMemDirectAccess() && + (type != Resource::ExternalPhysical)) { + void* address = gpuMemory->map(nullptr); + if (address != nullptr) { + // Copy saved memory + if (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) { + memcpy(address, owner.getHostMem(), owner.getSize()); + } + // It should be safe to change the host memory pointer, + // because it's lock protected from the upper caller + owner.setHostMem(address); + } + else { + result = false; + } + } + // An optimization for CHP. 
Copy memory and destroy sysmem allocation + else if ((gpuMemory->memoryType() != Resource::Pinned) && + (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) && + (owner.getContext().devices().size() == 1)) { + amd::Coord3D origin(0, 0, 0); + amd::Coord3D region(owner.getSize()); + static const bool Entire = true; + if (xferMgr().writeBuffer(owner.getHostMem(), + *gpuMemory, origin, region, Entire)) { + // Clear CHP memory + owner.setHostMem(nullptr); + } + } + } + } + + if (!result) { + delete gpuMemory; + return nullptr; + } + + return gpuMemory; +} + +pal::Memory* +Device::createImage(amd::Memory& owner, bool directAccess) const +{ + size_t size = owner.getSize(); + amd::Image& image = *owner.asImage(); + pal::Memory* gpuImage = nullptr; + + if ((nullptr != owner.parent()) && (owner.parent()->asImage() != nullptr)) { + device::Memory* devParent = owner.parent()->getDeviceMemory(*this); + if (nullptr == devParent) { + LogError("Can't get the owner object for image view allocation"); + return nullptr; + } + // Create a view on the specified device + gpuImage = (pal::Memory*)createView(owner, *devParent); + if ((nullptr != gpuImage) && (gpuImage->owner() != nullptr)) { + gpuImage->owner()->setHostMem((address)(owner.parent()->getHostMem()) + gpuImage->owner()->getOrigin()); + } + return gpuImage; + } + + gpuImage = new pal::Image(*this, owner, + image.getWidth(), + image.getHeight(), + image.getDepth(), + image.getImageFormat(), + image.getType(), + image.getMipLevels()); + + // Create resource + if (nullptr != gpuImage) { + const bool imageBuffer = + ((owner.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) || + ((owner.getType() == CL_MEM_OBJECT_IMAGE2D) && + (owner.parent() != nullptr) && + (owner.parent()->asBuffer() != nullptr))); + bool result = false; + + // Check if owner is interop memory + if (owner.isInterop()) { + result = gpuImage->createInterop(Memory::InteropDirectAccess); + } + else if (imageBuffer) { + Resource::ImageBufferParams params; + pal::Memory* buffer = 
reinterpret_cast + (image.parent()->getDeviceMemory(*this)); + if (buffer == nullptr) { + LogError("Buffer creation for ImageBuffer failed!"); + delete gpuImage; + return nullptr; + } + params.owner_ = &owner; + params.resource_ = buffer; + params.memory_ = buffer; + + // Create memory object + result = gpuImage->create(Resource::ImageBuffer, ¶ms); + } + else if (directAccess && (owner.getMemFlags() & CL_MEM_ALLOC_HOST_PTR)) { + Resource::PinnedParams params; + params.owner_ = &owner; + params.hostMemRef_ = owner.getHostMemRef(); + params.size_ = owner.getHostMemRef()->size(); + + // Create memory object + result = gpuImage->create(Resource::Pinned, ¶ms); + } + + if (!result && !owner.isInterop()) { + if (owner.getMemFlags() & CL_MEM_USE_PERSISTENT_MEM_AMD) { + // Attempt to allocate from persistent heap + result = gpuImage->create(Resource::Persistent); + } + else { + Resource::MemoryType type = (owner.forceSysMemAlloc()) ? + Resource::RemoteUSWC : Resource::Local; + // Create memory object + result = gpuImage->create(type); + } + } + + if (!result) { + delete gpuImage; + return nullptr; + } + else if ((gpuImage->memoryType() != Resource::Pinned) && + (owner.getMemFlags() & CL_MEM_COPY_HOST_PTR) && + (owner.getContext().devices().size() == 1)) { + // Ignore copy for image1D_buffer, since it was already done for buffer + if (imageBuffer) { + // Clear CHP memory + owner.setHostMem(nullptr); + } + else { + amd::Coord3D origin(0, 0, 0); + static const bool Entire = true; + if (xferMgr().writeImage(owner.getHostMem(), + *gpuImage, origin, image.getRegion(), 0, 0, Entire)) { + // Clear CHP memory + owner.setHostMem(nullptr); + } + } + } + + if (result) { + size_t bytePitch = gpuImage->elementSize() * gpuImage->desc().width_; + image.setBytePitch(bytePitch); + } + } + + return gpuImage; +} + +//! 
Allocates cache memory on the card +device::Memory* +Device::createMemory( + amd::Memory& owner) const +{ + bool directAccess = false; + pal::Memory* memory = nullptr; + + if (owner.asBuffer()) { + directAccess = (settings().hostMemDirectAccess_ & Settings::HostMemBuffer) + ? true : false; + memory = createBuffer(owner, directAccess); + } + else if (owner.asImage()) { + directAccess = (settings().hostMemDirectAccess_ & Settings::HostMemImage) + ? true : false; + memory = createImage(owner, directAccess); + } + else { + LogError("Unknown memory type!"); + } + + // Attempt to pin system memory if runtime didn't use direct access + if ((memory != nullptr) && + (memory->memoryType() != Resource::Pinned) && + (memory->memoryType() != Resource::Remote) && + (memory->memoryType() != Resource::RemoteUSWC) && + (memory->memoryType() != Resource::ExternalPhysical) && + ((owner.getHostMem() != nullptr) || + ((nullptr != owner.parent()) && (owner.getHostMem() != nullptr)))) { + bool ok = memory->pinSystemMemory( + owner.getHostMem(), (owner.getHostMemRef()->size()) ? + owner.getHostMemRef()->size() : owner.getSize()); + //! \note: Ignore the pinning result for now + } + + return memory; +} + +bool +Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) const +{ + *sampler = nullptr; + if (settings().hsail_ || (settings().oclVersion_ >= OpenCL20)) { + Sampler* gpuSampler = new Sampler(*this); + if ((nullptr == gpuSampler) || !gpuSampler->create(owner)) { + delete gpuSampler; + return false; + } + *sampler = gpuSampler; + } + return true; +} + +//! \note reallocMemory() must be called only from outside of +//! VirtualGPU submit commands methods. +//! 
Otherwise a deadlock in lockVgpus() is possible + +bool +Device::reallocMemory(amd::Memory& owner) const +{ + bool directAccess = false; + + // For now we have to serialize reallocation code + amd::ScopedLock lk(*lockAsyncOps_); + + // Read device memory after the lock, + // since realloc from another thread can replace the pointer + pal::Memory* gpuMemory = getGpuMemory(&owner); + if (gpuMemory == nullptr) { + return false; + } + + if (gpuMemory->pinOffset() == 0) { + return true; + } + else if (nullptr != owner.parent()) { + if (!reallocMemory(*owner.parent())) { + return false; + } + } + + if (owner.asBuffer()) { + gpuMemory = createBuffer(owner, directAccess); + } + else if (owner.asImage()) { + return true; + } + else { + LogError("Unknown memory type!"); + } + + if (gpuMemory != nullptr) { + pal::Memory* newMemory = gpuMemory; + pal::Memory* oldMemory = getGpuMemory(&owner); + + // Transfer the object + if (oldMemory != nullptr) { + if (!oldMemory->moveTo(*newMemory)) { + delete newMemory; + return false; + } + } + + // Attempt to pin system memory + if ((newMemory->memoryType() != Resource::Pinned) && + ((owner.getHostMem() != nullptr) || + ((nullptr != owner.parent()) && (owner.getHostMem() != nullptr)))) { + bool ok = newMemory->pinSystemMemory( + owner.getHostMem(), (owner.getHostMemRef()->size()) ? + owner.getHostMemRef()->size() : owner.getSize()); + //! 
\note: Ignore the pinning result for now
+        }
+
+        return true;
+    }
+
+    return false;
+}
+
+// Creates an image view of the parent device memory for the owner image.
+// Returns the new view object, or nullptr if the view resource could not
+// be created.
+// NOTE(review): the template arguments of the two casts below are missing
+// in this copy of the patch (they look stripped by extraction) - restore
+// them from the upstream file.
+device::Memory*
+Device::createView(amd::Memory& owner, const device::Memory& parent) const
+{
+    size_t size = owner.getSize();
+    assert((owner.asImage() != nullptr) && "View supports images only");
+    const amd::Image& image = *owner.asImage();
+    pal::Memory* gpuImage = nullptr;
+
+    gpuImage = new pal::Image(*this, owner,
+        image.getWidth(),
+        image.getHeight(),
+        image.getDepth(),
+        image.getImageFormat(),
+        image.getType(),
+        image.getMipLevels());
+
+    // Create resource
+    if (nullptr != gpuImage) {
+        bool result = false;
+        Resource::ImageViewParams params;
+        const pal::Memory& gpuMem = static_cast(parent);
+
+        // The view starts at the image's base mip level, layer 0, and uses
+        // the parent both as the viewed resource and as the backing memory
+        params.owner_ = &owner;
+        params.level_ = image.getBaseMipLevel();
+        params.layer_ = 0;
+        params.resource_ = &gpuMem;
+        params.gpu_ = reinterpret_cast(owner.getVirtualDevice());
+        params.memory_ = &gpuMem;
+
+        // Create memory object (address of the local params structure;
+        // the text previously carried a mis-decoded pilcrow here)
+        result = gpuImage->create(Resource::ImageView, &params);
+        if (!result) {
+            delete gpuImage;
+            return nullptr;
+        }
+    }
+
+    return gpuImage;
+}
+
+
+//! 
Attempt to bind with external graphics API's device/context
+// Returns true when the association for the given context property type
+// succeeded, false on failure or for an unknown type.
+bool
+Device::bindExternalDevice(intptr_t type, void* pDevice, void* pContext, bool validateOnly)
+{
+    assert(pDevice);
+
+    switch (type) {
+#ifdef _WIN32
+    case CL_CONTEXT_D3D10_DEVICE_KHR:
+        if (!associateD3D10Device(pDevice)) {
+            LogError("Failed gslD3D10Associate()");
+            return false;
+        }
+        break;
+    case CL_CONTEXT_D3D11_DEVICE_KHR:
+        if (!associateD3D11Device(pDevice)) {
+            LogError("Failed gslD3D11Associate()");
+            return false;
+        }
+        break;
+    case CL_CONTEXT_ADAPTER_D3D9_KHR:
+    case CL_CONTEXT_ADAPTER_D3D9EX_KHR:
+        if (!associateD3D9Device(pDevice)) {
+            LogWarning("D3D9<->OpenCL adapter mismatch or D3D9Associate() failure");
+            return false;
+        }
+        break;
+    case CL_CONTEXT_ADAPTER_DXVA_KHR:
+        // Accepted without any association work
+        break;
+#endif //_WIN32
+    case CL_GL_CONTEXT_KHR:
+        // Attempt to associate GSL-OGL
+        if (!glAssociate(pContext, pDevice)) {
+            // The failure is only logged when this is not a validation pass
+            if (!validateOnly) {
+                LogError("Failed gslGLAssociate()");
+            }
+            return false;
+        }
+        break;
+    default:
+        LogError("Unknown external device!");
+        return false;
+        break;
+    }
+
+    return true;
+}
+
+// Undoes the GL association made by bindExternalDevice(). Every type other
+// than CL_GL_CONTEXT_KHR, and a null pDevice, is treated as success.
+bool
+Device::unbindExternalDevice(intptr_t type, void* pDevice, void* pContext, bool validateOnly)
+{
+    if (type != CL_GL_CONTEXT_KHR) {
+        return true;
+    }
+
+    if (pDevice != nullptr) {
+        // Dissociate GSL-OGL
+        if (!glDissociate(pContext, pDevice)) {
+            // NOTE(review): the warning is logged only when validateOnly is
+            // set, the opposite of bindExternalDevice() which logs only when
+            // it is clear - confirm which polarity is intended.
+            if (validateOnly) {
+                LogWarning("Failed gslGLDiassociate()");
+            }
+            return false;
+        }
+    }
+    return true;
+}
+
+// Reports free device memory through the two-element freeMemory array:
+// index 0 receives the total, index 1 the largest free block.
+bool
+Device::globalFreeMemory(size_t* freeMemory) const
+{
+    const uint TotalFreeMemory = 0;
+    const uint LargestFreeBlock = 1;
+
+    // Initialization of heap and other resources because getMemInfo needs it.
+    if (!(const_cast(this)->initializeHeapResources())) {
+        return false;
+    }
+
+    // Snapshot the free-memory counters of the local and invisible heaps
+    Pal::gpusize local = freeMem[Pal::GpuHeapLocal];
+    Pal::gpusize invisible = freeMem[Pal::GpuHeapInvisible];
+
+    // Fill free memory info (heap counters divided by Ki); the "largest
+    // block" is reported as the larger of the two heap counters
+    freeMemory[TotalFreeMemory] = static_cast((local + invisible) / Ki);
+    freeMemory[LargestFreeBlock] = static_cast(std::max(local, invisible) / Ki);
+
+    // On APU systems the GART USWC heap is counted as well
+    if (settings().apuSystem_) {
+        Pal::gpusize uswc = freeMem[Pal::GpuHeapGartUswc];
+        uswc /= Ki;
+        freeMemory[TotalFreeMemory] += static_cast(uswc);
+        if (freeMemory[LargestFreeBlock] < uswc) {
+            freeMemory[LargestFreeBlock] = static_cast(uswc);
+        }
+    }
+
+    return true;
+}
+
+// Registers the host address range of a direct-access memory object in the
+// VA cache; memory without direct host access is ignored.
+void
+Device::addVACache(Memory* memory) const
+{
+    // Make sure system memory has direct access
+    if (memory->isHostMemDirectAccess()) {
+        // VA cache access must be serialised
+        amd::ScopedLock lk(*vaCacheAccess_);
+        void* start = memory->owner()->getHostMem();
+        void* end = reinterpret_cast
(start) + memory->owner()->getSize();
+        size_t offset;
+        // Look the start address up first: a hit means the range is
+        // already registered
+        Memory* doubleMap = findMemoryFromVA(start, &offset);
+
+        if (doubleMap == nullptr) {
+            // Allocate a new entry
+            VACacheEntry* entry = new VACacheEntry(start, end, memory);
+            if (entry != nullptr) {
+                vaCacheList_->push_back(entry);
+            }
+        }
+        else {
+            LogError("Unexpected double map() call from the app!");
+        }
+    }
+}
+
+// Removes the VA cache entry whose start address matches the host memory
+// of the given object. Does nothing unless the memory has direct host
+// access and an owner.
+void
+Device::removeVACache(const Memory* memory) const
+{
+    // Make sure system memory has direct access
+    if (memory->isHostMemDirectAccess() && memory->owner()) {
+        // VA cache access must be serialised
+        amd::ScopedLock lk(*vaCacheAccess_);
+        void* start = memory->owner()->getHostMem();
+        void* end = reinterpret_cast
(start) + memory->owner()->getSize();
+
+        // Find VA cache entry for the specified memory
+        for (auto it = vaCacheList_->begin(); it != vaCacheList_->end(); ++it) {
+            VACacheEntry* entry = *it;
+            if (entry->startAddress_ == start) {
+                CondLog((entry->endAddress_ != end), "Incorrect VA range");
+                // Unlink the node first and free the entry afterwards, so
+                // the list never compares against an already deleted
+                // pointer and no second scan of the list is needed
+                vaCacheList_->erase(it);
+                delete entry;
+                break;
+            }
+        }
+    }
+}
+
+// Looks up the GPU memory object whose cached host range [start, end)
+// contains ptr. On a hit *offset receives the distance of ptr from the
+// range start; returns nullptr (and leaves *offset untouched) on a miss.
+Memory*
+Device::findMemoryFromVA(const void* ptr, size_t* offset) const
+{
+    // VA cache access must be serialised
+    amd::ScopedLock lk(*vaCacheAccess_);
+    for (const auto& entry : *vaCacheList_) {
+        if ((entry->startAddress_ <= ptr) && (entry->endAddress_ > ptr)) {
+            *offset = static_cast(reinterpret_cast(ptr) -
+                reinterpret_cast(entry->startAddress_));
+            return entry->memory_;
+        }
+    }
+    return nullptr;
+}
+
+// Takes a cached map target that can hold `size` bytes out of the map
+// cache. An exact size match is preferred, otherwise the smallest entry
+// that is larger than the request. If no entry fits, the biggest cached
+// entry is released and nullptr is returned.
+amd::Memory*
+Device::findMapTarget(size_t size) const
+{
+    // Must be serialised for access
+    amd::ScopedLock lk(*mapCacheOps_);
+
+    amd::Memory* map = nullptr;
+    size_t minSize = 0;
+    size_t maxSize = 0;
+    // Both indices start at size(), which means "nothing found"
+    uint mapId = mapCache_->size();
+    uint releaseId = mapCache_->size();
+
+    // Find if the list has a map target of appropriate size
+    for (uint i = 0; i < mapCache_->size(); i++) {
+        if ((*mapCache_)[i] != nullptr) {
+            // Requested size is smaller than the entry size
+            if (size < (*mapCache_)[i]->getSize()) {
+                if ((minSize == 0) ||
+                    (minSize > (*mapCache_)[i]->getSize())) {
+                    minSize = (*mapCache_)[i]->getSize();
+                    mapId = i;
+                }
+            }
+            // Requested size matches the entry size
+            else if (size == (*mapCache_)[i]->getSize()) {
+                mapId = i;
+                break;
+            }
+            else {
+                // Find the biggest map target in the list
+                if (maxSize < (*mapCache_)[i]->getSize()) {
+                    maxSize = (*mapCache_)[i]->getSize();
+                    releaseId = i;
+                }
+            }
+        }
+    }
+
+    // Check if we found any map target
+    if (mapId < mapCache_->size()) {
+        // Detach the entry from the cache before handing it out
+        map = (*mapCache_)[mapId];
+        (*mapCache_)[mapId] = nullptr;
+        Memory* gpuMemory = reinterpret_cast
+            (map->getDeviceMemory(*this));
+
+        // Get the base pointer for the map resource
+        if ((gpuMemory == nullptr) || (nullptr == gpuMemory->map(nullptr))) {
+            
map->release();
+            map = nullptr;
+        }
+    }
+    // If cache is full, then release the biggest map target
+    else if (releaseId < mapCache_->size()) {
+        (*mapCache_)[releaseId]->release();
+        (*mapCache_)[releaseId] = nullptr;
+    }
+
+    return map;
+}
+
+// Puts a map target back into the cache: the first empty slot is reused,
+// otherwise the cache grows by one entry. Returns false (and caches
+// nothing) for memory that reports it cannot be cached.
+bool
+Device::addMapTarget(amd::Memory* memory) const
+{
+    // Must be serialised for access
+    amd::ScopedLock lk(*mapCacheOps_);
+
+    //the svm memory shouldn't be cached
+    if (!memory->canBeCached()) {
+        return false;
+    }
+    // Find if the list has a map target of appropriate size
+    for (uint i = 0; i < mapCache_->size(); ++i) {
+        if ((*mapCache_)[i] == nullptr) {
+            (*mapCache_)[i] = memory;
+            return true;
+        }
+    }
+
+    // Add a new entry
+    mapCache_->push_back(memory);
+
+    return true;
+}
+
+// Releases the scratch view owned by this entry
+Device::ScratchBuffer::~ScratchBuffer()
+{
+    destroyMemory();
+}
+
+// Deletes the scratch view and clears the pointer, so the call is safe to
+// repeat
+void
+Device::ScratchBuffer::destroyMemory()
+{
+    // Release memory object
+    delete memObj_;
+    memObj_ = nullptr;
+}
+
+// Grows the scratch memory when the ring used by `vgpu` needs more scratch
+// registers than it currently has. All views are destroyed, every active
+// ring's size and offset are recomputed, and one global buffer with a view
+// per ring is allocated again. Returns false if an allocation fails.
+bool
+Device::allocScratch(uint regNum, const VirtualGPU* vgpu)
+{
+    if (regNum > 0) {
+        // Serialize the scratch buffer allocation code
+        amd::ScopedLock lk(*scratchAlloc_);
+        uint sb = vgpu->hwRing();
+
+        // Check if the current buffer isn't big enough
+        if (regNum > scratch_[sb]->regNum_) {
+            // Stall all command queues, since runtime will reallocate memory
+            ScopedLockVgpus lock(*this);
+
+            scratch_[sb]->regNum_ = regNum;
+            size_t size = 0;
+            uint offset = 0;
+
+            // Destroy all views
+            for (uint s = 0; s < scratch_.size(); ++s) {
+                ScratchBuffer* scratchBuf = scratch_[s];
+                if (scratchBuf->regNum_ > 0) {
+                    scratchBuf->destroyMemory();
+                    // Calculate the size of the scratch buffer for a queue
+                    uint32_t numTotalCUs = info().maxComputeUnits_;
+                    uint32_t numMaxWaves =
+                        properties().gfxipProperties.engineCore.maxScratchWavesPerCU * numTotalCUs;
+                    scratchBuf->size_ = properties().gfxipProperties.engineCore.wavefrontSize *
+                        scratchBuf->regNum_ * numMaxWaves * sizeof(uint32_t);
+                    // NOTE(review): amd::alignUp is not visible here and
+                    // 0xFFFF is not a power of two - confirm whether
+                    // 0x10000 (64 KiB) was intended.
+                    scratchBuf->size_ = amd::alignUp(scratchBuf->size_, 0xFFFF);
+                    scratchBuf->offset_ = offset;
+                    size += scratchBuf->size_;
+                    offset += scratchBuf->size_;
+                }
+            }
+
+            delete globalScratchBuf_;
+
+            // Allocate new buffer.
+            globalScratchBuf_ = new pal::Memory(*this, size);
+            if ((globalScratchBuf_ == nullptr) ||
+                !globalScratchBuf_->create(Resource::Scratch)) {
+                LogError("Couldn't allocate scratch memory");
+                // No scratch is available any more: reset every ring
+                for (uint s = 0; s < scratch_.size(); ++s) {
+                    scratch_[s]->regNum_ = 0;
+                }
+                return false;
+            }
+
+            for (uint s = 0; s < scratch_.size(); ++s) {
+                // Loop through all memory objects and reallocate them
+                if (scratch_[s]->regNum_ > 0) {
+                    // Allocate new buffer
+                    scratch_[s]->memObj_ = new pal::Memory(*this, scratch_[s]->size_);
+                    // Each ring gets a view into the global scratch buffer
+                    Resource::ViewParams view;
+                    view.resource_ = globalScratchBuf_;
+                    view.offset_ = scratch_[s]->offset_;
+                    view.size_ = scratch_[s]->size_;
+                    if ((scratch_[s]->memObj_ == nullptr) ||
+                        !scratch_[s]->memObj_->create(Resource::View, &view)) {
+                        LogError("Couldn't allocate a scratch view");
+                        // destroyMemory() deletes the view and clears the
+                        // pointer; a bare delete left memObj_ dangling and
+                        // the next destroyMemory()/destructor freed it again
+                        scratch_[s]->destroyMemory();
+                        scratch_[s]->regNum_ = 0;
+                        return false;
+                    }
+                }
+            }
+        }
+    }
+    return true;
+}
+
+// Makes sure enough scratch memory exists for the kernel on the given
+// virtual device. For HSAIL kernels that use dynamic parallelism the
+// default device queue is sized as well, using the program's maximum
+// scratch register count; the call fails if no such queue exists.
+bool
+Device::validateKernel(const amd::Kernel& kernel, const device::VirtualDevice* vdev)
+{
+    // Find the number of scratch registers used in the kernel
+    const device::Kernel* devKernel = kernel.getDeviceKernel(*this);
+    uint regNum = static_cast(devKernel->workGroupInfo()->scratchRegs_);
+    const VirtualGPU* vgpu = static_cast(vdev);
+
+    if (!allocScratch(regNum, vgpu)) {
+        return false;
+    }
+
+    if (devKernel->hsa()) {
+        const HSAILKernel* hsaKernel = static_cast(devKernel);
+        if (hsaKernel->dynamicParallelism()) {
+            amd::DeviceQueue* defQueue =
+                kernel.program().context().defDeviceQueue(*this);
+            if (defQueue != nullptr) {
+                vgpu = static_cast(defQueue->vDev());
+                if (!allocScratch(hsaKernel->prog().maxScratchRegs(), vgpu)) {
+                    return false;
+                }
+            }
+            else {
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+// Destroys every scratch view and the global scratch buffer; does nothing
+// when no global buffer exists
+void
+Device::destroyScratchBuffers()
+{
+    if (globalScratchBuf_ != nullptr) {
+        for 
(uint s = 0; s < scratch_.size(); ++s) {
+            scratch_[s]->destroyMemory();
+            scratch_[s]->regNum_ = 0;
+        }
+        delete globalScratchBuf_;
+        globalScratchBuf_ = nullptr;
+    }
+}
+
+// Translates an OpenCL sampler state word into a Pal::SamplerInfo. The
+// normalized-coordinates bit and the addressing bits are consumed in turn,
+// so that what remains in `state` afterwards is compared against the
+// filter value.
+void
+Device::fillHwSampler(
+    uint32_t state, void* hwState, uint32_t hwStateSize,
+    uint32_t mipFilter, float minLod, float maxLod) const
+{
+    Pal::SamplerInfo samplerInfo = {};
+
+    samplerInfo.borderColorType = Pal::BorderColorType::TransparentBlack;
+
+    // Assign defaults
+    samplerInfo.filter = Pal::TexFilter::MagPointMinPointMipBase;
+
+    samplerInfo.flags.unnormalizedCoords = !(state & amd::Sampler::StateNormalizedCoordsMask);
+
+    // Strip the bit that was just consumed
+    state &= ~amd::Sampler::StateNormalizedCoordsMask;
+
+    // Program the sampler address mode
+    switch (state & amd::Sampler::StateAddressMask) {
+        case amd::Sampler::StateAddressRepeat:
+            samplerInfo.addressU = Pal::TexAddressMode::Wrap;
+            samplerInfo.addressV = Pal::TexAddressMode::Wrap;
+            samplerInfo.addressW = Pal::TexAddressMode::Wrap;
+            break;
+        case amd::Sampler::StateAddressClampToEdge:
+            samplerInfo.addressU = Pal::TexAddressMode::Clamp;
+            samplerInfo.addressV = Pal::TexAddressMode::Clamp;
+            samplerInfo.addressW = Pal::TexAddressMode::Clamp;
+            break;
+        case amd::Sampler::StateAddressMirroredRepeat:
+            samplerInfo.addressU = Pal::TexAddressMode::Mirror;
+            samplerInfo.addressV = Pal::TexAddressMode::Mirror;
+            samplerInfo.addressW = Pal::TexAddressMode::Mirror;
+            break;
+        case amd::Sampler::StateAddressClamp:
+        case amd::Sampler::StateAddressNone:
+            samplerInfo.addressU = Pal::TexAddressMode::ClampBorder;
+            samplerInfo.addressV = Pal::TexAddressMode::ClampBorder;
+            samplerInfo.addressW = Pal::TexAddressMode::ClampBorder;
+            // No break here: control falls into default, which only breaks
+        default:
+            break;
+    }
+    state &= ~amd::Sampler::StateAddressMask;
+
+    // Program texture filter mode
+    if (state == amd::Sampler::StateFilterLinear) {
+        samplerInfo.filter = Pal::TexFilter::MagLinearMinLinearMipBase;
+    }
+
+    // A mip filter, when given, overrides the mip part of the filter above
+    if (mipFilter == CL_FILTER_NEAREST) {
+        if (state == amd::Sampler::StateFilterLinear) {
+            samplerInfo.filter = 
Pal::TexFilter::MagLinearMinLinearMipPoint;
+        }
+        else {
+            samplerInfo.filter = Pal::TexFilter::MagPointMinPointMipPoint;
+        }
+    }
+    else if (mipFilter == CL_FILTER_LINEAR) {
+        if (state == amd::Sampler::StateFilterLinear) {
+            samplerInfo.filter = Pal::TexFilter::MagLinearMinLinearMipLinear;
+        }
+        else {
+            samplerInfo.filter = Pal::TexFilter::MagPointMinPointMipLinear;
+        }
+    }
+
+    // Let PAL write one sampler SRD into the caller's HW state storage
+    iDev()->CreateSamplerSrds(1, &samplerInfo, hwState);
+}
+
+// Reserves (without committing) a host address range of the given size
+// and alignment; the `atomics` argument is not used here.
+void*
+Device::hostAlloc(size_t size, size_t alignment, bool atomics) const
+{
+    //for discrete gpu, we only reserve,no commit yet.
+    return amd::Os::reserveMemory(nullptr, size, alignment, amd::Os::MEM_PROT_NONE);
+}
+
+// Releases a host range obtained from hostAlloc()
+void
+Device::hostFree(void* ptr, size_t size) const
+{
+    //If we allocate the host memory, we need free, or we have to release
+    amd::Os::releaseMemory(ptr, size);
+}
+
+// Allocates SVM memory, or (when svmPtr is not null) looks up the existing
+// SVM buffer for that pointer. Returns the SVM pointer or nullptr on
+// failure.
+void*
+Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const
+{
+    alignment = std::max(alignment, static_cast(info_.memBaseAddrAlign_));
+
+    //VAM for GPU needs 64K alignment for Tahiti and CI+, will pull info from gsl later
+    size_t vmBigK = 64 * Ki;
+    alignment = (alignment < vmBigK) ? vmBigK : alignment;
+
+    size = amd::alignUp(size, alignment);
+    amd::Memory* mem = nullptr;
+    // freeCPUMem_ records whether this call handed out plain aligned host
+    // memory; svmFree() reads it to pick the matching release path
+    freeCPUMem_ = false;
+    if (nullptr == svmPtr) {
+        if (isFineGrainedSystem()) {
+            freeCPUMem_ = true;
+            return amd::Os::alignedMalloc(size, alignment);
+        }
+
+        //create a hidden buffer, which will allocated on the device later
+        mem = new (context)amd::Buffer(context, flags, size, reinterpret_cast(1));
+        if (mem == nullptr) {
+            LogError("failed to create a svm mem object!");
+            return nullptr;
+        }
+
+        if (!mem->create(nullptr, false)) {
+            LogError("failed to create a svm hidden buffer!");
+            mem->release();
+            return nullptr;
+        }
+        //if the device supports SVM FGS, return the committed CPU address directly.
+        // The result of this lookup is not used further in this function
+        pal::Memory* gpuMem = getGpuMemory(mem);
+
+        //add the information to context so that we can use it later.
+        amd::SvmManager::AddSvmBuffer(mem->getSvmPtr(), mem);
+        svmPtr = mem->getSvmPtr();
+    }
+    else {
+        //find the existing amd::mem object
+        mem = amd::SvmManager::FindSvmBuffer(svmPtr);
+        if (nullptr == mem) {
+            return nullptr;
+        }
+        //commit the CPU memory for FGS device.
+        if (isFineGrainedSystem()) {
+            mem->commitSvmMemory();
+        }
+        else {
+            // The result of this lookup is not used further
+            pal::Memory* gpuMem = getGpuMemory(mem);
+        }
+        svmPtr = mem->getSvmPtr();
+    }
+    return svmPtr;
+}
+
+// Frees SVM memory. Plain host memory is released with alignedFree() when
+// freeCPUMem_ is set; otherwise the SVM buffer registered for ptr is
+// released and unregistered.
+// NOTE(review): freeCPUMem_ is a per-device flag written by every
+// svmAlloc() call, so it reflects the most recent allocation rather than
+// the pointer being freed - confirm this is sufficient.
+void
+Device::svmFree(void *ptr) const
+{
+    if (freeCPUMem_) {
+        amd::Os::alignedFree(ptr);
+    }
+    else {
+        amd::Memory * svmMem = nullptr;
+        svmMem = amd::SvmManager::FindSvmBuffer(ptr);
+        if (nullptr != svmMem) {
+            svmMem->release();
+            amd::SvmManager::RemoveSvmBuffer(ptr);
+        }
+    }
+}
+
+
+// Unmaps and destroys every SRD chunk in the pool
+Device::SrdManager::~SrdManager()
+{
+    for (uint i = 0; i < pool_.size(); ++i) {
+        pool_[i].buf_->unmap(nullptr);
+        delete pool_[i].buf_;
+        // flags_ is allocated with new uint[] in allocSrdSlot(), so it
+        // has to be released with the array form of delete
+        delete [] pool_[i].flags_;
+    }
+}
+
+// Creates the HW sampler from a raw OpenCL sampler state word. Returns
+// false if no SRD slot could be allocated.
+bool
+Sampler::create(uint32_t oclSamplerState)
+{
+    hwSrd_ = dev_.srds().allocSrdSlot(&hwState_);
+    if (0 == hwSrd_) {
+        return false;
+    }
+    dev_.fillHwSampler(oclSamplerState, hwState_, HsaSamplerObjectSize);
+    return true;
+}
+
+// Creates the HW sampler from an amd::Sampler, forwarding its state, mip
+// filter and LOD range. Returns false if no SRD slot could be allocated.
+bool
+Sampler::create(const amd::Sampler& owner)
+{
+    hwSrd_ = dev_.srds().allocSrdSlot(&hwState_);
+    if (0 == hwSrd_) {
+        return false;
+    }
+    dev_.fillHwSampler(owner.state(), hwState_, HsaSamplerObjectSize,
+        owner.mipFilter(), owner.minLod(), owner.maxLod());
+    return true;
+}
+
+// Returns the sampler's SRD slot to the device SRD manager
+Sampler::~Sampler()
+{
+    dev_.srds().freeSrdSlot(hwSrd_);
+}
+
+// Hands out a free SRD slot: *cpuAddr receives the CPU address of the slot
+// and the return value is its GPU virtual address, or 0 on failure. A set
+// bit in a chunk's flags_ word marks a free slot.
+uint64_t
+Device::SrdManager::allocSrdSlot(address* cpuAddr)
+{
+    amd::ScopedLock lock(ml_);
+    // Check all buffers in the pool of chunks
+    for (uint i = 0; i < pool_.size(); ++i) {
+        const Chunk& ch = pool_[i];
+        // Search for an empty slot
+        for (uint s = 0; s < numFlags_; ++s) {
+            uint mask = ch.flags_[s];
+            // Check if there is an empty slot in this group
+            if (mask != 0) {
+                uint idx;
+                // Find the first empty index
+                for (idx = 0; (mask & 0x1) == 0; mask >>= 1, ++idx);
+                // Mark the slot as busy
+                
ch.flags_[s] &= ~(1u << idx);
+                // Calculate SRD offset in the buffer
+                uint offset = (s * MaskBits + idx) * srdSize_;
+                *cpuAddr = ch.buf_->data() + offset;
+                return ch.buf_->vmAddress() + offset;
+            }
+        }
+    }
+    // At this point the manager doesn't have empty slots
+    // and has to allocate a new chunk
+    Chunk chunk;
+    chunk.flags_ = new uint[numFlags_];
+    if (chunk.flags_ == nullptr) {
+        return 0;
+    }
+    chunk.buf_ = new Memory(dev_, bufSize_);
+    if (chunk.buf_ == nullptr || !chunk.buf_->create(Resource::Remote) ||
+        (nullptr == chunk.buf_->map(nullptr))) {
+        delete [] chunk.flags_;
+        delete chunk.buf_;
+        return 0;
+    }
+    // All slots in the chunk are in "free" state
+    memset(chunk.flags_, 0xff, numFlags_ * sizeof(uint));
+    // Take the first one...
+    chunk.flags_[0] &= ~0x1;
+    pool_.push_back(chunk);
+    *cpuAddr = chunk.buf_->data();
+    return chunk.buf_->vmAddress();
+}
+
+// Marks the SRD slot at GPU address `addr` as free again. A zero address
+// is ignored; an address outside every chunk trips the assert.
+void
+Device::SrdManager::freeSrdSlot(uint64_t addr) {
+    amd::ScopedLock lock(ml_);
+    if (addr == 0) return;
+    // Check all buffers in the pool of chunks
+    for (uint i = 0; i < pool_.size(); ++i) {
+        Chunk* ch = &pool_[i];
+        // Find the offset
+        int64_t offs = static_cast(addr) -
+            static_cast(ch->buf_->vmAddress());
+        // Check if the offset inside the chunk buffer
+        if ((offs >= 0) && (offs < bufSize_)) {
+            // Find the index in the chunk
+            uint idx = offs / srdSize_;
+            uint s = idx / MaskBits;
+            // Free the slot. The shift is done on an unsigned one because
+            // the bit index can reach MaskBits - 1, i.e. the sign bit of
+            // a signed int
+            ch->flags_[s] |= 1u << (idx % MaskBits);
+            return;
+        }
+    }
+    assert(false && "Wrong slot address!");
+}
+
+// Adjusts the free-memory counter of a heap: adds `size` when memory was
+// freed, subtracts it when memory was allocated
+void
+Device::updateFreeMemory(Pal::GpuHeap heap, Pal::gpusize size, bool free)
+{
+    if (free) {
+        freeMem[heap] += size;
+    }
+    else {
+        freeMem[heap] -= size;
+    }
+}
+
+// Appends the buffer of every SRD chunk to memList
+void
+Device::SrdManager::fillResourceList(std::vector& memList)
+{
+    for (uint i = 0; i < pool_.size(); ++i) {
+        memList.push_back(pool_[i].buf_);
+    }
+}
+
+// Registers the debugger with the HW debug manager and returns the status
+// of that registration
+cl_int
+Device::hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage)
+{
+    cl_int status = hwDebugMgr_->registerDebugger(context, messageStorage);
+
+    // Registration failed: destroy the debug manager and clear the pointer
+    if (CL_SUCCESS != status) {
+        delete hwDebugMgr_;
+        hwDebugMgr_ = nullptr;
+    }
+
+    return status;
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.hpp b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
new file mode 100644
index 0000000000..8fe3347d46
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/paldevice.hpp
@@ -0,0 +1,598 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef PALDEVICE_HPP_
+#define PALDEVICE_HPP_
+
+#include "top.hpp"
+#include "device/device.hpp"
+#include "platform/command.hpp"
+#include "platform/program.hpp"
+#include "platform/perfctr.hpp"
+#include "platform/threadtrace.hpp"
+#include "platform/memory.hpp"
+#include "utils/concurrent.hpp"
+#include "thread/thread.hpp"
+#include "thread/monitor.hpp"
+#include "device/pal/palvirtual.hpp"
+#include "device/pal/palmemory.hpp"
+#include "device/pal/paldefs.hpp"
+#include "device/pal/palsettings.hpp"
+#include "device/pal/palappprofile.hpp"
+#include "acl.h"
+#include "memory"
+
+
+/*! \addtogroup PAL
+ *  @{
+ */
+
+//! PAL Device Implementation
+namespace pal {
+
+//! A nil device object (offline device without real hardware behind it)
+class NullDevice : public amd::Device
+{
+protected:
+    static aclCompiler* compiler_;
+public:
+    aclCompiler* compiler() const { return compiler_; }
+
+public:
+    static bool init(void);
+
+    //! Construct a new identifier
+    NullDevice();
+
+    //! Creates an offline device with the specified target
+    bool create(
+        Pal::GfxIpLevel ipLevel     //!< GPU ip level
+        );
+
+    virtual cl_int createSubDevices(
+        device::CreateSubDevicesInfo& create_info,
+        cl_uint num_entries,
+        cl_device_id* devices,
+        cl_uint* num_devices) {
+            return CL_INVALID_VALUE;
+    }
+
+    //! Instantiate a new virtual device (always NULL for the nil device)
+    virtual device::VirtualDevice* createVirtualDevice(
+        amd::CommandQueue* queue = NULL
+        ) { return NULL; }
+
+    //! Compile the given source code.
+    virtual device::Program* createProgram(amd::option::Options* options = NULL);
+
+    //! 
Just returns NULL for the dummy device
+    virtual device::Memory* createMemory(amd::Memory& owner) const { return NULL; }
+
+    //! Sampler object allocation (must not be reached on the dummy device)
+    virtual bool createSampler(
+        const amd::Sampler& owner,  //!< abstraction layer sampler object
+        device::Sampler**   sampler //!< device sampler object
+        ) const
+    {
+        ShouldNotReachHere();
+        return true;
+    }
+
+    //! Just returns NULL for the dummy device
+    virtual device::Memory* createView(
+        amd::Memory& owner,             //!< Owner memory object
+        const device::Memory& parent    //!< Parent device memory object for the view
+        ) const { return NULL; }
+
+    //! Nothing to reallocate on the dummy device; always reports success
+    virtual bool reallocMemory(amd::Memory& owner) const { return true; }
+
+    //! Binding to an external graphics API device/context is a no-op on
+    //! the dummy device; bind and unbind always report success
+
+    virtual bool bindExternalDevice(
+        intptr_t type, void* pDevice, void* pContext, bool validateOnly) { return true; }
+
+    virtual bool unbindExternalDevice(
+        intptr_t type, void* pDevice, void* pContext, bool validateOnly) { return true; }
+
+    //! Releases non-blocking map target memory (no-op on the dummy device)
+    virtual void freeMapTarget(amd::Memory& mem, void* target) {}
+
+    Pal::GfxIpLevel ipLevel() const { return ipLevel_; }
+
+    const AMDDeviceInfo* hwInfo() const { return hwInfo_; }
+
+    //! Empty implementation on Null device (always reports failure)
+    virtual bool globalFreeMemory(size_t* freeMemory) const { return false; }
+
+    //! Get GPU device settings
+    const pal::Settings& settings() const
+        { return reinterpret_cast(*settings_); }
+    virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const { return NULL; }
+    virtual void svmFree(void* ptr) const {return;}
+
+protected:
+    Pal::GfxIpLevel         ipLevel_;   //!< Device IP level
+    const AMDDeviceInfo*    hwInfo_;    //!< Device HW info structure
+
+    //! 
Fills OpenCL device info structure
+    void fillDeviceInfo(
+        const Pal::DeviceProperties& palProp,//!< PAL device properties
+        const Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount], //!< PAL heap properties, one per GPU heap
+        size_t  maxTextureSize,         //!< Maximum texture size supported in HW
+        uint    numComputeRings         //!< Number of compute rings
+        );
+};
+
+//! Forward declarations
+class Command;
+class Device;
+class GpuCommand;
+class Heap;
+class HeapBlock;
+class Program;
+class Kernel;
+class Memory;
+class Resource;
+class VirtualDevice;
+class PrintfDbg;
+class ThreadTrace;
+
+#ifndef CL_FILTER_NONE
+#define CL_FILTER_NONE 0x1142
+#endif
+
+class Sampler : public device::Sampler
+{
+public:
+    //! Constructor
+    Sampler(const Device& dev): dev_(dev) {}
+
+    //! Destructor; hands the sampler's SRD slot back to the device
+    virtual ~Sampler();
+
+    //! Creates a device sampler from the OCL sampler state
+    bool create(
+        uint32_t oclSamplerState    //!< OCL sampler state
+        );
+
+    //! Creates a device sampler from the AMD sampler object
+    bool create(
+        const amd::Sampler& owner   //!< AMD sampler object
+        );
+
+    const void* hwState() const { return hwState_; }
+
+private:
+    //! Disable operator=
+    Sampler& operator=(const Sampler&);
+
+    //! Disable default copy constructor
+    Sampler(const Sampler&);
+
+    const Device&   dev_;       //!< Device object associated with the sampler
+    address         hwState_;   //!< GPU HW state (\todo legacy path)
+};
+
+//! A GPU device ordinal (physical GPU device)
+class Device : public NullDevice
+{
+public:
+    //! Locks any access to the virtual GPUs
+    class ScopedLockVgpus : public amd::StackObject {
+    public:
+        //! Constructor
+        ScopedLockVgpus(const Device& dev);
+
+        //! Destructor
+        ~ScopedLockVgpus();
+
+    private:
+        const Device&   dev_;       //!< Device object
+    };
+
+    //! Transfer buffers
+    class XferBuffers : public amd::HeapObject
+    {
+    public:
+        static const size_t MaxXferBufListSize = 8;
+
+        //! 
Default constructor
+        XferBuffers(const Device& device, Resource::MemoryType type, size_t bufSize)
+            : type_(type)
+            , bufSize_(bufSize)
+            , acquiredCnt_(0)
+            , gpuDevice_(device)
+            {}
+
+        //! Destructor
+        ~XferBuffers();
+
+        //! Creates the xfer buffers object
+        bool create();
+
+        //! Acquires an instance of the transfer buffers
+        Memory& acquire();
+
+        //! Releases transfer buffer
+        void release(
+            VirtualGPU& gpu,    //!< Virtual GPU object used with the buffer
+            Memory& buffer      //!< Transfer buffer for release
+            );
+
+        //! Returns the buffer's size for transfer
+        size_t  bufSize() const { return bufSize_; }
+
+    private:
+        //! Disable copy constructor
+        XferBuffers(const XferBuffers&);
+
+        //! Disable assignment operator
+        XferBuffers& operator=(const XferBuffers&);
+
+        //! Get device object
+        const Device& dev() const { return gpuDevice_; }
+
+        Resource::MemoryType    type_;          //!< The buffer's type
+        size_t                  bufSize_;       //!< Staged buffer size
+        std::list      freeBuffers_;   //!< The list of free buffers
+        amd::Atomic       acquiredCnt_;   //!< The total number of acquired buffers
+        amd::Monitor            lock_;          //!< Staged buffer acquire/release lock
+        const Device&           gpuDevice_;     //!< GPU device object
+    };
+
+    //! Virtual address cache entry
+    struct VACacheEntry : public amd::HeapObject
+    {
+        void*   startAddress_;  //!< Start virtual address
+        void*   endAddress_;    //!< End virtual address (first address past the range)
+        Memory* memory_;        //!< GPU memory, associated with the range
+
+        //! Constructor
+        VACacheEntry(
+            void*   startAddress,   //!< Start virtual address
+            void*   endAddress,     //!< End virtual address
+            Memory* memory          //!< GPU memory object
+            ): startAddress_(startAddress), endAddress_(endAddress), memory_(memory) {}
+
+    private:
+        //! 
Disable default constructor
+        VACacheEntry();
+    };
+
+    struct ScratchBuffer : public amd::HeapObject
+    {
+        uint    regNum_;    //!< The number of used scratch registers
+        Memory* memObj_;    //!< Memory objects for scratch buffers
+        uint    offset_;    //!< Offset from the global scratch store
+        uint    size_;      //!< Scratch buffer size on this queue
+
+        //! Default constructor (all fields start at zero, size_ included)
+        ScratchBuffer(): regNum_(0), memObj_(NULL), offset_(0), size_(0) {}
+
+        //! Destructor
+        ~ScratchBuffer();
+
+        //! Destroys memory objects
+        void destroyMemory();
+    };
+
+
+    class SrdManager : public amd::HeapObject {
+    public:
+        SrdManager(const Device& dev, uint srdSize, uint bufSize)
+            : dev_(dev)
+            , numFlags_(bufSize / (srdSize * MaskBits))
+            , srdSize_(srdSize)
+            , bufSize_(bufSize) {}
+        ~SrdManager();
+
+        //! Allocates a new SRD slot for a resource
+        uint64_t allocSrdSlot(address* cpuAddr);
+
+        //! Frees a SRD slot
+        void freeSrdSlot(uint64_t addr);
+
+        //! Fills the memory list for VidMM KMD
+        void fillResourceList(std::vector& memList);
+
+    private:
+        //! Disable copy constructor
+        SrdManager(const SrdManager&);
+
+        //! Disable assignment operator
+        SrdManager& operator=(const SrdManager&);
+
+        struct Chunk {
+            Memory* buf_;       //!< Buffer that holds the chunk's SRDs
+            uint*   flags_;     //!< Slot bitmask array; a set bit is a free slot
+            Chunk(): buf_(NULL), flags_(NULL) {}
+        };
+
+        static const uint MaskBits = 32;    //!< Slots tracked by one flags_ word
+        const Device&   dev_;       //!< GPU device for the chunk manager
+        amd::Monitor    ml_;        //!< Global lock for the SRD manager
+        std::vector  pool_;  //!< Pool of SRD buffers
+        uint            numFlags_;  //!< Total number of flags in array
+        uint            srdSize_;   //!< SRD size
+        uint            bufSize_;   //!< Buffer size that holds SRDs
+    };
+
+    //! Initialise the whole GPU device subsystem
+    static bool init();
+
+    //! Shutdown the whole GPU device subsystem
+    static void tearDown();
+
+    //! Construct a new physical GPU device
+    Device();
+
+    //! Initialise a device (i.e. all parts of the constructor that could
+    //! potentially fail)
+    bool create(
+        Pal::IDevice*   device  //!< PAL device interface object
+        );
+
+    //! 
Destructor for the physical GPU device
+    virtual ~Device();
+
+    //! Instantiate a new virtual device
+    device::VirtualDevice* createVirtualDevice(
+        amd::CommandQueue* queue = NULL
+        );
+
+    //! Allocates the device memory object (buffer or image) for the owner
+    virtual device::Memory* createMemory(
+        amd::Memory&    owner   //!< abstraction layer memory object
+        ) const;
+
+    //! Sampler object allocation
+    virtual bool createSampler(
+        const amd::Sampler& owner,  //!< abstraction layer sampler object
+        device::Sampler**   sampler //!< device sampler object
+        ) const;
+
+    //! Reallocates the provided buffer object
+    virtual bool reallocMemory(
+        amd::Memory&    owner   //!< Buffer for reallocation
+        ) const;
+
+    //! Allocates a view object from the device memory
+    virtual device::Memory* createView(
+        amd::Memory& owner,             //!< Owner memory object
+        const device::Memory& parent    //!< Parent device memory object for the view
+        ) const;
+
+    //! Create the device program.
+    virtual device::Program* createProgram(amd::option::Options* options = NULL);
+
+    //! Attempt to bind with external graphics API's device/context
+    virtual bool bindExternalDevice(
+        intptr_t type,          //!< Context property that selects the graphics API
+        void* pDevice,          //!< External API device
+        void* pContext,         //!< External API context
+        bool validateOnly);     //!< Validation pass (affects failure logging)
+
+    //! Attempt to unbind with external graphics API's device/context
+    virtual bool unbindExternalDevice(
+        intptr_t type,          //!< Context property that selects the graphics API
+        void* pDevice,          //!< External API device
+        void* pContext,         //!< External API context
+        bool validateOnly);     //!< Validation pass (affects failure logging)
+
+    //! Validates kernel before execution
+    virtual bool validateKernel(
+        const amd::Kernel& kernel,      //!< AMD kernel object
+        const device::VirtualDevice* vdev   //!< Virtual device used for the scratch allocation
+        );
+
+    //! Retrieves information about free memory on a GPU device
+    virtual bool globalFreeMemory(size_t* freeMemory) const;
+
+    //! Returns a GPU memory object from AMD memory object
+    pal::Memory* getGpuMemory(
+        amd::Memory* mem    //!< Pointer to AMD memory object
+        ) const;
+
+    amd::Monitor& lockAsyncOps() const { return *lockAsyncOps_; }
+
+    //! Returns the lock object for the virtual gpus list
+    amd::Monitor* vgpusAccess() const { return vgpusAccess_; }
+
+    //! 
Returns the monitor object for PAL
+    amd::Monitor& lockPAL() const { return *lockPAL_; }
+
+    //! Returns the number of virtual GPUs allocated on this device
+    uint numOfVgpus() const { return numOfVgpus_; }
+    uint numOfVgpus_;   //!< The number of virtual GPUs (lock protected)
+
+    typedef std::vector VirtualGPUs;
+
+    //! Returns (by value) the list of all virtual GPUs running on this device
+    const VirtualGPUs vgpus() const { return vgpus_; }
+    VirtualGPUs vgpus_; //!< The list of all running virtual gpus (lock protected)
+
+    //! Scratch buffer allocation
+    pal::Memory* createScratchBuffer(
+        size_t size         //!< Size of buffer
+        ) const;
+
+    //! Returns the write transfer buffers object
+    XferBuffers& xferWrite() const { return *xferWrite_; }
+
+    //! Returns the read transfer buffers object
+    XferBuffers& xferRead() const { return *xferRead_; }
+
+    //! Adds GPU memory to the VA cache list
+    void addVACache(Memory* memory) const;
+
+    //! Removes GPU memory from the VA cache list
+    void removeVACache(const Memory* memory) const;
+
+    //! Finds GPU memory from virtual address
+    Memory* findMemoryFromVA(const void* ptr, size_t* offset) const;
+
+    //! Finds an appropriate map target
+    amd::Memory* findMapTarget(size_t size) const;
+
+    //! Adds a map target to the cache
+    bool addMapTarget(amd::Memory* memory) const;
+
+    //! Returns resource cache object
+    ResourceCache& resourceCache() const { return *resourceCache_; }
+
+    //! Returns the number of available compute engines
+    uint numComputeEngines() const { return numComputeEngines_; }
+
+    //! Returns the number of available DMA engines
+    uint numDMAEngines() const { return numDmaEngines_; }
+
+    //! Returns the blit manager object
+    const device::BlitManager& xferMgr() const;
+
+    VirtualGPU* xferQueue() const { return xferQueue_; }
+
+    //! Retrieves the internal format from the OCL format
+    Pal::Format getPalFormat(
+        const amd::Image::Format& format,   //! 
OCL image format
+        Pal::ChannelMapping* channel
+        ) const;
+
+    const ScratchBuffer* scratch(uint idx) const { return scratch_[idx]; }
+
+    //! Returns the global scratch buffer
+    Memory* globalScratchBuf() const { return globalScratchBuf_; };
+
+    //! Destroys scratch buffer memory
+    void destroyScratchBuffers();
+
+    //! Initialize heap resources if uninitialized
+    bool initializeHeapResources();
+
+    //! Fills the HW sampler state from the OpenCL sampler state
+    void fillHwSampler(
+        uint32_t    state,                      //!< Sampler's OpenCL state
+        void*       hwState,                    //!< Sampler's HW state
+        uint32_t    hwStateSize,                //!< Size of sampler's HW state
+        uint32_t    mipFilter = CL_FILTER_NONE, //!< Mip filter
+        float       minLod = 0.f,               //!< Min level of detail
+        float       maxLod = CL_MAXFLOAT        //!< Max level of detail
+        ) const;
+
+    //! Host memory allocation (address range is reserved, not committed)
+    virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;
+
+    //! SVM allocation
+    virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
+        cl_svm_mem_flags flags, void* svmPtr) const;
+
+    //! Releases host memory obtained from hostAlloc()
+    void hostFree(void* ptr, size_t size) const;
+
+    //! SVM free
+    virtual void svmFree(void* ptr) const;
+
+    //! Returns SRD manager object
+    SrdManager& srds() const { return *srdManager_; }
+
+    //! Initializes the Hardware Debug Manager
+    cl_int hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage);
+
+    //! Returns PAL device properties
+    const Pal::DeviceProperties& properties() const { return properties_; }
+
+    //! Returns PAL device interface
+    Pal::IDevice* iDev() const { return device_; }
+
+    //! Return private device context for internal allocations
+    amd::Context& context() const { return *context_; }
+
+    //! Update free memory for OCL extension
+    void updateFreeMemory(
+        Pal::GpuHeap    heap,   //!< PAL GPU heap for update
+        Pal::gpusize    size,   //!< Size of allocated/destroyed memory
+        bool            free    //!< TRUE if runtime frees memory
+        );
+
+    //! 
Interop for GL device + bool initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const; + bool glCanInterop(void* GLplatformContext, void* GLdeviceContext) const; + bool resGLAssociate(void* GLContext, uint name, uint type, + void** handle, void** mbResHandle, size_t* offset) const; + bool resGLAcquire(void* GLplatformContext, void* mbResHandle, uint type) const; + bool resGLRelease(void* GLplatformContext, void* mbResHandle, uint type) const; + bool resGLFree(void* GLplatformContext, void* mbResHandle, uint type) const; + +private: + //! Disable copy constructor + Device(const Device&); + + //! Disable assignment + Device& operator=(const Device&); + + //! Sends the stall command to all queues + bool stallQueues(); + + //! Buffer allocation + pal::Memory* createBuffer( + amd::Memory& owner, //!< Abstraction layer memory object + bool directAccess //!< Use direct host memory access + ) const; + + //! Image allocation + pal::Memory* createImage( + amd::Memory& owner, //!< Abstraction layer memory object + bool directAccess //!< Use direct host memory access + ) const; + + //! Allocates/reallocates the scratch buffer, according to the usage + bool allocScratch( + uint regNum, //!< Number of the scratch registers + const VirtualGPU* vgpu //!< Virtual GPU for the allocation + ); + + //! Interop for D3D devices + bool associateD3D11Device( + void* d3d11Device //!< void* is of type ID3D11Device* + ); + bool associateD3D10Device( + void* d3d10Device //!< void* is of type ID3D10Device* + ); + bool associateD3D9Device( + void* d3d9Device //!< void* is of type IDirect3DDevice9* + ); + //! 
Interop for GL device + bool glAssociate(void* GLplatformContext, void* GLdeviceContext) const; + bool glDissociate(void* GLplatformContext, void* GLdeviceContext) const; + + amd::Context* context_; //!< A dummy context for internal allocations + amd::Monitor* lockAsyncOps_; //!< Lock to serialise all async ops on this device + amd::Monitor* lockForInitHeap_; //!< Lock to serialise all async ops on initialization heap operation + amd::Monitor* lockPAL_; //!< Lock to serialise PAL access + amd::Monitor* vgpusAccess_; //!< Lock to serialise virtual gpu list access + amd::Monitor* scratchAlloc_; //!< Lock to serialise scratch allocation + amd::Monitor* mapCacheOps_; //!< Lock to serialise cache for the map resources + XferBuffers* xferRead_; //!< Transfer buffers read + XferBuffers* xferWrite_; //!< Transfer buffers write + amd::Monitor* vaCacheAccess_; //!< Lock to serialize VA caching access + std::list* vaCacheList_; //!< VA cache list + std::vector* mapCache_; //!< Map cache info structure + ResourceCache* resourceCache_; //!< Resource cache + uint numComputeEngines_; //!< The number of available compute engines + uint numDmaEngines_; //!< The number of available DMA engines + bool heapInitComplete_; //!< Keep track of initialization status of heap resources + VirtualGPU* xferQueue_; //!< Transfer queue + std::vector scratch_; //!< Scratch buffers for kernels + Memory* globalScratchBuf_; //!< Global scratch buffer + SrdManager* srdManager_; //!< SRD manager object + static AppProfile appProfile_; //!< application profile + mutable bool freeCPUMem_; //!< flag to mark GPU free SVM CPU mem + Pal::DeviceProperties properties_; //!< PAL device properties + Pal::IDevice* device_; //!< PAL device object + std::atomic freeMem[Pal::GpuHeap::GpuHeapCount]; //!< Free memory counter +}; + +/*@}*/} // namespace pal + +#endif /*PALDEVICE_HPP_*/ diff --git a/projects/clr/rocclr/runtime/device/pal/paldeviced3d10.cpp b/projects/clr/rocclr/runtime/device/pal/paldeviced3d10.cpp 
new file mode 100644 index 0000000000..d03ac6c18c --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/paldeviced3d10.cpp @@ -0,0 +1,143 @@ +#include "paldevice.hpp" + +#if defined(ATI_OS_LINUX) +namespace pal { +bool +Device::associateD3D10Device(void* d3d10Device) +{ + return false; +} +} // pal +#else // !ATI_OS_WIN + +#include + +/************************************************************************************************************** +* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch. +* This means OCL client spec will need to change to include headers directly from the DXX perforce tree. +* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change +* without notification. So it is safe to use a local copy of the relevant DXX extension interface classes. +**************************************************************************************************************/ +#include "DxxOpenCLInteropExt.h" + +namespace pal { + +static bool +queryD3D10DeviceGPUMask(ID3D10Device* pd3d10Device, UINT* pd3d10DeviceGPUMask) +{ + HMODULE hDLL = nullptr; + IAmdDxExt* pExt = nullptr; + IAmdDxExtCLInterop* pCLExt = nullptr; + PFNAmdDxExtCreate AmdDxExtCreate; + HRESULT hr = S_OK; + + // Get a handle to the DXX DLL with extension API support +#if defined _WIN64 + static const CHAR dxxModuleName[13] = "atidxx64.dll"; +#else + static const CHAR dxxModuleName[13] = "atidxx32.dll"; +#endif + + hDLL = GetModuleHandle(dxxModuleName); + + if (hDLL == nullptr) { + hr = E_FAIL; + } + + // Get the exported AmdDxExtCreate() function pointer + if (SUCCEEDED(hr)) { + AmdDxExtCreate = reinterpret_cast( + GetProcAddress(hDLL, "AmdDxExtCreate")); + if (AmdDxExtCreate == nullptr) { + hr = E_FAIL; + } + } + + // Create the extension object + if (SUCCEEDED(hr)) { + hr = AmdDxExtCreate(pd3d10Device, &pExt); + } + + // Get the extension version information + if (SUCCEEDED(hr)) { + AmdDxExtVersion extVersion; + 
hr = pExt->GetVersion(&extVersion);
+
+        if (extVersion.majorVersion == 0)
+        {
+            hr = E_FAIL;
+        }
+    }
+
+    // Get the OpenCL Interop interface
+    if (SUCCEEDED(hr)) {
+        pCLExt = static_cast<IAmdDxExtCLInterop*>(
+            pExt->GetExtInterface(AmdDxExtCLInteropID));
+        if (pCLExt != nullptr) {
+            // Get the GPU mask using the CL Interop extension.
+            pCLExt->QueryInteropGpuMask(pd3d10DeviceGPUMask);
+        }
+        else {
+            hr = E_FAIL;
+        }
+    }
+
+    // Drop the references acquired above
+    if (pCLExt != nullptr) {
+        pCLExt->Release();
+    }
+
+    if (pExt != nullptr) {
+        pExt->Release();
+    }
+
+    return (SUCCEEDED(hr));
+}
+
+//! Checks whether the given D3D10 device can interoperate with this PAL device.
+//! The adapter LUID reported by DXGI must match the PAL device's LUID, and
+//! this device's bit (1 << gpuIndex) must be part of the D3D10 device's GPU mask.
+//! \param d3d10Device  Opaque pointer, interpreted as ID3D10Device*
+//! \return true if interop is possible; any failing COM call yields false
+bool
+Device::associateD3D10Device(void* d3d10Device)
+{
+    ID3D10Device* pd3d10Device = static_cast<ID3D10Device*>(d3d10Device);
+    if (pd3d10Device == nullptr) {
+        return false;
+    }
+
+    // Every COM call below can fail; never dereference an interface that
+    // was not actually returned.
+    IDXGIDevice* pDXGIDevice = nullptr;
+    if (FAILED(pd3d10Device->QueryInterface(__uuidof(IDXGIDevice), (void **)&pDXGIDevice)) ||
+        (pDXGIDevice == nullptr)) {
+        return false;
+    }
+
+    IDXGIAdapter* pDXGIAdapter = nullptr;
+    if (FAILED(pDXGIDevice->GetAdapter(&pDXGIAdapter)) || (pDXGIAdapter == nullptr)) {
+        pDXGIDevice->Release();
+        return false;
+    }
+
+    DXGI_ADAPTER_DESC adapterDesc = {};
+
+    // match the adapter (a failed GetDesc() counts as "no match")
+    bool canInteroperate =
+        SUCCEEDED(pDXGIAdapter->GetDesc(&adapterDesc)) &&
+        (properties().osProperties.luidHighPart == adapterDesc.AdapterLuid.HighPart) &&
+        (properties().osProperties.luidLowPart == adapterDesc.AdapterLuid.LowPart);
+
+    UINT chainBitMask = 1 << properties().gpuIndex;
+
+    // match the chain ID
+    if (canInteroperate) {
+        UINT d3d10DeviceGPUMask = 0;
+
+        if (queryD3D10DeviceGPUMask(pd3d10Device, &d3d10DeviceGPUMask)) {
+            canInteroperate = (chainBitMask & d3d10DeviceGPUMask) != 0;
+        }
+        else {
+            // special handling for Intel iGPU + AMD dGPU in LDA mode
+            // (only occurs on a PX platform) where
+            // the D3D10Device object is created on the Intel iGPU and
+            // passed to AMD dGPU (secondary) to interoperate. 
+ if (chainBitMask > 1) { + canInteroperate = false; + } + } + } + + pDXGIDevice->Release(); + pDXGIAdapter->Release(); + + return canInteroperate; +} + +} // pal + +#endif // !ATI_OS_WIN diff --git a/projects/clr/rocclr/runtime/device/pal/paldeviced3d11.cpp b/projects/clr/rocclr/runtime/device/pal/paldeviced3d11.cpp new file mode 100644 index 0000000000..e12cc14d5d --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/paldeviced3d11.cpp @@ -0,0 +1,142 @@ +#include "paldevice.hpp" + +#if defined(ATI_OS_LINUX) +namespace pal { +bool +Device::associateD3D11Device(void* d3d11Device) +{ + return false; +} +} +#else // !ATI_OS_LINUX + +#include + +/************************************************************************************************************** +* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch. +* This means OCL client spec will need to change to include headers directly from the DXX perforce tree. +* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change +* without notification. So it is safe to use a local copy of the relevant DXX extension interface classes. 
+**************************************************************************************************************/ +#include "DxxOpenCLInteropExt.h" + +namespace pal { + +static bool +queryD3D11DeviceGPUMask(ID3D11Device* pd3d11Device, UINT* pd3d11DeviceGPUMask) +{ + HMODULE hDLL = nullptr; + IAmdDxExt* pExt = nullptr; + IAmdDxExtCLInterop* pCLExt = nullptr; + PFNAmdDxExtCreate11 AmdDxExtCreate11; + HRESULT hr = S_OK; + + // Get a handle to the DXX DLL with extension API support +#if defined _WIN64 + static const CHAR dxxModuleName[13] = "atidxx64.dll"; +#else + static const CHAR dxxModuleName[13] = "atidxx32.dll"; +#endif + + hDLL = GetModuleHandle(dxxModuleName); + + if (hDLL == nullptr) { + hr = E_FAIL; + } + + // Get the exported AmdDxExtCreate() function pointer + if (SUCCEEDED(hr)) { + AmdDxExtCreate11 = reinterpret_cast( + GetProcAddress(hDLL, "AmdDxExtCreate11")); + if (AmdDxExtCreate11 == nullptr) { + hr = E_FAIL; + } + } + + // Create the extension object + if (SUCCEEDED(hr)) { + hr = AmdDxExtCreate11(pd3d11Device, &pExt); + } + + // Get the extension version information + if (SUCCEEDED(hr)) { + AmdDxExtVersion extVersion; + hr = pExt->GetVersion(&extVersion); + + if (extVersion.majorVersion == 0) { + hr = E_FAIL; + } + } + + // Get the OpenCL Interop interface + if (SUCCEEDED(hr)) { + pCLExt = static_cast( + pExt->GetExtInterface(AmdDxExtCLInteropID)); + if (pCLExt != nullptr) { + // Get the GPU mask using the CL Interop extension. 
+            pCLExt->QueryInteropGpuMask(pd3d11DeviceGPUMask);
+        }
+        else {
+            hr = E_FAIL;
+        }
+    }
+
+    // Drop the references acquired above
+    if (pCLExt != nullptr) {
+        pCLExt->Release();
+    }
+
+    if (pExt != nullptr) {
+        pExt->Release();
+    }
+
+    return (SUCCEEDED(hr));
+}
+
+//! Checks whether the given D3D11 device can interoperate with this PAL device.
+//! The adapter LUID reported by DXGI must match the PAL device's LUID, and
+//! this device's bit (1 << gpuIndex) must be part of the D3D11 device's GPU mask.
+//! \param d3d11Device  Opaque pointer, interpreted as ID3D11Device*
+//! \return true if interop is possible; any failing COM call yields false
+bool
+Device::associateD3D11Device(void* d3d11Device)
+{
+    ID3D11Device* pd3d11Device = static_cast<ID3D11Device*>(d3d11Device);
+    if (pd3d11Device == nullptr) {
+        return false;
+    }
+
+    // Every COM call below can fail; never dereference an interface that
+    // was not actually returned.
+    IDXGIDevice* pDXGIDevice = nullptr;
+    if (FAILED(pd3d11Device->QueryInterface(__uuidof(IDXGIDevice), (void **)&pDXGIDevice)) ||
+        (pDXGIDevice == nullptr)) {
+        return false;
+    }
+
+    IDXGIAdapter* pDXGIAdapter = nullptr;
+    if (FAILED(pDXGIDevice->GetAdapter(&pDXGIAdapter)) || (pDXGIAdapter == nullptr)) {
+        pDXGIDevice->Release();
+        return false;
+    }
+
+    DXGI_ADAPTER_DESC adapterDesc = {};
+
+    // match the adapter (a failed GetDesc() counts as "no match")
+    bool canInteroperate =
+        SUCCEEDED(pDXGIAdapter->GetDesc(&adapterDesc)) &&
+        (properties().osProperties.luidHighPart == adapterDesc.AdapterLuid.HighPart) &&
+        (properties().osProperties.luidLowPart == adapterDesc.AdapterLuid.LowPart);
+
+    UINT chainBitMask = 1 << properties().gpuIndex;
+
+    // match the chain ID
+    if (canInteroperate) {
+        UINT d3d11DeviceGPUMask = 0;
+
+        if (queryD3D11DeviceGPUMask(pd3d11Device, &d3d11DeviceGPUMask)) {
+            canInteroperate = (chainBitMask & d3d11DeviceGPUMask) != 0;
+        }
+        else {
+            // special handling for Intel iGPU + AMD dGPU in LDA mode
+            // (only occurs on a PX platform) where
+            // the D3D11Device object is created on the Intel iGPU and
+            // passed to AMD dGPU (secondary) to interoperate. 
+ if (chainBitMask > 1) { + canInteroperate = false; + } + } + } + + pDXGIDevice->Release(); + pDXGIAdapter->Release(); + + return canInteroperate; +} + +} // pal + +#endif // !ATI_OS_LINUX diff --git a/projects/clr/rocclr/runtime/device/pal/paldeviced3d9.cpp b/projects/clr/rocclr/runtime/device/pal/paldeviced3d9.cpp new file mode 100644 index 0000000000..98bc526a23 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/paldeviced3d9.cpp @@ -0,0 +1,53 @@ +#include "paldevice.hpp" + +#if defined(ATI_OS_LINUX) +namespace pal { +bool +Device::associateD3D9Device(void* d3dDevice) +{ + return false; +} +} +#else // !ATI_OS_LINUX + +#include +#include + +/************************************************************************************************************** +* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch. +* This means OCL client spec will need to change to include headers directly from the DXX perforce tree. +* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change +* without notification. So it is safe to use a local copy of the relevant DXX extension interface classes. 
+**************************************************************************************************************/ +#include "DxxOpenCLInteropExt.h" + +namespace pal { + +bool +Device::associateD3D9Device(void* d3d9Device) +{ + D3DCAPS9 pCaps; + IDirect3D9* p3d9dev; + LUID d3d9deviceLuid = {0, 0}; + + IDirect3DDevice9* pd3d9Device = static_cast(d3d9Device); + + // Get D3D9 Device caps + pd3d9Device->GetDeviceCaps(&pCaps); + // Get 3D9 Device + pd3d9Device->GetDirect3D(&p3d9dev); + + IDirect3D9Ex* p3d9devEx = static_cast(p3d9dev); + p3d9devEx->GetAdapterLUID(pCaps.AdapterOrdinal, &d3d9deviceLuid); + p3d9dev->Release(); + + // match the adapter + bool canInteroperate = + (properties().osProperties.luidHighPart == d3d9deviceLuid.HighPart) && + (properties().osProperties.luidLowPart == d3d9deviceLuid.LowPart); + + return canInteroperate; +} + +} // pal +#endif // !ATI_OS_WIN diff --git a/projects/clr/rocclr/runtime/device/pal/paldevicegl.cpp b/projects/clr/rocclr/runtime/device/pal/paldevicegl.cpp new file mode 100644 index 0000000000..5745252cf8 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/paldevicegl.cpp @@ -0,0 +1,306 @@ +#include "platform/context.hpp" +#include "device/device.hpp" +#include "platform/runtime.hpp" +#include "platform/agent.hpp" +#ifdef _WIN32 +#include +#include "CL/cl_d3d10.h" +#include "CL/cl_d3d11.h" +#endif // _WIN32 + +#include +#include +#include "CL/cl_gl.h" +#include "paldevice.hpp" +//#include "cwddeci.h" +#include +#include "GL/glATIInternal.h" +#ifdef ATI_OS_LINUX +#include +#include +#include "GL/glx.h" +#include "GL/glxext.h" +#include "GL/glXATIPrivate.h" +#else +#include "GL/wglATIPrivate.h" +#endif + +#ifdef ATI_OS_LINUX +typedef void* (*PFNGlxGetProcAddress)(const GLubyte* procName); +static PFNGlxGetProcAddress pfnGlxGetProcAddress=NULL; +static PFNGLXBEGINCLINTEROPAMD glXBeginCLInteropAMD = NULL; +static PFNGLXENDCLINTEROPAMD glXEndCLInteropAMD = NULL; +static PFNGLXRESOURCEATTACHAMD glXResourceAttachAMD = NULL; 
+static PFNGLXRESOURCEDETACHAMD glxResourceAcquireAMD = NULL; +static PFNGLXRESOURCEDETACHAMD glxResourceReleaseAMD = NULL; +static PFNGLXRESOURCEDETACHAMD glXResourceDetachAMD = NULL; +static PFNGLXGETCONTEXTMVPUINFOAMD glXGetContextMVPUInfoAMD = NULL; +#else +static PFNWGLBEGINCLINTEROPAMD wglBeginCLInteropAMD = NULL; +static PFNWGLENDCLINTEROPAMD wglEndCLInteropAMD = NULL; +static PFNWGLRESOURCEATTACHAMD wglResourceAttachAMD = NULL; +static PFNWGLRESOURCEDETACHAMD wglResourceAcquireAMD = NULL; +static PFNWGLRESOURCEDETACHAMD wglResourceReleaseAMD = NULL; +static PFNWGLRESOURCEDETACHAMD wglResourceDetachAMD = NULL; +static PFNWGLGETCONTEXTGPUINFOAMD wglGetContextGPUInfoAMD = NULL; +#endif + +namespace pal { + +bool +Device::initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const +{ +#ifdef ATI_OS_LINUX + GLXContext ctx = (GLXContext)GLplatformContext; + void * pModule = dlopen("libGL.so.1",RTLD_NOW); + + if(NULL == pModule) { + return false; + } + pfnGlxGetProcAddress = (PFNGlxGetProcAddress) dlsym(pModule,"glXGetProcAddress"); + + if (NULL == pfnGlxGetProcAddress) { + return false; + } + + if (!glXBeginCLInteropAMD || !glXEndCLInteropAMD || !glXResourceAttachAMD || + !glXResourceDetachAMD || !glXGetContextMVPUInfoAMD) { + glXBeginCLInteropAMD = (PFNGLXBEGINCLINTEROPAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXBeginCLInteroperabilityAMD"); + glXEndCLInteropAMD = (PFNGLXENDCLINTEROPAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXEndCLInteroperabilityAMD"); + glXResourceAttachAMD = (PFNGLXRESOURCEATTACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceAttachAMD"); + glxResourceAcquireAMD = (PFNGLXRESOURCEDETACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceAcquireAMD"); + glxResourceReleaseAMD = (PFNGLXRESOURCEDETACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceReleaseAMD"); + glXResourceDetachAMD = (PFNGLXRESOURCEDETACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceDetachAMD"); + glXGetContextMVPUInfoAMD = 
(PFNGLXGETCONTEXTMVPUINFOAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXGetContextMVPUInfoAMD"); + } + + if (!glXBeginCLInteropAMD || !glXEndCLInteropAMD || !glXResourceAttachAMD || + !glXResourceDetachAMD +#ifndef BRAHMA + || !glXGetContextMVPUInfoAMD +#endif + ) { + return false; + } +#else + if (!wglBeginCLInteropAMD || !wglEndCLInteropAMD || !wglResourceAttachAMD || + !wglResourceDetachAMD || !wglGetContextGPUInfoAMD) { + HGLRC fakeRC = NULL; + + if (!wglGetCurrentContext()) { + fakeRC = wglCreateContext((HDC)GLdeviceContext); + wglMakeCurrent((HDC)GLdeviceContext, fakeRC); + } + + wglBeginCLInteropAMD = (PFNWGLBEGINCLINTEROPAMD) wglGetProcAddress ("wglBeginCLInteroperabilityAMD"); + wglEndCLInteropAMD = (PFNWGLENDCLINTEROPAMD) wglGetProcAddress ("wglEndCLInteroperabilityAMD"); + wglResourceAttachAMD = (PFNWGLRESOURCEATTACHAMD) wglGetProcAddress ("wglResourceAttachAMD"); + wglResourceAcquireAMD = (PFNWGLRESOURCEDETACHAMD) wglGetProcAddress ("wglResourceAcquireAMD"); + wglResourceReleaseAMD = (PFNWGLRESOURCEDETACHAMD) wglGetProcAddress ("wglResourceReleaseAMD"); + wglResourceDetachAMD = (PFNWGLRESOURCEDETACHAMD) wglGetProcAddress ("wglResourceDetachAMD"); + wglGetContextGPUInfoAMD = (PFNWGLGETCONTEXTGPUINFOAMD) wglGetProcAddress ("wglGetContextGPUInfoAMD"); + + if (fakeRC) { + wglMakeCurrent(NULL, NULL); + wglDeleteContext(fakeRC); + } + } + if (!wglBeginCLInteropAMD || !wglEndCLInteropAMD || !wglResourceAttachAMD || + !wglResourceDetachAMD || !wglGetContextGPUInfoAMD) { + return false; + } +#endif + return true; +} + +bool +Device::glCanInterop(void* GLplatformContext, void* GLdeviceContext) const +{ + bool canInteroperate = false; + +#ifdef ATI_OS_WIN + LUID glAdapterLuid = {0, 0}; + UINT glChainBitMask = 0; + HGLRC hRC = (HGLRC)GLplatformContext; + + //get GL context's LUID and chainBitMask from UGL + if (wglGetContextGPUInfoAMD(hRC, &glAdapterLuid, &glChainBitMask)) { + // match the adapter + canInteroperate = + (properties().osProperties.luidHighPart == 
glAdapterLuid.HighPart) &&
+            (properties().osProperties.luidLowPart == glAdapterLuid.LowPart) &&
+            ((1 << properties().gpuIndex) == glChainBitMask);
+    }
+#else
+#ifdef BRAHMA
+    canInteroperate = true;
+#else
+    GLuint glDeviceId = 0;
+    GLuint glChainMask = 0;
+    GLXContext ctx = (GLXContext)GLplatformContext;
+
+    if (glXGetContextMVPUInfoAMD(ctx, &glDeviceId, &glChainMask)) {
+        // Interoperability is allowed only with a GL context that resides on a
+        // single GPU: the device ID must match and the GL chain mask must be
+        // exactly this device's bit.
+        canInteroperate =
+            (properties().deviceId == glDeviceId) &&
+            ((1 << properties().gpuIndex) == glChainMask);
+    }
+#endif
+#endif
+    return canInteroperate;
+}
+
+//! Starts CL/GL interoperability on the given GL context.
+//! Resolves the private GL extension entry points first and verifies that the
+//! GL context can interoperate with this device.
+//! \return true if the GL driver accepted the begin-interop request
+bool
+Device::glAssociate(void* GLplatformContext, void* GLdeviceContext) const
+{
+    //initialize pointers to the gl extension that supports interoperability
+    if (!initGLInteropPrivateExt(GLplatformContext, GLdeviceContext) ||
+        !glCanInterop(GLplatformContext, GLdeviceContext)) {
+        return false;
+    }
+
+    int flags = 0;
+/*
+    if (m_adp->pAsicInfo->svmFineGrainSystem)
+    {
+        flags = GL_INTEROP_SVM;
+    }
+*/
+#ifdef ATI_OS_LINUX
+    GLXContext ctx = (GLXContext)GLplatformContext;
+    return (glXBeginCLInteropAMD(ctx, 0)) ? true : false;
+#else
+    HGLRC hRC = (HGLRC)GLplatformContext;
+    return (wglBeginCLInteropAMD(hRC, flags)) ? true : false;
+#endif
+}
+
+//! Ends CL/GL interoperability on the given GL context.
+//! \return true if the GL driver accepted the end-interop request
+bool
+Device::glDissociate(void* GLplatformContext, void* GLdeviceContext) const
+{
+    int flags = 0;
+/*
+    if (m_adp->pAsicInfo->svmFineGrainSystem)
+    {
+        flags = GL_INTEROP_SVM;
+    }
+*/
+#ifdef ATI_OS_LINUX
+    GLXContext ctx = (GLXContext)GLplatformContext;
+    return (glXEndCLInteropAMD(ctx, 0)) ? true : false;
+#else
+    HGLRC hRC = (HGLRC)GLplatformContext;
+    return (wglEndCLInteropAMD(hRC, flags)) ? 
true : false;
+#endif
+}
+
+//! Attaches a GL resource (identified by name and type) for use by the runtime.
+//! The call into the GL driver is made while holding the PAL lock.
+//! \param GLContext    GL platform context (GLXContext on Linux, HGLRC otherwise)
+//! \param name         GL object name
+//! \param type         GL resource type
+//! \param handle       [out] resource handle reported by the GL driver
+//! \param mbResHandle  [out] mbResHandle value reported by the GL driver
+//! \param offset       [out] offset reported by the GL driver
+//! \return true on success; the output parameters are written only on success
+bool
+Device::resGLAssociate(
+    void* GLContext,
+    uint name,
+    uint type,
+    void** handle,
+    void** mbResHandle,
+    size_t* offset) const
+{
+    amd::ScopedLock lk(lockPAL());
+
+    GLResource hRes = {};
+    GLResourceData hData = {};
+
+    bool status = false;
+
+    hRes.type = type;
+    hRes.name = name;
+
+    hData.version = GL_RESOURCE_DATA_VERSION;
+#ifdef ATI_OS_LINUX
+    GLXContext ctx = (GLXContext)GLContext;
+    // Nothing else is copied out here: this signature has no attribute
+    // structure to receive a shared buffer ID.
+    if (glXResourceAttachAMD(ctx, &hRes, &hData)) {
+        status = true;
+    }
+#else
+    HGLRC hRC = (HGLRC)GLContext;
+    if (wglResourceAttachAMD(hRC, &hRes, &hData)) {
+        status = true;
+    }
+#endif
+
+    if (!status) {
+        return false;
+    }
+
+    *handle = reinterpret_cast<void*>(hData.handle);
+    *mbResHandle = reinterpret_cast<void*>(hData.mbResHandle);
+    *offset = static_cast<size_t>(hData.offset);
+
+    return status;
+}
+
+//! Acquires an attached GL resource for use by the runtime.
+//! On non-Linux builds the call goes through the thread's current GL context
+//! and reports success without calling the driver when none is current.
+//! \return true if the GL driver accepted the acquire request
+bool
+Device::resGLAcquire(void* GLplatformContext, void* mbResHandle, uint type) const
+{
+    amd::ScopedLock lk(lockPAL());
+
+    GLResource hRes = {};
+    hRes.mbResHandle = (GLuintp)mbResHandle;
+    hRes.type = type;
+
+#ifdef ATI_OS_LINUX
+    GLXContext ctx = (GLXContext) GLplatformContext;
+    return (glxResourceAcquireAMD(ctx, &hRes)) ? true : false;
+#else
+    HGLRC hRC = wglGetCurrentContext();
+    //! @todo A temporary workaround for MT issue in conformance fence_sync
+    if (0 == hRC) {
+        return true;
+    }
+    return (wglResourceAcquireAMD(hRC, &hRes)) ? true : false;
+#endif
+}
+
+//! Releases a previously acquired GL resource back to the GL driver.
+//! \return true if the GL driver accepted the release request
+bool
+Device::resGLRelease(void* GLplatformContext, void* mbResHandle, uint type) const
+{
+    amd::ScopedLock lk(lockPAL());
+
+    GLResource hRes = {};
+    hRes.mbResHandle = (GLuintp)mbResHandle;
+    hRes.type = type;
+#ifdef ATI_OS_LINUX
+    //TODO : make sure the application GL context is current. if not no
+    // point calling into the GL RT.
+    GLXContext ctx = (GLXContext) GLplatformContext;
+    return (glxResourceReleaseAMD(ctx, &hRes)) ? 
true : false; +#else + // Make the call into the GL driver only if the application GL context is current + HGLRC hRC = wglGetCurrentContext(); + //! @todo A temporary workaround for MT issue in conformance fence_sync + if (0 == hRC) { + return true; + } + return (wglResourceReleaseAMD(hRC, &hRes)) ? true : false; +#endif +} + +bool +Device::resGLFree(void* GLplatformContext, void* mbResHandle, uint type) const +{ + amd::ScopedLock lk(lockPAL()); + + GLResource hRes = {}; + hRes.mbResHandle = (GLuintp)mbResHandle; + hRes.type = type; +#ifdef ATI_OS_LINUX + GLXContext ctx = (GLXContext)GLplatformContext; + return (glXResourceDetachAMD(ctx, &hRes)) ? true : false; +#else + HGLRC hRC = (HGLRC)GLplatformContext; + return (wglResourceDetachAMD(hRC, &hRes)) ? true : false; +#endif +} + +} // pal diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.cpp b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp new file mode 100644 index 0000000000..268bb9eebc --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palkernel.cpp @@ -0,0 +1,1197 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+// +#include "device/pal/palkernel.hpp" +#include "device/pal/palprogram.hpp" +#include "device/pal/palblit.hpp" +#include "device/pal/palconstbuf.hpp" +#include "device/pal/palsched.hpp" +#include "platform/commandqueue.hpp" +#include "utils/options.hpp" + +#include "acl.h" +#include "SCShadersR678XXCommon.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace pal { + +inline static HSAIL_ARG_TYPE +GetHSAILArgType(const aclArgData* argInfo) +{ + switch (argInfo->type) { + case ARG_TYPE_POINTER: + return HSAIL_ARGTYPE_POINTER; + case ARG_TYPE_QUEUE: + return HSAIL_ARGTYPE_QUEUE; + case ARG_TYPE_VALUE: + return HSAIL_ARGTYPE_VALUE; + case ARG_TYPE_IMAGE: + return HSAIL_ARGTYPE_IMAGE; + case ARG_TYPE_SAMPLER: + return HSAIL_ARGTYPE_SAMPLER; + case ARG_TYPE_ERROR: + default: + return HSAIL_ARGTYPE_ERROR; + } +} + +inline static size_t +GetHSAILArgAlignment(const aclArgData* argInfo) +{ + switch (argInfo->type) { + case ARG_TYPE_POINTER: + return argInfo->arg.pointer.align; + default: + return 1; + } +} + +inline static HSAIL_ACCESS_TYPE +GetHSAILArgAccessType(const aclArgData* argInfo) +{ + if (argInfo->type == ARG_TYPE_POINTER) { + switch (argInfo->arg.pointer.type) { + case ACCESS_TYPE_RO: + return HSAIL_ACCESS_TYPE_RO; + case ACCESS_TYPE_WO: + return HSAIL_ACCESS_TYPE_WO; + case ACCESS_TYPE_RW: + default: + return HSAIL_ACCESS_TYPE_RW; + } + } + return HSAIL_ACCESS_TYPE_NONE; +} + +inline static HSAIL_ADDRESS_QUALIFIER +GetHSAILAddrQual(const aclArgData* argInfo) +{ + if (argInfo->type == ARG_TYPE_POINTER) { + switch (argInfo->arg.pointer.memory) { + case PTR_MT_CONSTANT_EMU: + case PTR_MT_CONSTANT: + case PTR_MT_UAV: + case PTR_MT_GLOBAL: + return HSAIL_ADDRESS_GLOBAL; + case PTR_MT_LDS_EMU: + case PTR_MT_LDS: + return HSAIL_ADDRESS_LOCAL; + case PTR_MT_SCRATCH_EMU: + return HSAIL_ADDRESS_GLOBAL; + case PTR_MT_ERROR: + default: + LogError("Unsupported address type"); + return HSAIL_ADDRESS_ERROR; + } + } + else if ((argInfo->type 
== ARG_TYPE_IMAGE) || + (argInfo->type == ARG_TYPE_SAMPLER)) { + return HSAIL_ADDRESS_GLOBAL; + } + else if (argInfo->type == ARG_TYPE_QUEUE) { + return HSAIL_ADDRESS_GLOBAL; + } + return HSAIL_ADDRESS_ERROR; +} + +/* f16 returns f32 - workaround due to comp lib */ +inline static HSAIL_DATA_TYPE +GetHSAILDataType(const aclArgData* argInfo) +{ + aclArgDataType dataType; + + if (argInfo->type == ARG_TYPE_POINTER) { + dataType = argInfo->arg.pointer.data; + } + else if (argInfo->type == ARG_TYPE_VALUE) { + dataType = argInfo->arg.value.data; + } + else { + return HSAIL_DATATYPE_ERROR; + } + switch (dataType) { + case DATATYPE_i1: + return HSAIL_DATATYPE_B1; + case DATATYPE_i8: + return HSAIL_DATATYPE_S8; + case DATATYPE_i16: + return HSAIL_DATATYPE_S16; + case DATATYPE_i32: + return HSAIL_DATATYPE_S32; + case DATATYPE_i64: + return HSAIL_DATATYPE_S64; + case DATATYPE_u8: + return HSAIL_DATATYPE_U8; + case DATATYPE_u16: + return HSAIL_DATATYPE_U16; + case DATATYPE_u32: + return HSAIL_DATATYPE_U32; + case DATATYPE_u64: + return HSAIL_DATATYPE_U64; + case DATATYPE_f16: + return HSAIL_DATATYPE_F32; + case DATATYPE_f32: + return HSAIL_DATATYPE_F32; + case DATATYPE_f64: + return HSAIL_DATATYPE_F64; + case DATATYPE_struct: + return HSAIL_DATATYPE_STRUCT; + case DATATYPE_opaque: + return HSAIL_DATATYPE_OPAQUE; + case DATATYPE_ERROR: + default: + return HSAIL_DATATYPE_ERROR; + } +} + +inline static int +GetHSAILArgSize(const aclArgData *argInfo) +{ + switch (argInfo->type) { + case ARG_TYPE_VALUE: + switch (GetHSAILDataType(argInfo)) { + case HSAIL_DATATYPE_B1: + return 1; + case HSAIL_DATATYPE_B8: + case HSAIL_DATATYPE_S8: + case HSAIL_DATATYPE_U8: + return 1; + case HSAIL_DATATYPE_B16: + case HSAIL_DATATYPE_U16: + case HSAIL_DATATYPE_S16: + case HSAIL_DATATYPE_F16: + return 2; + case HSAIL_DATATYPE_B32: + case HSAIL_DATATYPE_U32: + case HSAIL_DATATYPE_S32: + case HSAIL_DATATYPE_F32: + return 4; + case HSAIL_DATATYPE_B64: + case HSAIL_DATATYPE_U64: + case HSAIL_DATATYPE_S64: 
+ case HSAIL_DATATYPE_F64: + return 8; + case HSAIL_DATATYPE_STRUCT: + return argInfo->arg.value.numElements; + default: + return -1; + } + case ARG_TYPE_POINTER: + case ARG_TYPE_IMAGE: + case ARG_TYPE_SAMPLER: + case ARG_TYPE_QUEUE: + return sizeof(void*); + default: + return -1; + } +} + +inline static clk_value_type_t +GetOclType(const aclArgData* argInfo) +{ + static const clk_value_type_t ClkValueMapType[6][6] = { + { T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 }, + { T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 }, + { T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 }, + { T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 }, + { T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 }, + { T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 }, + }; + + uint sizeType; + if (argInfo->type == ARG_TYPE_QUEUE) { + return T_QUEUE; + } + if ((argInfo->type == ARG_TYPE_POINTER) || (argInfo->type == ARG_TYPE_IMAGE)) { + return T_POINTER; + } + else if (argInfo->type == ARG_TYPE_VALUE) { + switch (argInfo->arg.value.data) { + case DATATYPE_i8: + case DATATYPE_u8: + sizeType = 0; + break; + case DATATYPE_i16: + case DATATYPE_u16: + sizeType = 1; + break; + case DATATYPE_i32: + case DATATYPE_u32: + sizeType = 2; + break; + case DATATYPE_i64: + case DATATYPE_u64: + sizeType = 3; + break; + case DATATYPE_f16: + case DATATYPE_f32: + sizeType = 4; + break; + case DATATYPE_f64: + sizeType = 5; + break; + default: + return T_VOID; + } + switch (argInfo->arg.value.numElements) { + case 1: return ClkValueMapType[sizeType][0]; + case 2: return ClkValueMapType[sizeType][1]; + case 3: return ClkValueMapType[sizeType][2]; + case 4: return ClkValueMapType[sizeType][3]; + case 8: return ClkValueMapType[sizeType][4]; + case 16: return ClkValueMapType[sizeType][5]; + default: return T_VOID; + } + } + else if (argInfo->type == ARG_TYPE_SAMPLER) { + return T_SAMPLER; + } + else { + return T_VOID; + } +} + +inline static 
cl_kernel_arg_address_qualifier +GetOclAddrQual(const aclArgData* argInfo) +{ + if (argInfo->type == ARG_TYPE_POINTER) { + switch (argInfo->arg.pointer.memory) { + case PTR_MT_UAV: + case PTR_MT_GLOBAL: + return CL_KERNEL_ARG_ADDRESS_GLOBAL; + case PTR_MT_CONSTANT: + case PTR_MT_UAV_CONSTANT: + case PTR_MT_CONSTANT_EMU: + return CL_KERNEL_ARG_ADDRESS_CONSTANT; + case PTR_MT_LDS_EMU: + case PTR_MT_LDS: + return CL_KERNEL_ARG_ADDRESS_LOCAL; + default: + return CL_KERNEL_ARG_ADDRESS_PRIVATE; + } + } + else if (argInfo->type == ARG_TYPE_IMAGE) { + return CL_KERNEL_ARG_ADDRESS_GLOBAL; + } + //default for all other cases + return CL_KERNEL_ARG_ADDRESS_PRIVATE; +} + +inline static cl_kernel_arg_access_qualifier +GetOclAccessQual(const aclArgData* argInfo) +{ + if (argInfo->type == ARG_TYPE_IMAGE) { + switch (argInfo->arg.image.type) { + case ACCESS_TYPE_RO: + return CL_KERNEL_ARG_ACCESS_READ_ONLY; + case ACCESS_TYPE_WO: + return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; + case ACCESS_TYPE_RW: + return CL_KERNEL_ARG_ACCESS_READ_WRITE; + default: + return CL_KERNEL_ARG_ACCESS_NONE; + } + } + return CL_KERNEL_ARG_ACCESS_NONE; +} + +inline static cl_kernel_arg_type_qualifier +GetOclTypeQual(const aclArgData* argInfo) +{ + cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; + if (argInfo->type == ARG_TYPE_POINTER) { + if (argInfo->arg.pointer.isVolatile) { + rv |= CL_KERNEL_ARG_TYPE_VOLATILE; + } + if (argInfo->arg.pointer.isRestrict) { + rv |= CL_KERNEL_ARG_TYPE_RESTRICT; + } + if (argInfo->arg.pointer.isPipe) { + rv |= CL_KERNEL_ARG_TYPE_PIPE; + } + if (argInfo->isConst) { + rv |= CL_KERNEL_ARG_TYPE_CONST; + } + switch (argInfo->arg.pointer.memory) { + case PTR_MT_CONSTANT: + case PTR_MT_UAV_CONSTANT: + case PTR_MT_CONSTANT_EMU: + rv |= CL_KERNEL_ARG_TYPE_CONST; + break; + default: + break; + } + } + return rv; +} + +static int +GetOclSize(const aclArgData* argInfo) +{ + switch (argInfo->type) { + case ARG_TYPE_POINTER: return sizeof(void *); + case ARG_TYPE_VALUE: + //! 
\note OCL 6.1.5. For 3-component vector data types, + //! the size of the data type is 4 * sizeof(component). + switch (argInfo->arg.value.data) { + case DATATYPE_struct: + return 1 * argInfo->arg.value.numElements; + case DATATYPE_i8: + case DATATYPE_u8: + return 1 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); + case DATATYPE_u16: + case DATATYPE_i16: + case DATATYPE_f16: + return 2 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); + case DATATYPE_u32: + case DATATYPE_i32: + case DATATYPE_f32: + return 4 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); + case DATATYPE_i64: + case DATATYPE_u64: + case DATATYPE_f64: + return 8 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); + case DATATYPE_ERROR: + default: return -1; + } + case ARG_TYPE_IMAGE: return sizeof(cl_mem); + case ARG_TYPE_SAMPLER: return sizeof(cl_sampler); + case ARG_TYPE_QUEUE: return sizeof(cl_command_queue); + default: return -1; + } +} + +bool +HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym) +{ + if (!sym) { + return false; + } + uint64_t akc_addr = 0; + if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast(&akc_addr))) { + return false; + } + amd_kernel_code_t *akc = reinterpret_cast(akc_addr); + cpuAqlCode_ = akc; + if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE, reinterpret_cast(&codeSize_))) { + return false; + } + size_t akc_align = 0; + if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, reinterpret_cast(&akc_align))) { + return false; + } + code_ = new Memory(dev(), amd::alignUp(codeSize_, akc_align)); + Resource::MemoryType type = Resource::RemoteUSWC; + if (flags_.internalKernel_) { + type = Resource::RemoteUSWC; + } + // Initialize kernel ISA code + if (code_ && code_->create(type)) { + address cpuCodePtr = static_cast
(code_->map(nullptr, Resource::WriteOnly)); + // Copy only amd_kernel_code_t + memcpy(cpuCodePtr, reinterpret_cast
(akc), codeSize_); + code_->unmap(nullptr); + } + else { + LogError("Failed to allocate ISA code!"); + return false; + } + + assert((akc->workitem_private_segment_byte_size & 3) == 0 && + "Scratch must be DWORD aligned"); + workGroupInfo_.scratchRegs_ = + amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint); +/* + workGroupInfo_.availableSGPRs_ = dev().gslCtx()->getNumSGPRsAvailable(); + workGroupInfo_.availableVGPRs_ = dev().gslCtx()->getNumVGPRsAvailable(); + workGroupInfo_.preferredSizeMultiple_ = dev().getAttribs().wavefrontSize; + workGroupInfo_.wavefrontPerSIMD_ = dev().getAttribs().wavefrontSize; +*/ + workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size; + workGroupInfo_.localMemSize_ = + workGroupInfo_.usedLDSSize_ = akc->workgroup_group_segment_byte_size; + workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count; + workGroupInfo_.usedStackSize_ = 0; + workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count; + + return true; +} + +void +HSAILKernel::initArgList(const aclArgData* aclArg) +{ + // Initialize the hsail argument list too + initHsailArgs(aclArg); + + // Iterate through the arguments and insert into parameterList + device::Kernel::parameters_t params; + amd::KernelParameterDescriptor desc; + size_t offset = 0; + + // Reserved arguments for HSAIL launch + aclArg += MaxExtraArgumentsNum; + for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) { + desc.name_ = arguments_[i]->name_.c_str(); + desc.type_ = GetOclType(aclArg); + desc.addressQualifier_ = GetOclAddrQual(aclArg); + desc.accessQualifier_ = GetOclAccessQual(aclArg); + desc.typeQualifier_ = GetOclTypeQual(aclArg); + desc.typeName_ = arguments_[i]->typeName_.c_str(); + + // Make a check if it is local or global + if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { + desc.size_ = 0; + } + else { + desc.size_ = GetOclSize(aclArg); + } + + // Make offset alignment to match CPU metadata, since + // in multidevice config abstraction layer has a 
single signature + // and CPU sends the paramaters as they are allocated in memory + size_t size = desc.size_; + if (size == 0) { + // Local memory for CPU + size = sizeof(cl_mem); + } + offset = amd::alignUp(offset, std::min(size, size_t(16))); + desc.offset_ = offset; + offset += amd::alignUp(size, sizeof(uint32_t)); + params.push_back(desc); + + if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) { + flags_.imageEna_ = true; + if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) { + flags_.imageWriteEna_ = true; + } + } + } + + createSignature(params); +} + +void +HSAILKernel::initHsailArgs(const aclArgData* aclArg) +{ + int offset = 0; + + // Reserved arguments for HSAIL launch + aclArg += MaxExtraArgumentsNum; + + // Iterate through the each kernel argument + for (; aclArg->struct_size != 0; aclArg++) { + Argument* arg = new Argument; + // Initialize HSAIL kernel argument + arg->name_ = aclArg->argStr; + arg->typeName_ = aclArg->typeStr; + arg->size_ = GetHSAILArgSize(aclArg); + arg->offset_ = offset; + arg->type_ = GetHSAILArgType(aclArg); + arg->addrQual_ = GetHSAILAddrQual(aclArg); + arg->dataType_ = GetHSAILDataType(aclArg); + // If vector of args we add additional arguments to flatten it out + arg->numElem_ = ((aclArg->type == ARG_TYPE_VALUE) && + (aclArg->arg.value.data != DATATYPE_struct)) ? 
+ aclArg->arg.value.numElements : 1; + arg->alignment_ = GetHSAILArgAlignment(aclArg); + arg->access_ = GetHSAILArgAccessType(aclArg); + offset += GetHSAILArgSize(aclArg); + arguments_.push_back(arg); + } +} + +//! Fills printf_ from the ACL printf metadata: one entry per format ID, with the +//! escape sequences of the format string decoded and the argument sizes copied. +void +HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf) +{ + PrintfInfo info; + uint index = 0; + for (; aclPrintf->struct_size != 0; aclPrintf++) { + index = aclPrintf->ID; + if (printf_.size() <= index) { + printf_.resize(index + 1); + } + std::string pfmt = aclPrintf->fmtStr; + info.fmtString_.clear(); + size_t pos = 0; + // Drive the loop by the read position itself: an escape sequence consumes + // more than one input character, so counting iterations separately would + // keep reading after the end of the format string. + while (pos < pfmt.size()) { + char symbol = pfmt[pos++]; + if (symbol == '\\') { + // Rest of the C escape sequences (e.g. \') are handled correctly + // by the MDParser, we are not sure exactly how! + switch (pfmt[pos]) { + case 'a': + pos++; + symbol = '\a'; + break; + case 'b': + pos++; + symbol = '\b'; + break; + case 'f': + pos++; + symbol = '\f'; + break; + case 'n': + pos++; + symbol = '\n'; + break; + case 'r': + pos++; + symbol = '\r'; + break; + case 'v': + pos++; + symbol = '\v'; + break; + case '7': + if (pfmt[++pos] == '2') { + pos++; + symbol = '\72'; + } + break; + default: + break; + } + } + info.fmtString_.push_back(symbol); + } + info.fmtString_ += "\n"; + uint32_t *tmp_ptr = const_cast(aclPrintf->argSizes); + for (uint i = 0; i < aclPrintf->numSizes; i++ , tmp_ptr++) { + info.arguments_.push_back(*tmp_ptr); + } + printf_[index] = info; + info.arguments_.clear(); + } +} + +HSAILKernel::HSAILKernel(std::string name, + HSAILProgram* prog, + std::string compileOptions, + uint extraArgsNum) + : device::Kernel(name) + , compileOptions_(compileOptions) + // Stays null until aqlCreateHWInfo() resolves the kernel code object + , cpuAqlCode_(nullptr) + , dev_(prog->dev()) + , prog_(*prog) + , index_(0) + , code_(nullptr) + , codeSize_(0) + , hwMetaData_(nullptr) + , extraArgumentsNum_(extraArgsNum) +{ + hsa_ = true; +} + +HSAILKernel::~HSAILKernel() +{ + while (!arguments_.empty()) { + Argument* arg = arguments_.back(); + delete arg; + arguments_.pop_back(); + } + + delete [] hwMetaData_; + + 
delete code_; +} + +bool +HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize) +{ + if (extraArgumentsNum_ > MaxExtraArgumentsNum) { + LogError("Failed to initialize kernel: extra arguments number is bigger than is supported"); + return false; + } + acl_error error = ACL_SUCCESS; + std::string openClKernelName = openclMangledName(name()); + flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") != + std::string::npos) ? true: false; + //compile kernel down to ISA + if (finalize) { + std::string options(compileOptions_.c_str()); + options.append(" -just-kernel="); + options.append(openClKernelName.c_str()); + // Append an option so that we can selectively enable a SCOption on CZ + // whenever IOMMUv2 is enabled. + if (dev().settings().svmFineGrainSystem_) { + options.append(" -sc-xnack-iommu"); + } + error = aclCompile(dev().compiler(), prog().binaryElf(), + options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, nullptr); + buildLog_ += aclGetCompilerLog(dev().compiler()); + if (error != ACL_SUCCESS) { + LogError("Failed to finalize kernel"); + return false; + } + } + + // Allocate HW resources for the real program only + if (!prog().isNull()) { + aqlCreateHWInfo(sym); + } + + // Pull out metadata from the ELF + size_t sizeOfArgList; + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), + RT_ARGUMENT_ARRAY, openClKernelName.c_str(), nullptr, &sizeOfArgList); + if (error != ACL_SUCCESS) { + return false; + } + + char* aclArgList = new char[sizeOfArgList]; + if (nullptr == aclArgList) { + return false; + } + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), + RT_ARGUMENT_ARRAY, openClKernelName.c_str(), aclArgList, &sizeOfArgList); + if (error != ACL_SUCCESS) { + return false; + } + // Set the argList + initArgList(reinterpret_cast(aclArgList)); + delete [] aclArgList; + + size_t sizeOfWorkGroupSize; + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), + RT_WORK_GROUP_SIZE, openClKernelName.c_str(), nullptr, 
&sizeOfWorkGroupSize); + if (error != ACL_SUCCESS) { + return false; + } + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), + RT_WORK_GROUP_SIZE, openClKernelName.c_str(), + workGroupInfo_.compileSize_, &sizeOfWorkGroupSize); + if (error != ACL_SUCCESS) { + return false; + } + + //! @todo get the right value; + // Copy wavefront size + workGroupInfo_.wavefrontSize_ = 64;//dev().getAttribs().wavefrontSize; + // Find total workgroup size + if (workGroupInfo_.compileSize_[0] != 0) { + workGroupInfo_.size_ = + workGroupInfo_.compileSize_[0] * + workGroupInfo_.compileSize_[1] * + workGroupInfo_.compileSize_[2]; + } + else { + workGroupInfo_.size_ = dev().info().maxWorkGroupSize_; + } + + // Pull out printf metadata from the ELF + size_t sizeOfPrintfList; + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), + RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), nullptr, &sizeOfPrintfList); + if (error != ACL_SUCCESS) { + return false; + } + + // Make sure kernel has any printf info + if (0 != sizeOfPrintfList) { + char* aclPrintfList = new char[sizeOfPrintfList]; + if (nullptr == aclPrintfList) { + return false; + } + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), + RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), aclPrintfList, + &sizeOfPrintfList); + if (error != ACL_SUCCESS) { + return false; + } + + // Set the PrintfList + initPrintf(reinterpret_cast(aclPrintfList)); + delete [] aclPrintfList; + } + + aclMetadata md; + md.enqueue_kernel = false; + size_t sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel); + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), + RT_DEVICE_ENQUEUE, openClKernelName.c_str(), + &md.enqueue_kernel, &sizeOfDeviceEnqueue); + if (error != ACL_SUCCESS) { + return false; + } + flags_.dynamicParallelism_ = md.enqueue_kernel; + + md.kernel_index = -1; + size_t sizeOfIndex = sizeof(md.kernel_index); + error = aclQueryInfo(dev().compiler(), prog().binaryElf(), + RT_KERNEL_INDEX, openClKernelName.c_str(), + 
&md.kernel_index, &sizeOfIndex); + if (error != ACL_SUCCESS) { + return false; + } + index_ = md.kernel_index; + + return true; +} + +bool +HSAILKernel::validateMemory(uint idx, amd::Memory* amdMem) const +{ + // Check if memory doesn't require reallocation + bool noRealloc = true; + //amdMem->reallocedDeviceMemory(&dev())); + + return noRealloc; +} + +const Device& +HSAILKernel::dev() const +{ + return reinterpret_cast(dev_); +} + +const HSAILProgram& +HSAILKernel::prog() const +{ + return reinterpret_cast(prog_); +} + +void +HSAILKernel::findLocalWorkSize( + size_t workDim, + const amd::NDRange& gblWorkSize, + amd::NDRange& lclWorkSize) const +{ + // Initialize the default workgoup info + // Check if the kernel has the compiled sizes + if (workGroupInfo()->compileSize_[0] == 0) { + // Find the default local workgroup size, if it wasn't specified + if (lclWorkSize[0] == 0) { + size_t thrPerGrp; + bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE); + bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || + !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y); + bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || + !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || + !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z); + + bool overrideSet = ((workDim == 1) && b1DOverrideSet) || + ((workDim == 2) && b2DOverrideSet) || + ((workDim == 3) && b3DOverrideSet); + if (!overrideSet) { + // Find threads per group + thrPerGrp = workGroupInfo()->size_; + + // Check if kernel uses images + if (flags_.imageEna_ && + // and thread group is a multiple value of wavefronts + ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && + // and it's 2 or 3-dimensional workload + (workDim > 1) && + ((dev().settings().partialDispatch_) || + (((gblWorkSize[0] % 16) == 0) && + ((gblWorkSize[1] % 16) == 0)))) { + // Use 8x8 workgroup size if kernel has image writes + if (flags_.imageWriteEna_ || + (thrPerGrp != dev().info().maxWorkGroupSize_)) { + lclWorkSize[0] = 8; + 
lclWorkSize[1] = 8; + } + else { + lclWorkSize[0] = 16; + lclWorkSize[1] = 16; + } + if (workDim == 3) { + lclWorkSize[2] = 1; + } + } + else { + size_t tmp = thrPerGrp; + // Split the local workgroup into the most efficient way + for (uint d = 0; d < workDim; ++d) { + size_t div = tmp; + for (; (gblWorkSize[d] % div) != 0; div--); + lclWorkSize[d] = div; + tmp /= div; + } + + // Check if partial dispatch is enabled and + if (dev().settings().partialDispatch_ && + // we couldn't find optimal workload + (lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) { + size_t maxSize = 0; + size_t maxDim = 0; + for (uint d = 0; d < workDim; ++d) { + if (maxSize < gblWorkSize[d]) { + maxSize = gblWorkSize[d]; + maxDim = d; + } + } + // Check if a local workgroup has the most optimal size + if (thrPerGrp > maxSize) { + thrPerGrp = maxSize; + } + lclWorkSize[maxDim] = thrPerGrp; + for (uint d = 0; d < workDim; ++d) { + if (d != maxDim) { + lclWorkSize[d] = 1; + } + } + } + } + } + else { + // Use overrides when app doesn't provide workgroup dimensions + if (workDim == 1) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; + } + else if (workDim == 2) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; + lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; + } + else if (workDim == 3) { + lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; + lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; + lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; + } + else + { + assert(0 && "Invalid workDim!"); + } + } + } + } + else { + for (uint d = 0; d < workDim; ++d) { + lclWorkSize[d] = workGroupInfo()->compileSize_[d]; + } + } +} + +inline static void +WriteAqlArg( + unsigned char** dst,//!< The write pointer to the buffer + const void* src, //!< The source pointer + uint size, //!< The size in bytes to copy + uint alignment = 0 //!< The alignment to follow while writing to the buffer + ) +{ + if (alignment == 0) { + *dst = amd::alignUp(*dst, size); + } + else { + *dst = amd::alignUp(*dst, alignment); + } + 
memcpy(*dst, src, size); + *dst += size; +} + +const uint16_t kDispatchPacketHeader = + (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | + (1 << HSA_PACKET_HEADER_BARRIER) | + (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + +hsa_kernel_dispatch_packet_t* +HSAILKernel::loadArguments( + VirtualGPU& gpu, + const amd::Kernel& kernel, + const amd::NDRangeContainer& sizes, + const_address parameters, + bool nativeMem, + uint64_t vmDefQueue, + uint64_t* vmParentWrap, + std::vector& memList) const +{ + static const bool WaitOnBusyEngine = true; + uint64_t ldsAddress = ldsSize(); + address aqlArgBuf = gpu.cb(0)->sysMemCopy(); + address aqlStruct = gpu.cb(1)->sysMemCopy(); + bool srdResource = false; + + if (extraArgumentsNum_ > 0) { + assert(MaxExtraArgumentsNum >= 6 && "MaxExtraArgumentsNum has changed, the below algorithm should be changed accordingly"); + size_t extraArgs[MaxExtraArgumentsNum] = { 0, 0, 0, 0, 0, 0 }; + // The HLC generates up to 3 additional arguments for the global offsets + for (uint i = 0; i < sizes.dimensions(); ++i) { + extraArgs[i] = sizes.offset()[i]; + } + // Check if the kernel may have printf output + if ((printfInfo().size() > 0) && + // and printf buffer was allocated + (gpu.printfDbgHSA().dbgBuffer() != nullptr)) { + // and set the fourth argument as the printf_buffer pointer + extraArgs[3] = static_cast(gpu.printfDbgHSA().dbgBuffer()->vmAddress()); + memList.push_back(gpu.printfDbgHSA().dbgBuffer()); + } + if (dynamicParallelism()) { + // Provide the host parent AQL wrap object to the kernel + AmdAqlWrap* wrap = reinterpret_cast(aqlStruct); + memset(wrap, 0, sizeof(AmdAqlWrap)); + wrap->state = AQL_WRAP_BUSY; + ConstBuffer* cb = gpu.constBufs_[1]; + cb->uploadDataToHw(sizeof(AmdAqlWrap)); + *vmParentWrap = cb->vmAddress() + cb->wrtOffset(); + // and set 5th & 6th arguments + extraArgs[4] = vmDefQueue; + extraArgs[5] = *vmParentWrap; + 
memList.push_back(cb); + } + WriteAqlArg(&aqlArgBuf, extraArgs, sizeof(size_t)*extraArgumentsNum_, sizeof(size_t)); + } + + const amd::KernelSignature& signature = kernel.signature(); + const amd::KernelParameters& kernelParams = kernel.parameters(); + + // Find all parameters for the current kernel + for (uint i = 0; i != signature.numParameters(); ++i) { + const HSAILKernel::Argument* arg = argument(i); + const amd::KernelParameterDescriptor& desc = signature.at(i); + const_address paramaddr = parameters + desc.offset_; + + switch (arg->type_) { + case HSAIL_ARGTYPE_POINTER: + // If it is a global pointer + if (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) { + + Memory* gpuMem = nullptr; + amd::Memory* mem = nullptr; + + if (kernelParams.boundToSvmPointer(dev(), parameters, i)) { + WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr)); + mem = amd::SvmManager::FindSvmBuffer(*reinterpret_cast(paramaddr)); + if (mem != nullptr) { + gpuMem = dev().getGpuMemory(mem); + gpuMem->wait(gpu, WaitOnBusyEngine); + if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) { + mem->signalWrite(&dev()); + } + memList.push_back(gpuMem); + } + // If finegrainsystem is present then the pointer can be malloced by the app and + // passed to kernel directly. If so copy the pointer location to aqlArgBuf + else if ((dev().info().svmCapabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) == 0) { + return nullptr; + } + break; + } + if (nativeMem) { + gpuMem = *reinterpret_cast(paramaddr); + if (nullptr != gpuMem) { + mem = gpuMem->owner(); + } + } + else { + mem = *reinterpret_cast(paramaddr); + if (mem != nullptr) { + gpuMem = dev().getGpuMemory(mem); + } + } + if (gpuMem == nullptr) { + WriteAqlArg(&aqlArgBuf, &gpuMem, sizeof(void*)); + break; + } + + //! @todo 64 bit isn't supported with 32 bit binary + uint64_t globalAddress = gpuMem->vmAddress() + gpuMem->pinOffset(); + WriteAqlArg(&aqlArgBuf, &globalAddress, sizeof(void*)); + + // Wait for resource if it was used on an inactive engine + //! 
\note syncCache may call DRM transfer + gpuMem->wait(gpu, WaitOnBusyEngine); + + //! @todo Compiler has to return read/write attributes + if ((nullptr != mem) && + ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) { + mem->signalWrite(&dev()); + } + memList.push_back(gpuMem); + + // save the memory object pointer to allow global memory access + if (nullptr != dev().hwDebugMgr()) { + dev().hwDebugMgr()->assignKernelParamMem(i, gpuMem->owner()); + } + } + // If it is a local pointer + else { + assert((arg->addrQual_ == HSAIL_ADDRESS_LOCAL) && + "Unsupported address type"); + ldsAddress = amd::alignUp(ldsAddress, arg->alignment_); + WriteAqlArg(&aqlArgBuf, &ldsAddress, sizeof(size_t)); + ldsAddress += *reinterpret_cast(paramaddr); + } + break; + case HSAIL_ARGTYPE_VALUE: + // Special case for structrues + if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) { + // Copy the current structre into CB1 + memcpy(aqlStruct, paramaddr, arg->size_); + ConstBuffer* cb = gpu.constBufs_[1]; + cb->uploadDataToHw(arg->size_); + // Then use a pointer in aqlArgBuffer to CB1 + uint64_t gpuPtr = cb->vmAddress() + cb->wrtOffset(); + WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(void*)); + memList.push_back(cb); + } + else { + WriteAqlArg(&aqlArgBuf, paramaddr, + arg->numElem_ * arg->size_, arg->size_); + } + break; + case HSAIL_ARGTYPE_IMAGE: { + Image* image = nullptr; + amd::Memory* mem = nullptr; + if (nativeMem) { + image = static_cast(*reinterpret_cast(paramaddr)); + } + else { + mem = *reinterpret_cast(paramaddr); + if (mem == nullptr) { + LogError( "The kernel image argument isn't an image object!"); + return nullptr; + } + image = static_cast(dev().getGpuMemory(mem)); + } + + // Wait for resource if it was used on an inactive engine + //! 
\note syncCache may call DRM transfer + image->wait(gpu, WaitOnBusyEngine); + + if (dev().settings().hsailDirectSRD_) { + // Image arguments are of size 48 bytes and aligned to 16 bytes + WriteAqlArg(&aqlArgBuf, image->hwState(), + HsaImageObjectSize, HsaImageObjectAlignment); + } + else { + //! \note Special case for the image views. + //! Copy SRD to CB1, so blit manager will be able to release + //! this view without a wait for SRD resource. + if (image->memoryType() == Resource::ImageView) { + // Copy the current structre into CB1 + memcpy(aqlStruct, image->hwState(), HsaImageObjectSize); + ConstBuffer* cb = gpu.constBufs_[1]; + cb->uploadDataToHw(HsaImageObjectSize); + // Then use a pointer in aqlArgBuffer to CB1 + uint64_t srd = cb->vmAddress() + cb->wrtOffset(); + WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); + memList.push_back(cb); + } + else { + uint64_t srd = image->hwSrd(); + WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); + srdResource = true; + } + } + + //! @todo Compiler has to return read/write attributes + if ((nullptr != mem) && + ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) { + mem->signalWrite(&dev()); + } + + memList.push_back(image); + break; + } + case HSAIL_ARGTYPE_SAMPLER: { + const amd::Sampler* sampler = + *reinterpret_cast(paramaddr); + const Sampler* gpuSampler = static_cast + (sampler->getDeviceSampler(dev())); + if (dev().settings().hsailDirectSRD_) { + WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(), + HsaSamplerObjectSize, HsaSamplerObjectAlignment); + } + else { + uint64_t srd = gpuSampler->hwSrd(); + WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); + srdResource = true; + } + break; + } + case HSAIL_ARGTYPE_QUEUE: { + const amd::DeviceQueue* queue = + *reinterpret_cast(paramaddr); + VirtualGPU* gpuQueue = static_cast(queue->vDev()); + uint64_t vmQueue; + if (dev().settings().useDeviceQueue_) { + vmQueue = gpuQueue->vQueue()->vmAddress(); + } + else { + if (!gpu.createVirtualQueue(queue->size())) { + LogError("Virtual queue creation 
failed!"); + return nullptr; + } + vmQueue = gpu.vQueue()->vmAddress(); + } + WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*)); + break; + } + default: + LogError(" Unsupported address type "); + return nullptr; + } + } + + if (ldsAddress > dev().info().localMemSize_) { + LogError("No local memory available\n"); + return nullptr; + } + + // HSAIL kernarg segment size is rounded up to multiple of 16. + aqlArgBuf = amd::alignUp(aqlArgBuf, 16); + assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) && + "Size and the number of arguments don't match!"); + hsa_kernel_dispatch_packet_t* hsaDisp = + reinterpret_cast(aqlArgBuf); + + amd::NDRange local(sizes.local()); + const amd::NDRange& global = sizes.global(); + + // Check if runtime has to find local workgroup size + findLocalWorkSize(sizes.dimensions(), sizes.global(), local); + + hsaDisp->header = kDispatchPacketHeader; + hsaDisp->setup = sizes.dimensions(); + + hsaDisp->workgroup_size_x = local[0]; + hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1; + hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1; + + hsaDisp->grid_size_x = global[0]; + hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1; + hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? 
global[2] : 1; + hsaDisp->reserved2 = 0; + + // Initialize kernel ISA and execution buffer requirements + hsaDisp->private_segment_size = spillSegSize(); + hsaDisp->group_segment_size = ldsAddress - ldsSize(); + hsaDisp->kernel_object = gpuAqlCode()->vmAddress(); + + ConstBuffer* cb = gpu.constBufs_[0]; + cb->uploadDataToHw(argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t)); + uint64_t argList = cb->vmAddress() + cb->wrtOffset(); + + hsaDisp->kernarg_address = reinterpret_cast(argList); + hsaDisp->reserved2 = 0; + hsaDisp->completion_signal.handle = 0; + + memList.push_back(cb); + memList.push_back(gpuAqlCode()); + for (pal::Memory * mem : prog().globalStores()) { + memList.push_back(mem); + } + if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) { + memList.push_back(gpu.hsaQueueMem()); + } + + if (srdResource) { + dev().srds().fillResourceList(memList); + } + + return hsaDisp; +} + +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palkernel.hpp b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp new file mode 100644 index 0000000000..f2b6c870b3 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palkernel.hpp @@ -0,0 +1,263 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. +// +// Include guard is named after this header, matching the comment on the closing #endif +#ifndef PALKERNEL_HPP_ +#define PALKERNEL_HPP_ + +#include "device/device.hpp" +#include "utils/macros.hpp" +#include "platform/command.hpp" +#include "platform/program.hpp" +#include "platform/kernel.hpp" +#include "platform/sampler.hpp" +#include "device/pal/paldevice.hpp" +#include "device/pal/palvirtual.hpp" +#include "amd_hsa_kernel_code.h" +#include "device/pal/palprintf.hpp" +#include "device/pal/palwavelimiter.hpp" +#include "hsa.h" + +namespace amd { +namespace hsa { +namespace loader { +class Symbol; +} // loader +} // hsa +} // amd + +//! 
\namespace pal PAL Device Implementation +namespace pal { + +class VirtualGPU; +class Device; +class NullDevice; +class HSAILProgram; + +struct HWSHADER_Helper +{ + template + static T Get(S base, T offset) { + return reinterpret_cast(reinterpret_cast(base) + + reinterpret_cast(offset)); + } +}; + +#define HWSHADER_Get(shader, field) \ + HWSHADER_Helper::Get((shader), (shader)->field) + +template +static void CalcPtr(D& dst, const S src, size_t structSize, size_t size) { + dst = reinterpret_cast(reinterpret_cast(src) + + structSize * size); +} + +/*! \addtogroup pal PAL Device Implementation + * @{ + */ + +enum HSAIL_ADDRESS_QUALIFIER{ + HSAIL_ADDRESS_ERROR = 0, + HSAIL_ADDRESS_GLOBAL, + HSAIL_ADDRESS_LOCAL, + HSAIL_MAX_ADDRESS_QUALIFIERS +} ; + +enum HSAIL_ARG_TYPE{ + HSAIL_ARGTYPE_ERROR = 0, + HSAIL_ARGTYPE_POINTER, + HSAIL_ARGTYPE_VALUE, + HSAIL_ARGTYPE_IMAGE, + HSAIL_ARGTYPE_SAMPLER, + HSAIL_ARGTYPE_QUEUE, + HSAIL_ARGMAX_ARG_TYPES +}; + +enum HSAIL_DATA_TYPE{ + HSAIL_DATATYPE_ERROR = 0, + HSAIL_DATATYPE_B1, + HSAIL_DATATYPE_B8, + HSAIL_DATATYPE_B16, + HSAIL_DATATYPE_B32, + HSAIL_DATATYPE_B64, + HSAIL_DATATYPE_S8, + HSAIL_DATATYPE_S16, + HSAIL_DATATYPE_S32, + HSAIL_DATATYPE_S64, + HSAIL_DATATYPE_U8, + HSAIL_DATATYPE_U16, + HSAIL_DATATYPE_U32, + HSAIL_DATATYPE_U64, + HSAIL_DATATYPE_F16, + HSAIL_DATATYPE_F32, + HSAIL_DATATYPE_F64, + HSAIL_DATATYPE_STRUCT, + HSAIL_DATATYPE_OPAQUE, + HSAIL_DATATYPE_MAX_TYPES +}; + +enum HSAIL_ACCESS_TYPE { + HSAIL_ACCESS_TYPE_NONE = 0, + HSAIL_ACCESS_TYPE_RO, + HSAIL_ACCESS_TYPE_WO, + HSAIL_ACCESS_TYPE_RW +}; + +class HSAILKernel : public device::Kernel +{ +public: + struct Argument + { + std::string name_; //!< Argument's name + std::string typeName_; //!< Argument's type name + uint size_; //!< Size in bytes + uint offset_; //!< Argument's offset + uint alignment_; //!< Argument's alignment + HSAIL_ARG_TYPE type_; //!< Type of the argument + HSAIL_ADDRESS_QUALIFIER addrQual_; //!< Address qualifier of the argument + 
HSAIL_DATA_TYPE dataType_; //!< The type of data + uint numElem_; //!< Number of elements + HSAIL_ACCESS_TYPE access_; //!< Access type for the argument + }; + + // Max number of possible extra (hidden) kernel arguments + static const uint MaxExtraArgumentsNum = 6; + + HSAILKernel(std::string name, + HSAILProgram* prog, + std::string compileOptions, + uint extraArgsNum); + + virtual ~HSAILKernel(); + + //! Initializes the metadata required for this kernel, + //! finalizes the kernel if needed + bool init(amd::hsa::loader::Symbol *sym, bool finalize = false); + + //! Returns true if memory is valid for execution + virtual bool validateMemory(uint idx, amd::Memory* amdMem) const; + + //! Returns a pointer to the hsail argument + const Argument* argument(size_t i) const { return arguments_[i]; } + + //! Returns the number of hsail arguments + size_t numArguments() const { return arguments_.size(); } + + //! Returns GPU device object, associated with this kernel + const Device& dev() const; + + //! Returns HSA program associated with this kernel + const HSAILProgram& prog() const; + + //! Returns LDS size used in this kernel + uint32_t ldsSize() const + { return cpuAqlCode_->workgroup_group_segment_byte_size; } + + //! Returns pointer on CPU to AQL code info + const void* cpuAqlCode() const { return cpuAqlCode_; } + + //! Returns memory object with AQL code + pal::Memory* gpuAqlCode() const { return code_; } + + //! Returns size of AQL code + size_t aqlCodeSize() const { return codeSize_; } + + //! Returns the size of argument buffer + size_t argsBufferSize() const + { return cpuAqlCode_->kernarg_segment_byte_size; } + + //! Returns spill reg size per workitem + int spillSegSize() const + { return cpuAqlCode_->workitem_private_segment_byte_size; } + + //! Returns TRUE if kernel uses dynamic parallelism + bool dynamicParallelism() const + { return (flags_.dynamicParallelism_) ? true : false; } + + //! 
Returns TRUE if kernel is internal kernel + bool isInternalKernel() const + { return (flags_.internalKernel_) ? true : false; } + + //! Finds local workgroup size + void findLocalWorkSize( + size_t workDim, //!< Work dimension + const amd::NDRange& gblWorkSize,//!< Global work size + amd::NDRange& lclWorkSize //!< Local work size + ) const; + + //! Returns AQL packet in CPU memory + //! if the kernel arguments were successfully loaded, otherwise NULL + hsa_kernel_dispatch_packet_t* loadArguments( + VirtualGPU& gpu, //!< Running GPU context + const amd::Kernel& kernel, //!< AMD kernel object + const amd::NDRangeContainer& sizes, //!< NDrange container + const_address parameters, //!< Application arguments for the kernel + bool nativeMem, //!< Native memory objects are passed + uint64_t vmDefQueue, //!< GPU VM default queue pointer + uint64_t* vmParentWrap, //!< GPU VM parent aql wrap object + std::vector& memList //!< Memory list for GSL/VidMM handles + ) const; + + //! Returns printf info array + const std::vector& printfInfo() const { return printf_; } + + //! Returns the kernel index in the program + uint index() const { return index_; } + + //! Returns kernel's extra argument count + uint extraArgumentsNum() const { return extraArgumentsNum_; } + +private: + //! Disable copy constructor + HSAILKernel(const HSAILKernel&); + + //! Disable operator= + HSAILKernel& operator=(const HSAILKernel&); + + //! Creates AQL kernel HW info + bool aqlCreateHWInfo(amd::hsa::loader::Symbol *sym); + + //! Initializes arguments_ and the abstraction layer kernel parameters + void initArgList( + const aclArgData* aclArg //!< List of ACL arguments + ); + + //! Initializes Hsail Argument metadata and info + void initHsailArgs( + const aclArgData* aclArg //!< List of ACL arguments + ); + + //! 
Initializes Hsail Printf metadata and info + void initPrintf( + const aclPrintfFmt* aclPrintf //!< List of ACL printfs + ); + + std::vector arguments_; //!< Vector list of HSAIL Arguments + std::string compileOptions_; //!< Compile options used for finalizing this kernel + amd_kernel_code_t* cpuAqlCode_; //!< AQL kernel code on CPU + const NullDevice& dev_; //!< GPU device object + const HSAILProgram& prog_; //!< Reference to the parent program + std::vector printf_; //!< Format strings for GPU printf support + uint index_; //!< Kernel index in the program + + pal::Memory* code_; //!< Memory object with ISA code + size_t codeSize_; //!< Size of ISA code + + char* hwMetaData_; //!< SI metadata + + uint extraArgumentsNum_; //!< Number of extra (hidden) kernel arguments + + union Flags { + struct { + uint imageEna_: 1; //!< Kernel uses images + uint imageWriteEna_: 1; //!< Kernel uses image writes + uint dynamicParallelism_: 1; //!< Dynamic parallelism enabled + uint internalKernel_: 1; //!< True: internal kernel + }; + uint value_; + Flags(): value_(0) {} + } flags_; +}; + +/*@}*/} // namespace pal + +#endif /*PALKERNEL_HPP_*/ diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.cpp b/projects/clr/rocclr/runtime/device/pal/palmemory.cpp new file mode 100644 index 0000000000..79c12945d0 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palmemory.cpp @@ -0,0 +1,1271 @@ +//! 
Implementation of GPU device memory management + +#include "top.hpp" +#include "thread/thread.hpp" +#include "thread/monitor.hpp" +#include "device/device.hpp" +#include "device/pal/paldevice.hpp" +#include "device/pal/palblit.hpp" + +#ifdef _WIN32 +#include +#include "amdocl/cl_d3d9_amd.hpp" +#include "amdocl/cl_d3d10_amd.hpp" +#include "amdocl/cl_d3d11_amd.hpp" +#endif //_WIN32 +#include "amdocl/cl_gl_amd.hpp" + +#include +#include +#include +#include + +namespace pal { + +Memory::Memory( + const Device& gpuDev, + amd::Memory& owner, + size_t size) + : device::Memory(owner) + , Resource(gpuDev, size) +{ + init(); + + if (owner.parent() != nullptr) { + flags_ |= SubMemoryObject; + } +} + +Memory::Memory( + const Device& gpuDev, + size_t size) + : device::Memory(size) + , Resource(gpuDev, size) +{ + init(); +} + +Memory::Memory( + const Device& gpuDev, + amd::Memory& owner, + size_t width, + size_t height, + size_t depth, + cl_image_format format, + cl_mem_object_type imageType, + uint mipLevels + ) + : device::Memory(owner) + , Resource(gpuDev, width, height, depth, format, imageType, mipLevels) +{ + init(); + + if (owner.parent() != nullptr) { + flags_ |= SubMemoryObject; + } +} + +Memory::Memory( + const Device& gpuDev, + size_t size, + size_t width, + size_t height, + size_t depth, + cl_image_format format, + cl_mem_object_type imageType, + uint mipLevels + ) + : device::Memory(size) + , Resource(gpuDev, width, height, depth, format, imageType, mipLevels) +{ + init(); +} + +void +Memory::init() +{ + indirectMapCount_ = 0; + interopType_ = InteropNone; + interopMemory_ = nullptr; + pinnedMemory_ = nullptr; + parent_ = nullptr; +} + +#ifdef _WIN32 +static HANDLE +getSharedHandle(IUnknown* pIface) +{ + // Sanity checks + assert(pIface != nullptr); + + HRESULT hRes; + HANDLE hShared; + IDXGIResource* pDxgiRes = nullptr; + if((hRes = (const_cast(pIface))->QueryInterface( + __uuidof(IDXGIResource), + (void**) &pDxgiRes)) != S_OK) { + return (HANDLE) 0; + } + 
if(!pDxgiRes) { + return (HANDLE) 0; + } + hRes = pDxgiRes->GetSharedHandle(&hShared); + pDxgiRes->Release(); + if(hRes != S_OK) { + return (HANDLE) 0; + } + return hShared; +} +#endif //_WIN32 + +bool +Memory::create( + Resource::MemoryType memType, + Resource::CreateParams* params) +{ + bool result; + + // Reset the flag in case we reallocate the heap in local/remote + flags_ &= ~HostMemoryDirectAccess; + + // Create a resource in CAL + result = Resource::create(memType, params); + + // Check if CAL created a resource + if (result) { + switch (memoryType()) { + case Resource::Pinned: + case Resource::ExternalPhysical: + // Marks memory object for direct GPU access to the host memory + flags_ |= HostMemoryDirectAccess; + break; + case Resource::Remote: + case Resource::RemoteUSWC: + if (!desc().tiled_) { + // Marks memory object for direct GPU access to the host memory + flags_ |= HostMemoryDirectAccess; + } + break; + case Resource::View: { + Resource::ViewParams* view = + reinterpret_cast(params); + // Check if parent was allocated in system memory + if ((view->resource_->memoryType() == Resource::Pinned) || + (((view->resource_->memoryType() == Resource::Remote) || + (view->resource_->memoryType() == Resource::RemoteUSWC)) && + // @todo Enable unconditional optimization for remote memory + // Check for external allocation, to avoid the optimization + // for non-VM (double copy) mode + (owner() != nullptr) && + ((owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) || + dev().settings().remoteAlloc_))) { + // Marks memory object for direct GPU access to the host memory + flags_ |= HostMemoryDirectAccess; + } + if ((view->owner_ != nullptr) && (view->owner_->parent() != nullptr)) { + parent_ = reinterpret_cast(view->memory_); + flags_ |= SubMemoryObject; + } + break; + } + case Resource::ImageView: { + Resource::ImageViewParams* view = + reinterpret_cast(params); + parent_ = reinterpret_cast(view->memory_); + flags_ |= SubMemoryObject | (parent_->flags_ & 
HostMemoryDirectAccess); + break; + } + case Resource::ImageBuffer: { + Resource::ImageBufferParams* view = + reinterpret_cast(params); + parent_ = reinterpret_cast(view->memory_); + flags_ |= SubMemoryObject | (parent_->flags_ & HostMemoryDirectAccess); + break; + } + default: + break; + } + } + + return result; +} + +bool Memory::processGLResource(GLResourceOP operation) +{ + bool retVal = false; + switch (operation) + { + case GLDecompressResource: + retVal = gslGLAcquire(); + break; + case GLInvalidateFBO: + retVal = gslGLRelease(); + break; + default: + assert(false && "unknown GLResourceOP"); + } + return retVal; +} + +bool +Memory::createInterop(InteropType type) +{ + Resource::MemoryType memType = Resource::Empty; + Resource::OGLInteropParams oglRes; +#ifdef _WIN32 + Resource::D3DInteropParams d3dRes; +#endif //_WIN32 + + // Only external objects support interop + assert(owner() != nullptr); + + Resource::CreateParams* createParams = nullptr; + + amd::InteropObject* interop = owner()->getInteropObj(); + assert((interop != nullptr) && "An invalid interop object is impossible!"); + + amd::GLObject* glObject = interop->asGLObject(); +#ifdef _WIN32 + amd::D3D10Object* d3d10Object = interop->asD3D10Object(); + amd::D3D11Object* d3d11Object = interop->asD3D11Object(); + amd::D3D9Object* d3d9Object = interop->asD3D9Object(); + + if (d3d10Object != nullptr) { + createParams = &d3dRes; + + d3dRes.owner_ = owner(); + + const amd::D3D10ObjDesc_t* objDesc = d3d10Object->getObjDesc(); + + memType = Resource::D3D10Interop; + + // Get shared handle + if ((d3dRes.handle_ = + getSharedHandle(d3d10Object->getD3D10Resource()))) { + d3dRes.iDirect3D_ = static_cast + (d3d10Object->getD3D10Resource()); + d3dRes.type_ = Resource::InteropTypeless; + } + + d3dRes.misc = 0; + // Find D3D10 object type + switch (objDesc->objDim_) { + case D3D10_RESOURCE_DIMENSION_BUFFER: + d3dRes.type_ = Resource::InteropVertexBuffer; + break; + case D3D10_RESOURCE_DIMENSION_TEXTURE1D: + case 
D3D10_RESOURCE_DIMENSION_TEXTURE2D: + case D3D10_RESOURCE_DIMENSION_TEXTURE3D: + d3dRes.type_ = Resource::InteropTexture; + if (objDesc->mipLevels_ > 1) { + d3dRes.type_ = Resource::InteropTextureViewLevel; + + if (objDesc->arraySize_ > 1) { + d3dRes.layer_ = d3d10Object->getSubresource() / + objDesc->mipLevels_; + d3dRes.mipLevel_ = d3d10Object->getSubresource() % + objDesc->mipLevels_; + } + else { + d3dRes.layer_ = 0; + d3dRes.mipLevel_ = d3d10Object->getSubresource(); + } + } + break; + default: + return false; + break; + } + } + else if (d3d11Object != nullptr) { + createParams = &d3dRes; + + d3dRes.owner_ = owner(); + + const amd::D3D11ObjDesc_t* objDesc = d3d11Object->getObjDesc(); + + memType = Resource::D3D11Interop; + + // Get shared handle + if ((d3dRes.handle_ = + getSharedHandle(d3d11Object->getD3D11Resource()))) { + d3dRes.iDirect3D_ = static_cast + (d3d11Object->getD3D11Resource()); + d3dRes.type_ = Resource::InteropTypeless; + } + + d3dRes.misc = 0; + // Find D3D11 object type + switch (objDesc->objDim_) { + case D3D11_RESOURCE_DIMENSION_BUFFER: + d3dRes.type_ = Resource::InteropVertexBuffer; + break; + case D3D11_RESOURCE_DIMENSION_TEXTURE1D: + case D3D11_RESOURCE_DIMENSION_TEXTURE2D: + case D3D11_RESOURCE_DIMENSION_TEXTURE3D: + d3dRes.type_ = Resource::InteropTexture; + d3dRes.layer_= d3d11Object->getPlane(); + d3dRes.misc = d3d11Object->getMiscFlag(); + if (objDesc->mipLevels_ > 1) { + d3dRes.type_ = Resource::InteropTextureViewLevel; + + if (objDesc->arraySize_ > 1) { + d3dRes.layer_ = d3d11Object->getSubresource() / + objDesc->mipLevels_; + d3dRes.mipLevel_ = d3d11Object->getSubresource() % + objDesc->mipLevels_; + } + else { + d3dRes.layer_ = 0; + d3dRes.mipLevel_ = d3d11Object->getSubresource(); + } + } + break; + default: + return false; + break; + } + } + else if (d3d9Object != nullptr) { + createParams = &d3dRes; + + d3dRes.owner_ = owner(); + + const amd::D3D9ObjDesc_t* objDesc = d3d9Object->getObjDesc(); + + memType = 
Resource::D3D9Interop; + + // Get shared handle + if ((d3dRes.handle_ = d3d9Object->getD3D9SharedHandle())) { + d3dRes.iDirect3D_ = static_cast + (d3d9Object->getD3D9Resource()); + d3dRes.type_ = Resource::InteropSurface; + d3dRes.mipLevel_ = 0; + d3dRes.layer_ = d3d9Object->getPlane(); + d3dRes.misc = d3d9Object->getMiscFlag(); + } + } + else +#endif //_WIN32 + if (glObject != nullptr) { + createParams = &oglRes; + + oglRes.owner_ = owner(); + + memType = Resource::OGLInterop; + + // Fill the interop creation parameters + oglRes.handle_ = static_cast(glObject->getGLName()); + + // Find OGL object type + switch (glObject->getCLGLObjectType()) { + case CL_GL_OBJECT_BUFFER: + oglRes.type_ = Resource::InteropVertexBuffer; + break; + case CL_GL_OBJECT_TEXTURE_BUFFER: + case CL_GL_OBJECT_TEXTURE1D: + case CL_GL_OBJECT_TEXTURE1D_ARRAY: + case CL_GL_OBJECT_TEXTURE2D: + case CL_GL_OBJECT_TEXTURE2D_ARRAY: + case CL_GL_OBJECT_TEXTURE3D: + oglRes.type_ = Resource::InteropTexture; + if (GL_TEXTURE_CUBE_MAP == glObject->getGLTarget()) { + switch (glObject->getCubemapFace()) { + case GL_TEXTURE_CUBE_MAP_POSITIVE_X: + case GL_TEXTURE_CUBE_MAP_NEGATIVE_X: + case GL_TEXTURE_CUBE_MAP_POSITIVE_Y: + case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y: + case GL_TEXTURE_CUBE_MAP_POSITIVE_Z: + case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z: + oglRes.type_ = Resource::InteropTextureViewCube; + oglRes.layer_ = + glObject->getCubemapFace() - GL_TEXTURE_CUBE_MAP_POSITIVE_X; + oglRes.mipLevel_ = glObject->getGLMipLevel(); + break; + default: + break; + } + } + else if (glObject->getGLMipLevel() != 0) { + oglRes.type_ = Resource::InteropTextureViewLevel; + oglRes.layer_ = 0; + oglRes.mipLevel_ = glObject->getGLMipLevel(); + } + break; + case CL_GL_OBJECT_RENDERBUFFER: + oglRes.type_ = Resource::InteropRenderBuffer; + break; + default: + return false; + break; + } + } + else { + return false; + } + oglRes.glPlatformContext_ = owner()->getContext().info().hCtx_; + oglRes.glDeviceContext_ = 
owner()->getContext().info().hDev_; + // We dont pass any flags here for the GL Resource. + oglRes.flags_ = 0; + + // Get the interop settings + if (type == InteropDirectAccess) { + // Create memory object + if (!create(memType, createParams)) { + return false; + } + } + else { + // Allocate Resource object for interop as buffer + interopMemory_ = new Memory(dev(), size()); + + // Create the interop object in CAL + if (nullptr == interopMemory_ || !interopMemory_->create(memType, createParams)) { + delete interopMemory_; + interopMemory_ = nullptr; + return false; + } + } + + setInteropType(type); + + return true; +} + +Memory::~Memory() +{ + // Clean VA cache + dev().removeVACache(this); + + delete interopMemory_; + + // Release associated map target, if any + if (nullptr != mapMemory_) { + mapMemory()->unmap(nullptr); + mapMemory_->release(); + } + + // Destory pinned memory + if (flags_ & PinnedMemoryAlloced) { + delete pinnedMemory_; + } + + if ((owner() != nullptr) && isHostMemDirectAccess() && + !(flags_ & SubMemoryObject) && + (memoryType() != Resource::ExternalPhysical)) { + // Unmap memory if direct access was requested + unmap(nullptr); + } +} + +void +Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags) +{ + // If the last writer was another GPU, then make a writeback + if (!isHostMemDirectAccess() && + (owner()->getLastWriter() != nullptr) && + (&dev() != owner()->getLastWriter())) { + mgpuCacheWriteBack(); + } + + // If host memory doesn't have direct access, then we have to synchronize + if (!isHostMemDirectAccess() && (nullptr != owner()->getHostMem())) { + bool hasUpdates = true; + + // Make sure the parent of subbuffer is up to date + if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) { + pal::Memory* gpuMemory = dev().getGpuMemory(owner()->parent()); + + //! \note: Skipping the sync for a view doesn't reflect the parent settings, + //! 
since a view is a small portion of parent + device::Memory::SyncFlags syncFlagsTmp; + + // Sync parent from a view, so views have to be skipped + syncFlagsTmp.skipViews_ = true; + + // Make sure the parent sync is an unique operation. + // If the app uses multiple subbuffers from multiple queues, + // then the parent sync can be called from multiple threads + amd::ScopedLock lock(owner()->parent()->lockMemoryOps()); + gpuMemory->syncCacheFromHost(gpu, syncFlagsTmp); + //! \note Don't do early exit here, since we still have to sync + //! this view, if the parent sync operation was a NOP. + //! If parent was synchronized, then this view sync will be a NOP + } + + // Is this a NOP? + if ((version_ == owner()->getVersion()) || + (&dev() == owner()->getLastWriter())) { + hasUpdates = false; + } + + // Update all available views, since we sync the parent + if ((owner()->subBuffers().size() != 0) && + (hasUpdates || !syncFlags.skipViews_)) { + device::Memory::SyncFlags syncFlagsTmp; + + // Sync views from parent, so parent has to be skipped + syncFlagsTmp.skipParent_ = true; + + if (hasUpdates) { + // Parent will be synced so update all views with a skip + syncFlagsTmp.skipEntire_ = true; + } + else { + // Passthrough the skip entire flag to the views, since + // any view is a submemory of the parent + syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_; + } + + amd::ScopedLock lock(owner()->lockMemoryOps()); + for (auto& sub : owner()->subBuffers()) { + //! \note Don't allow subbuffer's allocation in the worker thread. + //! It may cause a system lock, because possible resource + //! 
destruction, heap reallocation or subbuffer allocation + static const bool AllocSubBuffer = false; + device::Memory* devSub = + sub->getDeviceMemory(dev(), AllocSubBuffer); + if (nullptr != devSub) { + pal::Memory* gpuSub = reinterpret_cast(devSub); + gpuSub->syncCacheFromHost(gpu, syncFlagsTmp); + } + } + } + + // Make sure we didn't have a NOP, + // because this GPU device was the last writer + if (&dev() != owner()->getLastWriter()) { + // Update the latest version + version_ = owner()->getVersion(); + } + + // Exit if sync is a NOP or sync can be skipped + if (!hasUpdates || syncFlags.skipEntire_) { + return; + } + + bool result = false; + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + + // If host memory was pinned then make a transfer + if (flags_ & PinnedMemoryAlloced) { + if (desc().buffer_) { + amd::Coord3D region(owner()->getSize()); + result = gpu.blitMgr().copyBuffer(*pinnedMemory_, + *this, origin, origin, region, Entire); + } + else { + amd::Image& image = static_cast(*owner()); + result = gpu.blitMgr().copyBufferToImage(*pinnedMemory_, + *this, origin, origin, image.getRegion(), Entire, + image.getRowPitch(), image.getSlicePitch()); + } + } + + if (!result) { + if (desc().buffer_) { + amd::Coord3D region(owner()->getSize()); + result = gpu.blitMgr().writeBuffer(owner()->getHostMem(), + *this, origin, region, Entire); + } + else { + amd::Image& image = static_cast(*owner()); + result = gpu.blitMgr().writeImage(owner()->getHostMem(), + *this, origin, image.getRegion(), + image.getRowPitch(), image.getSlicePitch(), Entire); + } + } + + //!@todo A wait isn't really necessary. However + //! Linux no-VM may have extra random failures. 
+ wait(gpu); + + // Should never fail + assert(result && "Memory synchronization failed!"); + } +} + +void +Memory::syncHostFromCache(device::Memory::SyncFlags syncFlags) +{ + // Sanity checks + assert(owner() != nullptr); + + // If host memory doesn't have direct access, then we have to synchronize + if (!isHostMemDirectAccess()) { + bool hasUpdates = true; + + // Make sure the parent of subbuffer is up to date + if (!syncFlags.skipParent_ && (flags_ & SubMemoryObject)) { + device::Memory* m = owner()->parent()->getDeviceMemory(dev()); + + //! \note: Skipping the sync for a view doesn't reflect the parent settings, + //! since a view is a small portion of parent + device::Memory::SyncFlags syncFlagsTmp; + + // Sync parent from a view, so views have to be skipped + syncFlagsTmp.skipViews_ = true; + + // Make sure the parent sync is an unique operation. + // If the app uses multiple subbuffers from multiple queues, + // then the parent sync can be called from multiple threads + amd::ScopedLock lock(owner()->parent()->lockMemoryOps()); + m->syncHostFromCache(syncFlagsTmp); + //! \note Don't do early exit here, since we still have to sync + //! this view, if the parent sync operation was a NOP. + //! If parent was synchronized, then this view sync will be a NOP + } + + // Is this a NOP? 
+ if ((nullptr == owner()->getLastWriter()) || + (version_ == owner()->getVersion())) { + hasUpdates = false; + } + + // Update all available views, since we sync the parent + if ((owner()->subBuffers().size() != 0) && + (hasUpdates || !syncFlags.skipViews_)) { + device::Memory::SyncFlags syncFlagsTmp; + + // Sync views from parent, so parent has to be skipped + syncFlagsTmp.skipParent_ = true; + + if (hasUpdates) { + // Parent will be synced so update all views with a skip + syncFlagsTmp.skipEntire_ = true; + } + else { + // Passthrough the skip entire flag to the views, since + // any view is a submemory of the parent + syncFlagsTmp.skipEntire_ = syncFlags.skipEntire_; + } + + amd::ScopedLock lock(owner()->lockMemoryOps()); + for (auto& sub : owner()->subBuffers()) { + //! \note Don't allow subbuffer's allocation in the worker thread. + //! It may cause a system lock, because possible resource + //! destruction, heap reallocation or subbuffer allocation + static const bool AllocSubBuffer = false; + device::Memory* devSub = + sub->getDeviceMemory(dev(), AllocSubBuffer); + if (nullptr != devSub) { + pal::Memory* gpuSub = reinterpret_cast(devSub); + gpuSub->syncHostFromCache(syncFlagsTmp); + } + } + } + + // Make sure we didn't have a NOP, + // because CPU was the last writer + if (nullptr != owner()->getLastWriter()) { + // Mark parent as up to date, set our version accordingly + version_ = owner()->getVersion(); + } + + // Exit if sync is a NOP or sync can be skipped + if (!hasUpdates || syncFlags.skipEntire_) { + return; + } + + bool result = false; + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + + // If backing store was pinned then make a transfer + if (flags_ & PinnedMemoryAlloced) { + if (desc().buffer_) { + amd::Coord3D region(owner()->getSize()); + result = dev().xferMgr().copyBuffer(*this, + *pinnedMemory_, origin, origin, region, Entire); + } + else { + amd::Image& image = static_cast(*owner()); + result = 
dev().xferMgr().copyImageToBuffer(*this, + *pinnedMemory_, origin, origin, image.getRegion(), Entire, + image.getRowPitch(), image.getSlicePitch()); + } + } + + // Just do a basic host read + if (!result) { + if (desc().buffer_) { + amd::Coord3D region(owner()->getSize()); + result = dev().xferMgr().readBuffer(*this, + owner()->getHostMem(), origin, region, Entire); + } + else { + amd::Image& image = static_cast(*owner()); + result = dev().xferMgr().readImage(*this, + owner()->getHostMem(), origin, image.getRegion(), + image.getRowPitch(), image.getSlicePitch(), Entire); + } + } + + // Should never fail + assert(result && "Memory synchronization failed!"); + } +} + +pal::Memory* +Memory::createBufferView(amd::Memory& subBufferOwner) +{ + pal::Memory* viewMemory; + Resource::ViewParams params; + + size_t offset = subBufferOwner.getOrigin(); + size_t size = subBufferOwner.getSize(); + + // Create a memory object + viewMemory = new pal::Memory(dev(), subBufferOwner, size); + if (nullptr == viewMemory) { + return nullptr; + } + + params.owner_ = &subBufferOwner; + params.gpu_ = static_cast(subBufferOwner.getVirtualDevice()); + params.offset_ = offset; + params.size_ = size; + params.resource_ = this; + params.memory_ = this; + if (!viewMemory->create(Resource::View, ¶ms)) { + delete viewMemory; + return nullptr; + } + + // Explicitly set the host memory location, + // because the parent location could change after reallocation + if (nullptr != owner()->getHostMem()) { + subBufferOwner.setHostMem( + reinterpret_cast(owner()->getHostMem()) + offset); + } + else { + subBufferOwner.setHostMem(nullptr); + } + + return viewMemory; +} + +void +Memory::decIndMapCount() +{ + // Map/unmap must be serialized + amd::ScopedLock lock(owner()->lockMemoryOps()); + + if (indirectMapCount_ == 0) { + if (!mipMapped()) { + LogError("decIndMapCount() called when indirectMapCount_ already zero"); + } + return; + } + + // Decrement the counter and release indirect map if it's the last op + 
if (--indirectMapCount_ == 0) {
+        if (nullptr != mapMemory_) {
+            amd::Memory* memory = mapMemory_;
+            amd::Memory* empty = nullptr;
+
+            // Get GPU memory
+            Memory* gpuMemory = mapMemory();
+            gpuMemory->unmap(nullptr);
+
+            if (!dev().addMapTarget(memory)) {
+                memory->release();
+            }
+
+            // Map/unamp is serialized for the same memory object,
+            // so it's safe to clear the pointer
+            assert((mapMemory_ != nullptr) && "Mapped buffer should be valid");
+            mapMemory_ = nullptr;
+        }
+    }
+}
+
+//! Returns a host pointer the caller can use as the target of an API-level map.
+//!
+//! Three strategies are tried in order: the owner's host allocation (direct
+//! map), the resource itself (persistent direct map), and finally a separate
+//! indirect map buffer stored in mapMemory_.
+//!
+//! Every successful call increments the indirect map counter; every path that
+//! returns nullptr after the increment undoes it, so a failed map never leaves
+//! the counter inflated.
+//!
+//! \note Must be called by the device under the async lock, so no spinning
+//! or long pauses allowed in this function.
+//! \note \a mapFlags, \a rowPitch and \a slicePitch are not used by this
+//! (buffer) implementation.
+//! \return Mapped host address, or nullptr on failure
+void*
+Memory::allocMapTarget(
+    const amd::Coord3D& origin,
+    const amd::Coord3D& region,
+    uint    mapFlags,
+    size_t* rowPitch,
+    size_t* slicePitch)
+{
+    // Sanity checks
+    assert(owner() != nullptr);
+
+    // Map/unmap must be serialized
+    amd::ScopedLock lock(owner()->lockMemoryOps());
+
+    address mapAddress = nullptr;
+    size_t  offset = origin[0];
+
+    // For SVM implementation, we cannot use cached map.
+    // If svm space, use the svm host pointer
+    void* initHostPtr = owner()->getSvmPtr();
+    if (nullptr != initHostPtr) {
+        owner()->commitSvmMemory();
+    }
+
+    // Memory shared by several devices needs a host allocation
+    if (owner()->numDevices() > 1) {
+        if ((nullptr == initHostPtr) && (owner()->getHostMem() == nullptr)) {
+            static const bool forceAllocHostMem = true;
+            if (!owner()->allocHostMemory(nullptr, forceAllocHostMem)) {
+                // The map counter hasn't been touched yet, nothing to undo
+                return nullptr;
+            }
+        }
+    }
+
+    incIndMapCount();
+    // If host memory exists, use it
+    if ((owner()->getHostMem() != nullptr) && isDirectMap()) {
+        mapAddress = reinterpret_cast<address>(owner()->getHostMem());
+    }
+    // If resource is a persistent allocation, we can use it directly
+    else if (isPersistentDirectMap()) {
+        if (nullptr == map(nullptr)) {
+            LogError("Could not map target persistent resource");
+            decIndMapCount();
+            return nullptr;
+        }
+        mapAddress = data();
+    }
+    // Otherwise we can use a remote resource:
+    else {
+        // Are we in range?
+        size_t elementCount = desc().width_;
+        size_t rSize = elementCount * elementSize();
+        if (offset >= rSize || offset + region[0] > rSize) {
+            LogWarning("Memory::allocMapTarget() - offset/size out of bounds");
+            // Balance incIndMapCount() above, otherwise every later
+            // indirect map of this object would see a stale counter
+            decIndMapCount();
+            return nullptr;
+        }
+
+        // Allocate a map resource if there isn't any yet
+        if (indirectMapCount_ == 1) {
+            const static bool SysMem = true;
+            bool    failed = false;
+            amd::Memory* memory = nullptr;
+            // Search for a possible indirect resource
+            cl_mem_flags flag = 0;
+            bool canBeCached = true;
+            if (nullptr != initHostPtr) {
+                // Make sure the host memory is committed already,
+                // or we have a big problem.
+                assert(owner()->isSvmPtrCommited() && "The host svm memory not committed yet!");
+                flag = CL_MEM_USE_HOST_PTR;
+                canBeCached = false;
+            }
+            else {
+                memory = dev().findMapTarget(owner()->getSize());
+            }
+
+            if (memory == nullptr) {
+                // For map target of svm buffer, we need to use svm host ptr
+                memory = new(dev().context())
+                    amd::Buffer(dev().context(), flag, owner()->getSize());
+                Memory* gpuMemory;
+
+                // Single-pass loop, so any failure can bail out with break
+                do {
+                    if ((memory == nullptr) || !memory->create(initHostPtr, SysMem)) {
+                        failed = true;
+                        break;
+                    }
+                    memory->setCacheStatus(canBeCached);
+
+                    gpuMemory = reinterpret_cast<Memory*>
+                        (memory->getDeviceMemory(dev()));
+
+                    // Create, Map and get the base pointer for the resource
+                    if ((gpuMemory == nullptr) || (nullptr == gpuMemory->map(nullptr))) {
+                        failed = true;
+                        break;
+                    }
+                }
+                while (false);
+            }
+
+            if (failed) {
+                if (memory != nullptr) {
+                    memory->release();
+                }
+                decIndMapCount();
+                LogError("Could not map target resource");
+                return nullptr;
+            }
+
+            // Map/unmap is serialized for the same memory object,
+            // so it's safe to assign the new pointer
+            assert((mapMemory_ == nullptr) && "Mapped buffer can't be valid");
+            mapMemory_ = memory;
+        }
+        else {
+            // Did the map resource allocation fail?
+            if (mapMemory_ == nullptr) {
+                LogError("Could not map target resource");
+                // Balance incIndMapCount() above
+                decIndMapCount();
+                return nullptr;
+            }
+        }
+        mapAddress = mapMemory()->data();
+
+        // Use start of the indirect buffer
+        offset = 0;
+    }
+
+    return mapAddress + offset;
+}
+
+//! Creates the pinned (system memory) companion object for this memory.
+//!
+//! For a view the pinned object is created as a view into the parent's
+//! pinned memory; otherwise it is created from owner()->getHostMemRef().
+//!
+//! \note \a hostPtr is not read by this implementation.
+//! \return true if the memory already has direct host access or the pinned
+//! object was created; false otherwise
+bool
+Memory::pinSystemMemory(void* hostPtr, size_t size)
+{
+    bool    result = false;
+
+    // If memory has a direct access already, then skip the host memory pinning
+    if (isHostMemDirectAccess()) {
+        return true;
+    }
+
+    // Destroy the old pinned memory if it was already allocated
+    if (flags_ & PinnedMemoryAlloced) {
+        delete pinnedMemory_;
+        flags_ &= ~PinnedMemoryAlloced;
+    }
+
+    // Allocate memory for the pinned object
+    pinnedMemory_ = new Memory(dev(), size);
+
+    if (pinnedMemory_ == nullptr) {
+        return false;
+    }
+
+    // Check if it's a view
+    if (flags_ & SubMemoryObject) {
+        const pal::Memory* gpuMemory;
+        if (owner() != nullptr) {
+            gpuMemory = dev().getGpuMemory(owner()->parent());
+        }
+        else {
+            gpuMemory = parent();
+        }
+
+        // The view parameters below come from the owner, so an ownerless
+        // view can't be pinned: fall through with result == false instead
+        // of dereferencing a null owner.
+        //! NOTE(review): confirm ownerless views never need pinning
+        if ((owner() != nullptr) && (gpuMemory != nullptr) &&
+            (gpuMemory->flags_ & PinnedMemoryAlloced)) {
+            Resource::ViewParams    params;
+            params.owner_       = owner();
+            params.offset_      = owner()->getOrigin();
+            params.size_        = owner()->getSize();
+            params.resource_    = gpuMemory->pinnedMemory_;
+            params.memory_      = nullptr;
+            result = pinnedMemory_->create(Resource::View, &params);
+        }
+    }
+    else if (owner() != nullptr) {
+        Resource::PinnedParams  params;
+        // Fill resource creation parameters
+        params.owner_       = owner();
+        params.hostMemRef_  = owner()->getHostMemRef();
+        params.size_        = size;
+
+        // Create resource
+        result = pinnedMemory_->create(Resource::Pinned, &params);
+    }
+
+    if (!result) {
+        delete pinnedMemory_;
+        pinnedMemory_ = nullptr;
+        return false;
+    }
+
+    flags_ |= PinnedMemoryAlloced;
+    return true;
+}
+
+//! Maps the device memory for CPU access.
+//!
+//! \a flags equal to CpuReadOnly / CpuWriteOnly is translated to the matching
+//! Resource flag; any other value maps with no extra resource flags.
+//! For non-buffer resources the row and slice pitches (in bytes) are stored
+//! through \a rowPitch / \a slicePitch when those pointers are provided.
+//! \return The pointer returned by map()
+void*
+Memory::cpuMap(
+    device::VirtualDevice& vDev, uint flags,
+    uint startLayer, uint numLayers,
+    size_t* rowPitch,
+    size_t* slicePitch)
+{
+    uint resFlags = 0;
+    if (flags == Memory::CpuReadOnly) {
+        resFlags = Resource::ReadOnly;
+    }
+    else if (flags == Memory::CpuWriteOnly) {
+        resFlags = Resource::WriteOnly;
+    }
+
+    void* ptr = map(&static_cast<VirtualGPU&>(vDev), resFlags, startLayer, numLayers);
+    if (!desc().buffer_) {
+        // Both pitch pointers are optional (they default to NULL in the
+        // declaration), so never write through a null pointer
+        if (rowPitch != nullptr) {
+            *rowPitch = desc().pitch_ * elementSize();
+        }
+        if (slicePitch != nullptr) {
+            *slicePitch = desc().slice_ * elementSize();
+        }
+    }
+    return ptr;
+}
+
+//! Unmaps the device memory previously mapped with cpuMap()
+void
+Memory::cpuUnmap(device::VirtualDevice& vDev)
+{
+    unmap(&static_cast<VirtualGPU&>(vDev));
+}
+
+//! \note moveTo() must be called only from outside of
+//! VirtualGPU submit command methods.
+//! Otherwise a deadlock in lockVgpus() is possible.
+//! Also the logic in this function is very specific to
+//! the zero-copy functionality.
+
+bool
+Memory::moveTo(Memory& dst)
+{
+    bool    result = false;
+
+    // Make sure that all virtual devices don't process any commands
+    Device::ScopedLockVgpus lock(dev());
+
+    // Wait for idle on all virtual GPUs
+    //!@note It's enough to wait on the active queue only
+    for (uint idx = 0; idx < dev().vgpus().size(); ++idx) {
+        wait(*(dev().vgpus()[idx]));
+    }
+
+    static const bool Entire  = true;
+    amd::Coord3D    origin(0, 0, 0);
+    amd::Coord3D    region(size());
+
+    // Transfer the data from old location to a new one
+    if (dev().xferMgr().copyBuffer(
+        *this, dst, origin, origin, region, Entire)) {
+        // Move all properties to the new object
+        dst.mapMemory_ = mapMemory_;
+        mapMemory_ = nullptr;
+
+        dst.flags_ |= flags_ & ~HostMemoryDirectAccess;
+        flags_ &= HostMemoryDirectAccess;
+
+        dst.indirectMapCount_ = indirectMapCount_;
+        indirectMapCount_ = 0;
+
+        dst.pinnedMemory_ = pinnedMemory_;
+        pinnedMemory_ = nullptr;
+
+        // Replace the device memory object
+        //! 
@note: current object will be destroyed
+        owner()->replaceDeviceMemory(&dev(), &dst);
+        result = true;
+    }
+
+    return result;
+}
+
+//! Returns the device memory object of the indirect map buffer,
+//! or nullptr if no indirect map buffer is attached
+Memory*
+Memory::mapMemory() const
+{
+    Memory* map = nullptr;
+    if (nullptr != mapMemory_) {
+        map = reinterpret_cast<Memory*>(mapMemory_->getDeviceMemory(dev()));
+    }
+    return map;
+}
+
+//! Writes the device copy back through the owner's host allocation.
+//! A host staging allocation is created (and pinning attempted) first
+//! if the owner doesn't have one yet.
+void
+Memory::mgpuCacheWriteBack()
+{
+    // Lock memory object, so only one write back can occur
+    amd::ScopedLock lock(owner()->lockMemoryOps());
+
+    // Attempt to allocate a staging buffer if don't have any
+    if (owner()->getHostMem() == nullptr) {
+        static const bool forceAllocHostMem = true;
+        if (owner()->allocHostMemory(nullptr, forceAllocHostMem)) {
+            //! \note Pinning is best effort: the result is deliberately ignored
+            static_cast<void>(pinSystemMemory(
+                owner()->getHostMem(), owner()->getHostMemRef()->size()));
+        }
+    }
+
+    // Make synchronization
+    if (owner()->getHostMem() != nullptr) {
+        owner()->cacheWriteBack();
+    }
+}
+
+//! Creates a pal::Buffer view of this buffer that covers the origin/size
+//! range reported by \a subBufferOwner.
+//! \return The new view, or nullptr if allocation or creation failed
+Memory*
+Buffer::createBufferView(amd::Memory& subBufferOwner) const
+{
+    pal::Memory* subBuffer;
+    Resource::ViewParams params;
+
+    size_t offset = subBufferOwner.getOrigin();
+    size_t size = subBufferOwner.getSize();
+
+    // Create a memory object
+    subBuffer = new pal::Buffer(dev(), subBufferOwner, size);
+    if (nullptr == subBuffer) {
+        return nullptr;
+    }
+
+    // Allocate a view for this buffer object
+    params.owner_ = &subBufferOwner;
+    params.offset_ = offset;
+    params.size_ = size;
+    params.resource_ = this;
+    params.memory_ = this;
+
+    if (!subBuffer->create(Resource::View, &params)) {
+        delete subBuffer;
+        return nullptr;
+    }
+
+    return subBuffer;
+}
+
+//! Returns a host pointer the caller can use as the target of an image map.
+//!
+//! The same three strategies as for buffers are tried: the owner's host
+//! allocation, a persistent direct map of the resource, and finally an
+//! indirect 1D map buffer stored in mapMemory_.
+//!
+//! The row and slice pitches of the returned mapping are stored through
+//! \a rowPitch / \a slicePitch when those pointers are provided. Every path
+//! that returns nullptr undoes the indirect map counter increment.
+//!
+//! \note \a mapFlags is not used by this implementation.
+//! \return Mapped host address, or nullptr on failure
+void*
+Image::allocMapTarget(
+    const amd::Coord3D& origin,
+    const amd::Coord3D& region,
+    uint    mapFlags,
+    size_t* rowPitch,
+    size_t* slicePitch)
+{
+    // Sanity checks
+    assert(owner() != nullptr);
+    bool    useRemoteResource = true;
+    // Pitches are computed locally, because both output pointers are optional
+    size_t  rowPitchTmp = 0;
+    size_t  slicePitchTmp = 0;
+    size_t  height = desc().height_;
+    size_t  depth = desc().depth_;
+
+    // Map/unmap must be serialized
+    amd::ScopedLock lock(owner()->lockMemoryOps());
+
+    address mapAddress = nullptr;
+    size_t  offset = origin[0];
+
+    incIndMapCount();
+
+    // If host memory exists, use it
+    if ((owner()->getHostMem() != nullptr) && isDirectMap()) {
+        useRemoteResource = false;
+        mapAddress = reinterpret_cast<address>(owner()->getHostMem());
+        amd::Image* amdImage = owner()->asImage();
+
+        // Calculate the offset in bytes
+        offset *= elementSize();
+
+        // Update the row and slice pitches value
+        rowPitchTmp = (amdImage->getRowPitch() == 0) ?
+            (desc().width_ * elementSize()) : amdImage->getRowPitch();
+        slicePitchTmp = (amdImage->getSlicePitch() == 0) ?
+            (height * rowPitchTmp) : amdImage->getSlicePitch();
+
+        // Adjust the offset in Y and Z dimensions
+        offset += origin[1] * rowPitchTmp;
+        offset += origin[2] * slicePitchTmp;
+    }
+    // If resource is a persistent allocation, we can use it directly
+    //! @note Even if resource is a persistent allocation,
+    //! runtime can't use it directly,
+    //! because CAL volume map doesn't work properly.
+    //! @todo arrays can be added for persistent lock with some CAL changes
+    else if (isPersistentDirectMap()) {
+        if (nullptr == map(nullptr)) {
+            useRemoteResource = true;
+            LogError("Could not map target persistent resource, try remote resource");
+        }
+        else {
+            useRemoteResource = false;
+            mapAddress = data();
+
+            // Calculate the offset in bytes
+            offset *= elementSize();
+
+            // Update the row pitch value
+            rowPitchTmp = desc().pitch_ * elementSize();
+
+            // Adjust the offset in Y dimension
+            offset += origin[1] * rowPitchTmp;
+        }
+    }
+
+    // Otherwise we can use a remote resource:
+    if (useRemoteResource) {
+        // Calculate X offset in bytes
+        offset *= elementSize();
+
+        // Allocate a map resource if there isn't any yet
+        if (indirectMapCount_ == 1) {
+            const static bool SysMem = true;
+            bool    failed = false;
+            amd::Memory* memory;
+
+            // Search for a possible indirect resource
+            memory = dev().findMapTarget(owner()->getSize());
+
+            if (memory == nullptr) {
+                // Allocate a new buffer to use as the map target
+                //! @note Allocate a 1D buffer, since CAL issues with 3D
+                //! Also HW doesn't support untiled images
+                memory = new (dev().context())
+                    amd::Buffer(dev().context(), 0,
+                        desc().width_ * height * depth * elementSize());
+
+                Memory* gpuMemory;
+                // Single-pass loop, so any failure can bail out with break
+                do {
+                    // Check the allocation before the first dereference
+                    if (memory == nullptr) {
+                        failed = true;
+                        break;
+                    }
+                    memory->setVirtualDevice(owner()->getVirtualDevice());
+
+                    if (!memory->create(nullptr, SysMem)) {
+                        failed = true;
+                        break;
+                    }
+
+                    gpuMemory = reinterpret_cast<Memory*>
+                        (memory->getDeviceMemory(dev()));
+
+                    // Create, Map and get the base pointer for the resource
+                    if ((gpuMemory == nullptr) || (nullptr == gpuMemory->map(nullptr))) {
+                        failed = true;
+                        break;
+                    }
+                }
+                while (false);
+            }
+
+            if (failed) {
+                if (memory != nullptr) {
+                    memory->release();
+                }
+                decIndMapCount();
+                LogError("Could not map target resource");
+                return nullptr;
+            }
+
+            // Map/unmap is serialized for the same memory object,
+            // so it's safe to assign the new pointer
+            assert((mapMemory_ == nullptr) && "Mapped buffer can't be valid");
+            mapMemory_ = memory;
+        }
+        else {
+            // Did the map resource allocation fail?
+            if (mapMemory_ == nullptr) {
+                LogError("Could not map target resource");
+                // Balance incIndMapCount() above
+                decIndMapCount();
+                return nullptr;
+            }
+        }
+
+        mapAddress = mapMemory()->data();
+
+        // Update the row and slice pitches value
+        rowPitchTmp = region[0] * elementSize();
+        if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+            slicePitchTmp = rowPitchTmp;
+        }
+        else {
+            slicePitchTmp = rowPitchTmp * region[1];
+        }
+        // Use start of the indirect buffer
+        offset = 0;
+    }
+
+    // Report the pitches only through the pointers the caller provided
+    if (rowPitch != nullptr) {
+        *rowPitch = rowPitchTmp;
+    }
+    if (slicePitch != nullptr) {
+        *slicePitch = slicePitchTmp;
+    }
+
+    return mapAddress + offset;
+}
+
+} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
new file mode 100644
index 0000000000..eae4ccfae0
--- /dev/null
+++ b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
@@ -0,0 +1,275 @@
+//
+// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+// +#ifndef PALMEMORY_HPP_ +#define PALMEMORY_HPP_ + +#include "top.hpp" +#include "thread/atomic.hpp" +#include "device/pal/palresource.hpp" +#include + +/*! \addtogroup GPU + * @{ + */ +namespace device { +class Memory; +} + +//! PAL Device Implementation +namespace pal { + +class Device; +class Heap; +class Resource; +class Memory; +class VirtualGPU; + +//! GPU memory object. +// Wrapper that can contain a heap block or an interop buffer/image. +class Memory: public device::Memory, public Resource +{ +public: + enum InteropType { + InteropNone = 0, //!< None interop memory + InteropHwEmulation = 1, //!< Uses HW emulaiton with calMemCopy + InteropDirectAccess = 2 //!< Uses direct access to the interop surface + }; + + //! Constructor (with owner) + Memory( + const Device& gpuDev, //!< GPU device object + amd::Memory& owner, //!< Abstraction layer memory object + size_t size //!< Memory size for allocation + ); + + //! Constructor (nonfat version for local scratch mem use without heap block) + Memory( + const Device& gpuDev, //!< GPU device object + size_t size //!< Memory size for allocation + ); + + //! Constructor memory for images (without global heap allocation) + Memory( + const Device& gpuDev, //!< GPU device object + amd::Memory& owner, //!< Abstraction layer memory object + size_t width, //!< Allocated memory width + size_t height, //!< Allocated memory height + size_t depth, //!< Allocated memory depth + cl_image_format format, //!< Memory format + cl_mem_object_type imageType, //!< CL image type + uint mipLevels //!< The number of mip levels + ); + + //! 
Constructor memory for images (without global heap allocation) + Memory( + const Device& gpuDev, //!< GPU device object + size_t size, //!< Memory object size + size_t width, //!< Allocated memory width + size_t height, //!< Allocated memory height + size_t depth, //!< Allocated memory depth + cl_image_format format, //!< Memory format + cl_mem_object_type imageType, //!< CL image type + uint mipLevels //!< The number of mip levels + ); + + //! Default destructor + ~Memory(); + + //! Creates the interop memory + bool createInterop( + InteropType type //!< The interop type + ); + + //! Overloads the resource create method + virtual bool create( + Resource::MemoryType memType, //!< Memory type + Resource::CreateParams* params = NULL //!< Parameters for create + ); + + //! Allocate memory for API-level maps + virtual void* allocMapTarget( + const amd::Coord3D& origin, //!< The map location in memory + const amd::Coord3D& region, //!< The map region in memory + uint mapFlags, //!< Map flags + size_t* rowPitch = NULL, //!< Row pitch for the mapped memory + size_t* slicePitch = NULL //!< Slice pitch for the mapped memory + ); + + //! Pins system memory associated with this memory object + virtual bool pinSystemMemory( + void* hostPtr, //!< System memory address + size_t size //!< Size of allocated system memory + ); + + //! Releases indirect map surface + virtual void releaseIndirectMap() { decIndMapCount(); } + + //! Map the device memory to CPU visible + virtual void* cpuMap( + device::VirtualDevice& vDev,//!< Virtual device for map operation + uint flags = 0, //!< flags for the map operation + // Optimization for multilayer map/unmap + uint startLayer = 0, //!< Start layer for multilayer map + uint numLayers = 0, //!< End layer for multilayer map + size_t* rowPitch = NULL, //!< Row pitch for the device memory + size_t* slicePitch = NULL //!< Slice pitch for the device memory + ); + + //!
Unmap the device memory + virtual void cpuUnmap( + device::VirtualDevice& vDev //!< Virtual device for unmap operation + ); + + //! Updates device memory from the owner's host allocation + void syncCacheFromHost( + VirtualGPU& gpu, //!< Virtual GPU device object + //! Synchronization flags + device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags() + ); + + //! Updates the owner's host allocation from device memory + virtual void syncHostFromCache( + //! Synchronization flags + device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags() + ); + + //! Creates a view from current resource + virtual Memory* createBufferView( + amd::Memory& subBufferOwner //!< The abstraction layer subbuf owner + ); + + //! Allocates host memory for synchronization with MGPU context + void mgpuCacheWriteBack(); + + //! Transfers the object's data to the destination object + bool moveTo(Memory& dst); + + //! Accessors for indirect map memory object + Memory* mapMemory() const; + + //! Returns the interop memory for this memory object + Memory* interop() const { return interopMemory_; } + + //! Gets interop type for this memory object + InteropType interopType() const { return interopType_; } + + //! Sets interop type for this memory object + void setInteropType(InteropType type) { interopType_ = type; } + + //! Set the owner + void setOwner(amd::Memory* owner) { owner_ = owner; } + + // Decompress GL depth-stencil/MSAA resources for CL access + // Invalidates any FBOs the resource may be bound to, otherwise the GL driver may crash. + virtual bool processGLResource(GLResourceOP operation); + + //! Returns the parent memory object of this memory object + const Memory* parent() const { return parent_; } + + //! Returns TRUE if direct map is acceptable. The method detects + //! forced USWC memory on APU and will cause a switch to + //!
indirect map for allocations with a possibility of host read + bool isDirectMap() + { + return (isCacheable() || !isHostMemDirectAccess() || + (owner()->getMemFlags() & + (CL_MEM_ALLOC_HOST_PTR | CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY))); + } + +protected: + //! Decrement map count + void decIndMapCount(); + + //! Initialize the object members + void init(); + +private: + //! Disable copy constructor + Memory(const Memory&); + + //! Disable operator= + Memory& operator=(const Memory&); + + InteropType interopType_; //!< Interop type + Memory* interopMemory_; //!< Interop memory + Memory* pinnedMemory_; //!< Memory used as pinned system memory + const Memory* parent_; //!< Parent memory object +}; + +//! Buffer flavour of the PAL memory object +class Buffer: public pal::Memory +{ +public: + //! Buffer constructor + Buffer( + const Device& gpuDev, //!< GPU device object + amd::Memory& owner, //!< Abstraction layer memory object + size_t size //!< Buffer size + ) + : pal::Memory(gpuDev, owner, size) + {} + + //! Creates a view from current resource + // NOTE(review): this declaration is const while Memory::createBufferView() is + // declared non-const, so it looks like it hides the base virtual instead of + // overriding it - confirm this is intended + virtual Memory* createBufferView( + amd::Memory& subBufferOwner //!< The abstraction layer subbuf owner + ) const; + +private: + //! Disable copy constructor + Buffer(const Buffer&); + + //! Disable operator= + Buffer& operator=(const Buffer&); +}; + +//! Image flavour of the PAL memory object +class Image: public pal::Memory +{ +public: + //! Image constructor + Image( + const Device& gpuDev, //!< GPU device object + amd::Memory& owner, //!< Abstraction layer memory object + size_t width, //!< Allocated memory width + size_t height, //!< Allocated memory height + size_t depth, //!< Allocated memory depth + cl_image_format format, //!< Memory format + cl_mem_object_type imageType, //!< CL image type + uint mipLevels //!< The number of mip levels + ) + : pal::Memory(gpuDev, owner, width, height, depth, format, imageType, mipLevels) + {} + + //!
Image constructor + Image( + const Device& gpuDev, //!< GPU device object + size_t size, //!< Memory size + size_t width, //!< Allocated memory width + size_t height, //!< Allocated memory height + size_t depth, //!< Allocated memory depth + cl_image_format format, //!< Memory format + cl_mem_object_type imageType, //!< CL image type + uint mipLevels //!< The number of mip levels + ) + : pal::Memory(gpuDev, size, width, height, depth, format, imageType, mipLevels) + {} + + //! Allocate memory for API-level maps + virtual void* allocMapTarget( + const amd::Coord3D& origin, //!< The map location in memory + const amd::Coord3D& region, //!< The map region in memory + uint mapFlags, //!< Map flags + size_t* rowPitch = NULL, //!< Row pitch for the mapped memory + size_t* slicePitch = NULL //!< Slice pitch for the mapped memory + ); + +private: + //! Disable copy constructor + Image(const Image&); + + //! Disable operator= + Image& operator=(const Image&); +}; + +/*@}*/} // namespace pal + +#endif // PALMEMORY_HPP_ diff --git a/projects/clr/rocclr/runtime/device/pal/palprintf.cpp b/projects/clr/rocclr/runtime/device/pal/palprintf.cpp new file mode 100644 index 0000000000..40d902b377 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palprintf.cpp @@ -0,0 +1,714 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+// +#include "top.hpp" +#include "os/os.hpp" +#include "device/device.hpp" +#include "device/pal/paldefs.hpp" +#include "device/pal/palmemory.hpp" +#include "device/pal/palkernel.hpp" +#include "device/pal/palprogram.hpp" +#include "device/pal/palprintf.hpp" +#include +#include +#include + +namespace pal { + +// Binds the object to the device; the debug buffer itself is allocated lazily by allocate() +PrintfDbg::PrintfDbg(Device& device, FILE* file) + : dbgBuffer_(nullptr) + , dbgFile_(file) + , gpuDevice_(device) + , xferBufRead_(nullptr) + , wiDbgSize_(0) + , initCntValue_(device, 4) +{ +} + +PrintfDbg::~PrintfDbg() +{ + delete dbgBuffer_; +} + +// Creates the small resource that holds the initial write-pointer value (1) +bool +PrintfDbg::create() +{ + // Create a resource for the init count value + if (initCntValue_.create(Resource::Remote)) { + uint32_t* value = reinterpret_cast(initCntValue_.map(nullptr)); + // The counter starts from 1 + if (nullptr != value) { + *value = 1; + } + else { + return false; + } + initCntValue_.unmap(nullptr); + return true; + } + return false; +} + +// Allocates the debug buffer and (re)initializes the per-workitem write pointers +bool +PrintfDbg::init( + VirtualGPU& gpu, + bool printfEnabled, + const amd::NDRange& size) +{ + // Set up debug output buffer (if printf active) + if (printfEnabled) { + if (!allocate()) { + return false; + } + + // Make sure that the size isn't bigger than the reported max + if (size.product() <= dev().settings().maxWorkGroupSize_) { + size_t wiDbgSizeTmp; + + // Calculate the debug buffer size per workitem + wiDbgSizeTmp = std::min(dbgBuffer_->size() / size.product(), + dev().xferRead().bufSize()); + + // Make sure the size is DWORD aligned + wiDbgSizeTmp = amd::alignDown(wiDbgSizeTmp, sizeof(uint32_t)); + + // If the new size is different, then clear the initial values + if (wiDbgSize_ != wiDbgSizeTmp) { + wiDbgSize_ = wiDbgSizeTmp; + if (!clearWorkitems(gpu, 0, size.product())) { + wiDbgSize_ = 0; + return false; + } + } + } + } + + return true; +} + +bool +PrintfDbg::output( + VirtualGPU& gpu, + bool printfEnabled, + const amd::NDRange& size, + const std::vector& printfInfo) +{ + // Are we expected to generate debug output?
+ if (printfEnabled && !printfInfo.empty()) { + uint32_t* workitemData; + size_t i, j, k, z; + bool realloc = false; + + // Wait for kernel execution + gpu.waitAllEngines(); + + size_t zdim = 1; + size_t ydim = 1; + size_t xdim = 1; + + switch (size.dimensions()) { + case 3: + zdim = size[2]; + // Fall through ... + case 2: + ydim = size[1]; + // Fall through ... + case 1: + xdim = size[0]; + // Fall through ... + default: + break; + } + + for (k = 0; k < zdim; ++k) { + for (j = 0; j < ydim; ++j) { + for (i = 0; i < xdim; ++i) { + size_t idx = (xdim * (ydim * k + j) + i); + workitemData = mapWorkitem(gpu, idx, &realloc); + + if (nullptr != workitemData) { + uint32_t wp = workitemData[0]; // write pointer (i.e. first unwritten element) + // Walk through each PrintfDbg entry + for (z = 1; (z < (wiDbgSize() / sizeof(uint32_t))) && (z < wp); ) { + // A valid format-string index is strictly below printfInfo.size() + if (printfInfo.size() <= workitemData[z]) { + LogError("The format string wasn't reported"); + // Unmap the workitem and hand the staging buffer back before bailing out + unmapWorkitem(gpu, workitemData); + return false; + } + // Get the PrintfDbg info + const PrintfInfo& info = printfInfo[workitemData[z++]]; + // There's something in this buffer + outputDbgBuffer(info, workitemData, z); + } + } + unmapWorkitem(gpu, workitemData); + } + } + } + + // Reallocate debug buffer if necessary + if (!allocate(realloc)) { + return false; + } + } + return true; +} + +// Allocates the debug buffer on first use; doubles its size when realloc is requested +bool +PrintfDbg::allocate(bool realloc) +{ + if (nullptr == dbgBuffer_) { + dbgBuffer_ = dev().createScratchBuffer(dev().info().printfBufferSize_); + } + else if (realloc) { + LogWarning("Debug buffer reallocation!"); + // Double the buffer size if it's not big enough + size_t size = dbgBuffer_->size(); + delete dbgBuffer_; + dbgBuffer_ = dev().createScratchBuffer(size << 1); + } + + return (nullptr != dbgBuffer_) ?
true : false; +} + +// Returns true if the last character of fmt is a floating point conversion specifier +bool +PrintfDbg::checkFloat(const std::string& fmt) const +{ + // An empty format has no conversion specifier to inspect + if (fmt.empty()) { + return false; + } + switch (fmt[fmt.size() - 1]) { + case 'e': + case 'E': + case 'f': + case 'g': + case 'G': + case 'a': + return true; + break; + default: + break; + } + return false; +} + +// Returns true if the last character of fmt is the 's' conversion specifier +bool +PrintfDbg::checkString(const std::string& fmt) const +{ + // An empty format has no conversion specifier to inspect + if (fmt.empty()) { + return false; + } + if (fmt[fmt.size() - 1] == 's') + return true; + return false; +} + +// Returns the vector width (2, 3, 4, 8 or 16) when a 'v' specifier precedes curPos, 0 otherwise. +// On a match curPos is overwritten with the distance from the old curPos back to the width digit. +int +PrintfDbg::checkVectorSpecifier( + const std::string& fmt, + size_t startPos, + size_t& curPos) const +{ + int vectorSize = 0; + size_t pos = curPos; + size_t size = curPos - startPos; + + if (size >= 3) { + size = 0; + //no modifiers + if (fmt[curPos - 3] == 'v') { + size = 2; + } + //the modifiers are "h" or "l" + else if (fmt[curPos - 4] == 'v') { + size = 3; + } + //the modifier is "hh" + else if ((curPos >= 5) && (fmt[curPos - 5] == 'v')) { + size = 4; + } + if (size > 0) { + curPos = size; + pos -= curPos; + + // Get vector size + vectorSize = fmt[pos++] - '0'; + // PrintfDbg supports only 2, 3, 4, 8 and 16 wide vectors + switch (vectorSize) { + case 1: + if ((fmt[pos++] - '0') == 6) { + vectorSize = 16; + } + else { + vectorSize = 0; + } + break; + case 2: + case 3: + case 4: + case 8: + break; + default: + vectorSize = 0; + break; + } + } + } + + return vectorSize; +} + +static const size_t ConstStr = 0xffffffff; +static const char Separator[] = ",\0"; + +size_t +PrintfDbg::outputArgument( + const std::string& fmt, + bool printFloat, + size_t size, + const uint32_t* argument) const +{ + // Serialize the output to the screen + amd::ScopedLock k(dev().lockAsyncOps()); + + size_t copiedBytes = size; + // Print the string argument, using standard PrintfDbg() + if (checkString(fmt.c_str())) { + //copiedBytes should be as number of printed chars + copiedBytes = 0; + //(null) should be printed + if (*argument == 0) { + amd::Os::printf(fmt.data(),0); + //copiedBytes = strlen("(null)") + copiedBytes = 6; + } + else { + const unsigned char* argumentStr =
reinterpret_cast(argument); + amd::Os::printf(fmt.data(),argumentStr); + //copiedBytes = strlen(argumentStr) + while (argumentStr[copiedBytes++] != 0); + } + } + + // Print the argument(except for string ), using standard PrintfDbg() + else { + bool hlModifier = (strstr(fmt.c_str(),"hl") != nullptr); + std::string hlFmt; + if (hlModifier) { + hlFmt = fmt; + // Erase the "hl" modifier itself. fmt ends with the conversion specifier, so the + // last "hl" is the modifier; find_first_of() would match the first single 'h' or + // 'l' character, which may belong to the literal text in front of '%' + hlFmt.erase(hlFmt.rfind("hl"),2); + } + switch (size) { + case 0: { + const char* str = reinterpret_cast(argument); + amd::Os::printf(fmt.data(), str); + // Find the string length + while (str[copiedBytes++] != 0); + } + break; + case 1: + amd::Os::printf(fmt.data(), *(reinterpret_cast(argument))); + break; + case 2: + case 4: + if (printFloat) { + static const char* fSpecifiers = "eEfgGa"; + std::string fmtF = fmt; + size_t posS = fmtF.find_first_of("%"); + size_t posE = fmtF.find_first_of(fSpecifiers); + if (posS != std::string::npos &&posE != std::string::npos) { + fmtF.replace(posS+1,posE-posS,"s"); + } + float fArg = *(reinterpret_cast(argument)); + float fSign = copysign(1.0,fArg); + if (isinf(fArg)&&!isnan(fArg)) { + if(fSign < 0) { + amd::Os::printf(fmtF.data(),"-infinity"); + } + else { + amd::Os::printf(fmtF.data(),"infinity"); + } + } + else if (isnan(fArg)) { + if(fSign < 0) { + amd::Os::printf(fmtF.data(),"-nan"); + } + else { + amd::Os::printf(fmtF.data(),"nan"); + } + } + else if (hlModifier) { + amd::Os::printf(hlFmt.data(),fArg); + } + else { + amd::Os::printf(fmt.data(),fArg); + } + } + else { + bool hhModifier = (strstr(fmt.c_str(),"hh") != nullptr); + if (hhModifier) { + //current implementation of printf in gcc 4.5.2 runtime libraries, doesn`t recognize "hh" modifier ==> + //argument should be explicitly converted to unsigned char (uchar) before printing and + //fmt should be updated not to contain "hh" modifier + std::string hhFmt = fmt; + // Erase the "hh" modifier itself rather than two characters starting at the first 'h' in the string + hhFmt.erase(hhFmt.rfind("hh"),2); + amd::Os::printf(hhFmt.data(), *(reinterpret_cast(argument))); + } + else if (hlModifier) { +
amd::Os::printf(hlFmt.data(), *argument); + } + else { + amd::Os::printf(fmt.data(), *argument); + } + } + break; + case 8: + if (printFloat) { + if (hlModifier) { + amd::Os::printf(hlFmt.data(), *(reinterpret_cast(argument))); + } + else { + amd::Os::printf(fmt.data(), *(reinterpret_cast(argument))); + } + } + else { + std::string out = fmt; + // Insert one extra 'l' length modifier in front of the conversion specifier for 64 bit printf + out.insert((out.size() - 1), 1, 'l'); + amd::Os::printf(out.data(), *(reinterpret_cast(argument))); + } + break; + case ConstStr: { + const char* str = reinterpret_cast(argument); + amd::Os::printf(fmt.data(), str); + } + break; + default: + amd::Os::printf("Error: Unsupported data size for PrintfDbg. %d bytes", + static_cast(size)); + return 0; + } + } + fflush(stdout); + return copiedBytes; +} + +// Prints one printf record: splits the format string at every conversion specifier and +// prints each piece with the matching argument read from workitemData starting at index i +void +PrintfDbg::outputDbgBuffer(const PrintfInfo& info, const uint32_t* workitemData, size_t& i) const +{ + static const char* specifiers = "cdieEfgGaosuxXp"; + static const char* modifiers = "hl"; + static const char* special = "%n"; + static const std::string sepStr = "%s"; + const uint32_t* s = workitemData; + size_t pos = 0; + + // Find the format string + std::string str = info.fmtString_; + std::string fmt; + size_t posStart, posEnd; + + // Print all arguments + // Note: the following code walks through all arguments, provided by the kernel and + // finds the corresponding specifier in the format string. + // Then it splits the original string into substrings with a single specifier and + // uses standard PrintfDbg() to print each argument + for (uint j = 0; j < info.arguments_.size(); ++j) { + do { + posStart = str.find_first_of("%", pos); + if (posStart != std::string::npos) { + posStart++; + // Erase all spaces after % + while (str[posStart] == ' ') { + str.erase(posStart, 1); + } + size_t tmp = str.find_first_of(special, posStart); + size_t tmp2 = str.find_first_of(specifiers, posStart); + // Special cases.
Special symbol is located before any specifier + if (tmp < tmp2) { + posEnd = posStart + 1; + fmt = str.substr(pos, posEnd - pos); + fmt.erase(posStart - pos - 1, 1); + pos = posStart = posEnd; + outputArgument(sepStr, false, ConstStr, + reinterpret_cast(fmt.data())); + continue; + } + break; + } + else if (pos < str.length()) { + // NOTE(review): pos is not advanced after printing this tail, so the same text looks + // like it is printed again on the next argument and by the final substr(pos) output + // after the argument loop - confirm + outputArgument(sepStr, false, ConstStr,reinterpret_cast((str.substr(pos)).data())); + } + } + while (posStart != std::string::npos); + + if (posStart != std::string::npos) { + bool printFloat = false; + int vectorSize = 0; + size_t length; + size_t idPos = 0; + + // Search for PrintfDbg specifier in the format string. + // It will be a split point for the output + posEnd = str.find_first_of(specifiers, posStart); + if (posEnd == std::string::npos) { + pos = posStart = posEnd; + break; + } + posEnd++; + + size_t curPos = posEnd; + vectorSize = checkVectorSpecifier(str, posStart, curPos); + + // Get substring from the last position to the current specifier + fmt = str.substr(pos, posEnd - pos); + + // Readjust the string pointer if PrintfDbg outputs a vector + if (vectorSize != 0) { + size_t posVecSpec = fmt.length()-(curPos + 1); + size_t posVecMod = fmt.find_first_of(modifiers,posVecSpec + 1); + size_t posMod = str.find_first_of(modifiers,posStart); + if(posMod < posEnd){ + fmt = fmt.erase(posVecSpec, posVecMod - posVecSpec); + } + else{ + fmt = fmt.erase(posVecSpec, curPos); + } + idPos = posStart - pos - 1; + } + pos = posStart = posEnd; + + // Find out if the argument is a float + printFloat = checkFloat(fmt); + + // Is it a scalar value? + if (vectorSize == 0) { + length = outputArgument(fmt, printFloat, info.arguments_[j], &s[i]); + if (0 == length) { + return; + } + // Advance the DWORD index by the consumed size rounded up to whole DWORDs + i += amd::alignUp(length, sizeof(uint32_t)) / sizeof(uint32_t); + } + else { + // 3-component vector's size is defined as 4 * size of each scalar component + size_t elemSize = info.arguments_[j] / (vectorSize == 3 ?
4 : vectorSize); + size_t k = i * sizeof(uint32_t); + std::string elementStr = fmt.substr(idPos, fmt.size()); + + // Print first element with full string + if (0 == outputArgument(fmt, printFloat, elemSize, &s[i])) { + return; + } + + // Print other elements with separator if available + for (int e = 1; e < vectorSize; ++e) { + const char* t = reinterpret_cast(s); + // Output the vector separator + outputArgument(sepStr, false, ConstStr, + reinterpret_cast(Separator)); + + // Output the next element + outputArgument(elementStr, printFloat, elemSize, + reinterpret_cast(&t[k + e * elemSize])); + } + i += (amd::alignUp(info.arguments_[j], sizeof(uint32_t))) + / sizeof(uint32_t); + } + } + } + + if (pos != std::string::npos) { + fmt = str.substr(pos, str.size() - pos); + outputArgument(sepStr, false, ConstStr, + reinterpret_cast(fmt.data())); + } +} + +// Copies the initial count value into the first DWORD of each workitem slot in +// [idxStart, idxStart + number); returns false as soon as one copy fails +bool +PrintfDbg::clearWorkitems(VirtualGPU& gpu, size_t idxStart, size_t number) const +{ + // Go through all locations for every thread and copy 1 + // (size_t index: a narrower type could truncate idxStart or never reach the size_t bound) + for (size_t i = idxStart; i < idxStart + number; ++i) { + amd::Coord3D dst(i * wiDbgSize(), 0, 0); + amd::Coord3D size(sizeof(uint32_t), 0, 0); + + // Copy 1 into the corresponding location in the debug buffer + if (!initCntValue_.partialMemCopyTo( + gpu, amd::Coord3D(0, 0, 0), dst, size, *dbgBuffer_)) { + return false; + } + } + return true; +} + +uint32_t* +PrintfDbg::mapWorkitem(VirtualGPU& gpu, size_t idx, bool* realloc) +{ + uint32_t wiSize = 0; + amd::Coord3D src(idx * wiDbgSize(), 0, 0); + xferBufRead_ = &(dev().xferRead().acquire()); + + // Copy workitem size from the corresponding location in the debug buffer + if (!dbgBuffer_->partialMemCopyTo(gpu, + src, amd::Coord3D(0, 0, 0), amd::Coord3D(sizeof(uint32_t), 0, 0), + *xferBufRead_)) { + return nullptr; + } + + // Get memory pointer to the staged buffer + uint32_t* workitem = reinterpret_cast(xferBufRead_->map(&gpu)); + if (nullptr == workitem) { + return nullptr; + } + + // Copy size value + wiSize = *workitem; +
xferBufRead_->unmap(&gpu); + + // Check if the current workitem almost reached the size limit + if ((wiDbgSize() - static_cast(wiSize)) < 3) { + *realloc = true; + } + + // If the current workitem had any output then get the data + // NOTE(review): wiSize is scaled by sizeof(uint32_t) below, while wiDbgSize() is used as a + // byte offset per workitem elsewhere - confirm the units of this comparison + if ((wiSize > 1) && (wiSize <= wiDbgSize())) { + amd::Coord3D size(wiSize * sizeof(uint32_t), 0, 0); + + // Copy the current workitem output data to the staged buffer + if (!dbgBuffer_->partialMemCopyTo( + gpu, src, amd::Coord3D(0, 0, 0), size, *xferBufRead_) || + // Clear the write pointer back to index 1 for the current workitem + !clearWorkitems(gpu, idx, 1)) { + LogError("Reading the workitem data failed!"); + return nullptr; + } + + // Get a pointer to the workitem data + uint32_t* workitem = reinterpret_cast + (xferBufRead_->map(&gpu)); + + return workitem; + } + + return nullptr; +} + +// Unmaps the staged buffer (only if mapWorkitem() returned data) and always releases it +void +PrintfDbg::unmapWorkitem(VirtualGPU& gpu , const uint32_t* workitemData) const +{ + if (nullptr != workitemData) { + xferBufRead_->unmap(&gpu); + } + + dev().xferRead().release(gpu, *xferBufRead_); +} + +bool +PrintfDbgHSA::init( + VirtualGPU& gpu, + bool printfEnabled) +{ + // Set up debug output buffer (if printf active) + if (printfEnabled) { + if (!allocate()) { + return false; + } + + // The first two DWORDs in the printf buffer are as follows: + // First DWORD = Offset to where next information is to + // be written, initialized to 0 + // Second DWORD = Number of bytes available for printf data + // = buffer size – 2*sizeof(uint32_t) + const uint8_t initSize = 2*sizeof(uint32_t); + uint8_t sysMem[initSize]; + memset(sysMem, 0, initSize); + uint32_t dbgBufferSize = dbgBuffer_->size() - initSize; + memcpy(&sysMem[4], &dbgBufferSize, sizeof(dbgBufferSize)); + + // Copy offset and number of bytes available for printf data + // into the corresponding location in the debug buffer + dbgBuffer_->writeRawData(gpu, initSize, sysMem, true); + } + return true; +} + +bool +PrintfDbgHSA::output( + VirtualGPU& gpu, + bool printfEnabled, + const
std::vector& printfInfo) +{ + if (printfEnabled) { + uint32_t offsetSize = 0; + xferBufRead_ = &(dev().xferRead().acquire()); + + // Copy offset from the first DWORD in the debug buffer + if (!dbgBuffer_->partialMemCopyTo(gpu, + amd::Coord3D(0, 0, 0), amd::Coord3D(0, 0, 0), + amd::Coord3D(sizeof(uint32_t), 0, 0),*xferBufRead_)) { + // Hand the staging buffer back on every early exit + dev().xferRead().release(gpu, *xferBufRead_); + return false; + } + + // Get memory pointer to the staged buffer + uint32_t* dbgBufferPtr = reinterpret_cast(xferBufRead_->map(&gpu)); + if (nullptr == dbgBufferPtr) { + dev().xferRead().release(gpu, *xferBufRead_); + return false; + } + + offsetSize = *dbgBufferPtr; + xferBufRead_->unmap(&gpu); + + if (offsetSize == 0) { + LogError("\n The printf buffer is empty!"); + dev().xferRead().release(gpu, *xferBufRead_); + return false; + } + + size_t bufSize = dev().xferRead().bufSize(); + size_t copySize = offsetSize; + while (copySize != 0) { + // Copy the buffer data (i.e., the printfID followed by the + //argument data for each printf call in the kernel) to the staged buffer + if (!dbgBuffer_->partialMemCopyTo(gpu, + amd::Coord3D(2*sizeof(uint32_t) + offsetSize - copySize, 0, 0), + amd::Coord3D(0, 0, 0), + std::min(copySize, bufSize), *xferBufRead_)) { + dev().xferRead().release(gpu, *xferBufRead_); + return false; + } + + // Get a pointer to the buffer data + dbgBufferPtr = reinterpret_cast(xferBufRead_->map(&gpu)); + if (nullptr == dbgBufferPtr) { + dev().xferRead().release(gpu, *xferBufRead_); + return false; + } + + std::vector::const_iterator ita; + uint sb = 0; + uint sbt = 0; + + // parse the debug buffer + while (sbt < copySize) { + assert(((*dbgBufferPtr) < printfInfo.size()) && + "Cound't find the reported PrintfID!"); + const PrintfInfo& info = printfInfo[(*dbgBufferPtr)]; + sb += sizeof(uint32_t); + for (ita = info.arguments_.begin(); + ita != info.arguments_.end(); ++ita){ + sb += *ita; + } + + if (sbt + sb > bufSize) { + break; // Need new portion of data in staging buffer + } + + size_t idx = 1; + // There's something in the debug buffer + outputDbgBuffer(info, dbgBufferPtr, idx); + + sbt += sb; + dbgBufferPtr += sb/sizeof(uint32_t); + sb = 0; + } + + // Nothing was consumed: a single record is bigger than the staging buffer, so + // repeating the same copy could never make progress + if (sbt == 0) { + xferBufRead_->unmap(&gpu); + dev().xferRead().release(gpu, *xferBufRead_); + LogError("The printf record doesn't fit into the staging buffer"); + return false; + } + + copySize -= sbt; + xferBufRead_->unmap(&gpu); + } + +
dev().xferRead().release(gpu, *xferBufRead_); + } + + return true; +} + +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palprintf.hpp b/projects/clr/rocclr/runtime/device/pal/palprintf.hpp new file mode 100644 index 0000000000..1a71af0fa5 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palprintf.hpp @@ -0,0 +1,192 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. +// +#ifndef PALPRINTFDBG_HPP_ +#define PALPRINTFDBG_HPP_ + +#include "device/pal/palmemory.hpp" + +/*! \addtogroup GPU GPU Device Implementation + * @{ + */ +#ifndef isinf +#ifdef _MSC_VER +#define isinf(X) (!_finite(X) && !_isnan(X)) +#endif //_MSC_VER +#endif //isinf + +#ifndef isnan +#ifdef _MSC_VER +#define isnan(X) (_isnan(X)) +#endif //_MSC_VER +#endif //isnan + +#ifndef copysign +#ifdef _MSC_VER +#define copysign(X,Y) (_copysign(X,Y)) +#endif //_MSC_VER +#endif //copysign + +//! GPU Device Implementation +namespace pal { + +//! Printf info structure +struct PrintfInfo +{ + std::string fmtString_; //!< format string for printf + std::vector arguments_; //!< sizes of the arguments passed to the printf() call +}; + +class Kernel; +class VirtualGPU; +class Memory; + +class PrintfDbg : public amd::HeapObject +{ +public: + //! Debug buffer size per workitem + static const uint WorkitemDebugSize = 4096; + + //! Constructor + PrintfDbg( + Device& device, + FILE* file = NULL + ); + + //! Destructor + ~PrintfDbg(); + + //! Creates the PrintfDbg object + bool create(); + + //! Initializes the debug buffer before kernel's execution + bool init( + VirtualGPU& gpu, //!< Virtual GPU object + bool printfEnabled, //!< checks for printf + const amd::NDRange& size //!< Kernel's workload + ); + + //!
Prints the kernel's debug informaiton from the buffer + bool output( + VirtualGPU& gpu, //!< Virtual GPU object + bool printfEnabled, //!< checks for printf + const amd::NDRange& size, //!< Kernel's workload + const std::vector& printfInfo //!< printf info + ); + + //! Debug buffer size per workitem + size_t wiDbgSize() const { return wiDbgSize_; } + + //! Returns debug buffer object + Memory* dbgBuffer() const { return dbgBuffer_; } + +protected: + Memory* dbgBuffer_; //!< Buffer to hold debug output + FILE* dbgFile_; //!< Debug file + Device& gpuDevice_; //!< GPU device object + Memory* xferBufRead_; //!< Transfer buffer for the dump read + + //! Gets GPU device object + Device& dev() const { return gpuDevice_; } + + //! Allocates the debug buffer + bool allocate( + bool realloc = false //!< If TRUE then reallocate the debug memory + ); + + //! Returns TRUE if a float value has to be printed + bool checkFloat( + const std::string& fmt //!< Format string + ) const; + + //! Returns TRUE if a string value has to be printed + bool checkString( + const std::string& fmt //!< Format string + ) const; + + //! Returns the vector width if a vector specifier precedes curPos, 0 otherwise + int checkVectorSpecifier( + const std::string& fmt, //!< Format string + size_t startPos, //!< Start position for processing + size_t& curPos //!< End position for processing + ) const; + + //! Outputs an argument + size_t outputArgument( + const std::string& fmt, //!< Format string + bool printFloat, //!< Argument is a float value + size_t size, //!< Argument's size + const uint32_t* argument //!< Argument's location + ) const; + + //! Displays the PrintfDbg + void outputDbgBuffer( + const PrintfInfo& info, //!< printf info + const uint32_t* workitemData, //!< The PrintfDbg dump buffer + size_t& i //!< index to the data in the buffer + ) const; + +private: + //! Disable copy constructor + PrintfDbg(const PrintfDbg&); + + //! Disable assignment + PrintfDbg& operator=(const PrintfDbg&); + + //!
Returns the pointer to the workitem data block + bool clearWorkitems( + VirtualGPU& gpu, //!< Virtual GPU object + size_t idxStart, //!< Workitem global index start + size_t number //!< Number of workitems to clear + ) const; + + //! Returns the pointer to the workitem data block + uint32_t* mapWorkitem( + VirtualGPU& gpu, //!< Virtual GPU object + size_t idx, //!< Workitem global index + bool* realloc //!< Returns TRUE if workitem reached the buffer limit + ); + + //! Unmap the staged buffer + void unmapWorkitem( + VirtualGPU& gpu, //!< Virtual GPU object + const uint32_t* workitemData //!< The PrintfDbg dump buffer + ) const; + + size_t wiDbgSize_; //!< Workitem debug size + Memory initCntValue_; //!< Initialized count value +}; +//! PrintfDbg variant with its own init() and output() that take no NDRange +class PrintfDbgHSA : public PrintfDbg +{ +public: + + //! Constructor + PrintfDbgHSA( + Device& device, + FILE* file = NULL + ): PrintfDbg(device, file) { } + + //! Initializes the debug buffer before kernel's execution + bool init( + VirtualGPU& gpu, //!< Virtual GPU object + bool printfEnabled //!< checks for printf + ); + + //! Prints the kernel's debug information from the buffer + bool output( + VirtualGPU& gpu, //!< Virtual GPU object + bool printfEnabled, //!< checks for printf + const std::vector& printfInfo //!< printf info + ); + +private: + //! Disable copy constructor + PrintfDbgHSA(const PrintfDbgHSA&); + + //! Disable assignment + PrintfDbgHSA& operator=(const PrintfDbgHSA&); +}; + +/*@}*/} // namespace pal + +#endif /*PALPRINTFDBG_HPP_*/ diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.cpp b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp new file mode 100644 index 0000000000..2384396b0e --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palprogram.cpp @@ -0,0 +1,925 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
+// +#include "os/os.hpp" +#include "utils/flags.hpp" +#include "include/aclTypes.h" +#include "utils/amdilUtils.hpp" +#include "utils/bif_section_labels.hpp" +#include "device/pal/palprogram.hpp" +#include "device/pal/palblit.hpp" +#include "macrodata.h" +#include "MDParser/AMDILMDInterface.h" +#include +#include +#include +#include +#include "utils/options.hpp" +#include "hsa.h" +#include "hsa_ext_image.h" +#include "amd_hsa_loader.hpp" + +namespace pal { + +// Constructs the program for a Device (isNull_ = false), fills in the binary options +// and creates the loader +HSAILProgram::HSAILProgram(Device& device) + : Program(device) + , llvmBinary_() + , binaryElf_(nullptr) + , rawBinary_(nullptr) + , kernels_(nullptr) + , maxScratchRegs_(0) + , isNull_(false) + , executable_(nullptr) + , loaderContext_(this) +{ + memset(&binOpts_, 0, sizeof(binOpts_)); + binOpts_.struct_size = sizeof(binOpts_); + binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64); + binOpts_.bitness = ELFDATA2LSB; + binOpts_.alloc = &::malloc; + binOpts_.dealloc = &::free; + loader_ = amd::hsa::loader::Loader::Create(&loaderContext_); +} + +// Constructs the program for a NullDevice (isNull_ = true); otherwise identical to the +// constructor above +HSAILProgram::HSAILProgram(NullDevice& device) + : Program(device) + , llvmBinary_() + , binaryElf_(nullptr) + , rawBinary_(nullptr) + , kernels_(nullptr) + , maxScratchRegs_(0) + , isNull_(true) + , executable_(nullptr) + , loaderContext_(this) +{ + memset(&binOpts_, 0, sizeof(binOpts_)); + binOpts_.struct_size = sizeof(binOpts_); + binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64); + binOpts_.bitness = ELFDATA2LSB; + binOpts_.alloc = &::malloc; + binOpts_.dealloc = &::free; + loader_ = amd::hsa::loader::Loader::Create(&loaderContext_); +} + +// Releases the samplers, the raw and ELF binaries, the executable, the kernels and the loader +HSAILProgram::~HSAILProgram() +{ + // Destroy internal static samplers + for (auto& it : staticSamplers_) { + delete it; + } + if (rawBinary_ != nullptr) { + free(rawBinary_); + } + acl_error error; + // Free the elf binary + if (binaryElf_ != nullptr) { + error = aclBinaryFini(binaryElf_); + if (error != ACL_SUCCESS) { + LogWarning( "Error while destroying the acl binary \n" ); + } + } + releaseClBinary(); + if
(executable_ != nullptr) { + loader_->DestroyExecutable(executable_); + } + delete kernels_; + amd::hsa::loader::Loader::Destroy(loader_); +} + +// Prepares a build: records the per-build target info and sets up the output ELF +bool +HSAILProgram::initBuild(amd::option::Options *options) +{ + if (!device::Program::initBuild(options)) { + return false; + } + + // Fall back to the generic "gpu" target when the device reports no machine target name + const char* devName = dev().hwInfo()->machineTarget_; + options->setPerBuildInfo( + (devName && (devName[0] != '\0')) ? devName : "gpu", + clBinary()->getEncryptCode(), true); + + // Elf Binary setup + std::string outFileName; + + // true means fsail required + clBinary()->init(options, true); + if (options->isDumpFlagSet(amd::option::DUMP_BIF)) { + outFileName = options->getDumpFileName(".bin"); + } + + if (!clBinary()->setElfOut(LP64_SWITCH(ELFCLASS32, ELFCLASS64), + (outFileName.size() > 0) ? outFileName.c_str() : nullptr)) { + LogError("Setup elf out for gpu failed"); + return false; + } + return true; +} + +// Finishes a build: resets the ELF in/out state and drops the binary if the build failed +bool +HSAILProgram::finiBuild(bool isBuildGood) +{ + clBinary()->resetElfOut(); + clBinary()->resetElfIn(); + + if (!isBuildGood) { + // Prevent the encrypted binary from leaking out + clBinary()->setBinary(nullptr, 0); + } + + return device::Program::finiBuild(isBuildGood); +} + +bool +HSAILProgram::linkImpl( + const std::vector &inputPrograms, + amd::option::Options *options, + bool createLibrary) +{ + std::vector::const_iterator it + = inputPrograms.begin(); + std::vector::const_iterator itEnd + = inputPrograms.end(); + acl_error errorCode; + + // For each program we need to extract the LLVMIR and create + // aclBinary for each + std::vector binaries_to_link; + + for (size_t i = 0; it != itEnd; ++it, ++i) { + HSAILProgram *program = (HSAILProgram *)*it; + // Check if the program was created with clCreateProgramWithBinary + binary_t binary = program->binary(); + if ((binary.first != nullptr) && (binary.second > 0)) { + // Binary already exists -- we can also check if there is no + // opencl source code + // Need to check if LLVMIR exists in the binary + // If LLVMIR does not exist
then is it valid + // We need to pull out all the compiled kernels + // We cannot do this at present because we need at least + // Hsail text to pull the kernels oout + void *mem = const_cast(binary.first); + binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode); + if (errorCode != ACL_SUCCESS) { + LogWarning("Error while linking : Could not read from raw binary"); + return false; + } + } + // At this stage each HSAILProgram contains a valid binary_elf + // Check if LLVMIR is in the binary + // @TODO - Memory leak , cannot free this buffer + // need to fix this.. File EPR on compiler library + size_t llvmirSize = 0; + const void *llvmirText = aclExtractSection(dev().compiler(), + binaryElf_, &llvmirSize, aclLLVMIR, &errorCode); + if (errorCode != ACL_SUCCESS) { + bool spirv = false; + size_t boolSize = sizeof(bool); + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, + RT_CONTAINS_SPIRV, nullptr, &spirv, &boolSize); + if (errorCode != ACL_SUCCESS) { + spirv = false; + } + if (spirv) { + errorCode = aclCompile(dev().compiler(), binaryElf_, + options->origOptionStr.c_str(), ACL_TYPE_SPIRV_BINARY, + ACL_TYPE_LLVMIR_BINARY, nullptr); + buildLog_ += aclGetCompilerLog(dev().compiler()); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error while linking: Could not load SPIR-V" ; + return false; + } + } else { + buildLog_ +="Error while linking : \ + Invalid binary (Missing LLVMIR section)" ; + return false; + } + } + // Create a new aclBinary for each LLVMIR and save it in a list + aclBIFVersion ver = aclBinaryVersion(binaryElf_); + aclBinary *bin = aclCreateFromBinary(binaryElf_, ver); + binaries_to_link.push_back(bin); + } + + errorCode = aclLink(dev().compiler(), + binaries_to_link[0], binaries_to_link.size() - 1, + binaries_to_link.size() > 1 ? 
&binaries_to_link[1] : NULL, + ACL_TYPE_LLVMIR_BINARY, "-create-library", NULL); + if (errorCode != ACL_SUCCESS) { + buildLog_ += aclGetCompilerLog(dev().compiler()); + buildLog_ +="Error while linking : aclLink failed" ; + return false; + } + // Store the newly linked aclBinary for this program. + binaryElf_ = binaries_to_link[0]; + // Free all the other aclBinaries + for (size_t i = 1; i < binaries_to_link.size(); i++) { + aclBinaryFini(binaries_to_link[i]); + } + if (createLibrary) { + size_t size = 0; + void *mem = NULL; + aclWriteToMem(binaryElf_, &mem, &size); + setBinary(static_cast(mem), size); + buildLog_ += aclGetCompilerLog(dev().compiler()); + setType(TYPE_LIBRARY); + return true; + } + // Now call linkImpl with the new options + return linkImpl(options); +} + +aclType +HSAILProgram::getCompilationStagesFromBinary(std::vector& completeStages, bool& needOptionsCheck) +{ + acl_error errorCode; + size_t secSize = 0; + completeStages.clear(); + aclType from = ACL_TYPE_DEFAULT; + needOptionsCheck = true; + size_t boolSize = sizeof(bool); + //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT? 
+ // Checking llvmir in .llvmir section + bool containsSpirv = true; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, + RT_CONTAINS_SPIRV, nullptr, &containsSpirv, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsSpirv = false; + } + if (containsSpirv) { + completeStages.push_back(from); + from = ACL_TYPE_SPIRV_BINARY; + } + bool containsSpirText = true; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_SPIR, nullptr, &containsSpirText, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsSpirText = false; + } + if (containsSpirText) { + completeStages.push_back(from); + from = ACL_TYPE_SPIR_BINARY; + } + bool containsLlvmirText = true; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_LLVMIR, nullptr, &containsLlvmirText, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsLlvmirText = false; + } + // Checking compile & link options in .comment section + bool containsOpts = true; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_OPTIONS, nullptr, &containsOpts, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsOpts = false; + } + if (containsLlvmirText && containsOpts) { + completeStages.push_back(from); + from = ACL_TYPE_LLVMIR_BINARY; + } + // Checking HSAIL in .cg section + bool containsHsailText = true; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_HSAIL, nullptr, &containsHsailText, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsHsailText = false; + } + // Checking BRIG sections + bool containsBrig = true; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_BRIG, nullptr, &containsBrig, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsBrig = false; + } + if (containsBrig) { + completeStages.push_back(from); + from = ACL_TYPE_HSAIL_BINARY; + } else if (containsHsailText) { + completeStages.push_back(from); + from = ACL_TYPE_HSAIL_TEXT; + } + // Checking Loader Map symbol from CG section + bool containsLoaderMap = true; + 
errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_LOADER_MAP, nullptr, &containsLoaderMap, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsLoaderMap = false; + } + if (containsLoaderMap) { + completeStages.push_back(from); + from = ACL_TYPE_CG; + } + // Checking ISA in .text section + bool containsShaderIsa = true; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_ISA, nullptr, &containsShaderIsa, &boolSize); + if (errorCode != ACL_SUCCESS) { + containsShaderIsa = false; + } + if (containsShaderIsa) { + completeStages.push_back(from); + from = ACL_TYPE_ISA; + } + std::string sCurOptions = compileOptions_ + linkOptions_; + amd::option::Options curOptions; + if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { + buildLog_ += curOptions.optionsLog(); + LogError("Parsing compile options failed."); + return ACL_TYPE_DEFAULT; + } + switch (from) { + // compile from HSAIL text, no matter prev. stages and options + case ACL_TYPE_HSAIL_TEXT: + needOptionsCheck = false; + break; + case ACL_TYPE_HSAIL_BINARY: + // do not check options, if LLVMIR is absent or might be absent or options are absent + if (!curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { + needOptionsCheck = false; + } + break; + case ACL_TYPE_CG: + case ACL_TYPE_ISA: + // do not check options, if LLVMIR is absent or might be absent or options are absent + if (!curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { + needOptionsCheck = false; + } + // do not check options, if BRIG is absent or might be absent or LoaderMap is absent + if (!curOptions.oVariables->BinCG || !containsBrig || !containsLoaderMap) { + needOptionsCheck = false; + } + break; + // recompilation might be needed + case ACL_TYPE_LLVMIR_BINARY: + case ACL_TYPE_DEFAULT: + default: + break; + } + return from; +} + +aclType +HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) { + aclType continueCompileFrom = 
ACL_TYPE_DEFAULT; + binary_t binary = this->binary(); + // If the binary already exists + if ((binary.first != nullptr) && (binary.second > 0)) { + void *mem = const_cast(binary.first); + acl_error errorCode; + binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: Reading the binary from memory failed.\n"; + return continueCompileFrom; + } + // Calculate the next stage to compile from, based on sections in binaryElf_; + // No any validity checks here + std::vector completeStages; + bool needOptionsCheck = true; + continueCompileFrom = getCompilationStagesFromBinary(completeStages, needOptionsCheck); + // Saving binary in the interface class, + // which also load compile & link options from binary + setBinary(static_cast(mem), binary.second); + if (!options || !needOptionsCheck) { + return continueCompileFrom; + } + bool recompile = false; + //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT? + switch (continueCompileFrom) { + case ACL_TYPE_HSAIL_BINARY: + case ACL_TYPE_CG: + case ACL_TYPE_ISA: { + // Compare options loaded from binary with current ones, recompile if differ; + // If compile options are absent in binary, do not compare and recompile + if (compileOptions_.empty()) + break; + const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions); + assert(symbol && "symbol not found"); + std::string symName = std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]); + size_t symSize = 0; + const void *opts = aclExtractSymbol(dev().compiler(), + binaryElf_, &symSize, aclCOMMENT, symName.c_str(), &errorCode); + if (errorCode != ACL_SUCCESS) { + recompile = true; + break; + } + std::string sBinOptions = std::string((char*)opts, symSize); + std::string sCurOptions = compileOptions_ + linkOptions_; + amd::option::Options curOptions, binOptions; + if (!amd::option::parseAllOptions(sBinOptions, binOptions)) { + buildLog_ += binOptions.optionsLog(); 
+ LogError("Parsing compile options from binary failed."); + return ACL_TYPE_DEFAULT; + } + if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { + buildLog_ += curOptions.optionsLog(); + LogError("Parsing compile options failed."); + return ACL_TYPE_DEFAULT; + } + if (!curOptions.equals(binOptions)) { + recompile = true; + } + break; + } + default: + break; + } + if (recompile) { + while (!completeStages.empty()) { + continueCompileFrom = completeStages.back(); + if (continueCompileFrom == ACL_TYPE_SPIRV_BINARY || + continueCompileFrom == ACL_TYPE_LLVMIR_BINARY || + continueCompileFrom == ACL_TYPE_SPIR_BINARY || + continueCompileFrom == ACL_TYPE_DEFAULT) { + break; + } + completeStages.pop_back(); + } + } + } + return continueCompileFrom; +} + +inline static std::vector +splitSpaceSeparatedString(char *str) +{ + std::string s(str); + std::stringstream ss(s); + std::istream_iterator beg(ss), end; + std::vector vec(beg, end); + return vec; +} + +bool +HSAILProgram::linkImpl(amd::option::Options* options) +{ + acl_error errorCode; + aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY; + bool finalize = true; + bool hsaLoad = true; + // If !binaryElf_ then program must have been created using clCreateProgramWithBinary + if (!binaryElf_) { + continueCompileFrom = getNextCompilationStageFromBinary(options); + } + switch (continueCompileFrom) { + case ACL_TYPE_SPIRV_BINARY: + case ACL_TYPE_SPIR_BINARY: + // Compilation from ACL_TYPE_LLVMIR_BINARY to ACL_TYPE_CG in cases: + // 1. if the program is not created with binary; + // 2. if the program is created with binary and contains only .llvmir & .comment + // 3. if the program is created with binary, contains .llvmir, .comment, brig sections, + // but the binary's compile & link options differ from current ones (recompilation); + case ACL_TYPE_LLVMIR_BINARY: + // Compilation from ACL_TYPE_HSAIL_BINARY to ACL_TYPE_CG in cases: + // 1. 
if the program is created with binary and contains only brig sections + case ACL_TYPE_HSAIL_BINARY: + // Compilation from ACL_TYPE_HSAIL_TEXT to ACL_TYPE_CG in cases: + // 1. if the program is created with binary and contains only hsail text + case ACL_TYPE_HSAIL_TEXT: { + std::string curOptions = options->origOptionStr + hsailOptions(); + errorCode = aclCompile(dev().compiler(), binaryElf_, + curOptions.c_str(), continueCompileFrom, ACL_TYPE_CG, nullptr); + buildLog_ += aclGetCompilerLog(dev().compiler()); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: BRIG code generation failed.\n"; + return false; + } + break; + } + case ACL_TYPE_CG: + break; + case ACL_TYPE_ISA: + finalize = false; + break; + default: + buildLog_ += "Error: The binary is incorrect or incomplete. Finalization to ISA couldn't be performed.\n"; + return false; + } + if (finalize) { + std::string fin_options(options->origOptionStr + hsailOptions()); + // Append an option so that we can selectively enable a SCOption on CZ + // whenever IOMMUv2 is enabled. 
+ if (dev().settings().svmFineGrainSystem_) { + fin_options.append(" -sc-xnack-iommu"); + } + errorCode = aclCompile(dev().compiler(), binaryElf_, + fin_options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, nullptr); + buildLog_ += aclGetCompilerLog(dev().compiler()); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: BRIG finalization to ISA failed.\n"; + return false; + } + } + // ACL_TYPE_CG stage is not performed for offline compilation + hsa_agent_t agent; + agent.handle = 1; + if (!isNull() && hsaLoad) { + executable_ = loader_->CreateExecutable(HSA_PROFILE_BASE, nullptr); + if (executable_ == nullptr) { + buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n"; + return false; + } + size_t size = 0; + hsa_code_object_t code_object; + code_object.handle = reinterpret_cast(aclExtractSection(dev().compiler(), binaryElf_, &size, aclTEXT, &errorCode)); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: Extracting AMD HSA Code Object from binary failed.\n"; + return false; + } + hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error: AMD HSA Code Object loading failed.\n"; + return false; + } + } + size_t kernelNamesSize = 0; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, nullptr, &kernelNamesSize); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: Querying of kernel names size from the binary failed.\n"; + return false; + } + if (!isNull() && kernelNamesSize > 0) { + char* kernelNames = new char[kernelNamesSize]; + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, kernelNames, &kernelNamesSize); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: Querying of kernel names from the binary failed.\n"; + delete kernelNames; + return false; + } + std::vector vKernels = splitSpaceSeparatedString(kernelNames); + delete kernelNames; + std::vector::iterator it = vKernels.begin(); + bool 
dynamicParallelism = false; + aclMetadata md; + md.numHiddenKernelArgs = 0; + size_t sizeOfnumHiddenKernelArgs = sizeof(md.numHiddenKernelArgs); + for (it; it != vKernels.end(); ++it) { + std::string kernelName(*it); + std::string openclKernelName = device::Kernel::openclMangledName(kernelName); + errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_NUM_KERNEL_HIDDEN_ARGS, + openclKernelName.c_str(), &md.numHiddenKernelArgs, &sizeOfnumHiddenKernelArgs); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error: Querying of kernel '" + openclKernelName + + "' extra arguments count from AMD HSA Code Object failed. Kernel initialization failed.\n"; + return false; + } + HSAILKernel *aKernel = new HSAILKernel(kernelName, this, options->origOptionStr + hsailOptions(), + md.numHiddenKernelArgs); + kernels()[kernelName] = aKernel; + amd::hsa::loader::Symbol *sym = executable_->GetSymbol("", openclKernelName.c_str(), agent, 0); + if (!sym) { + buildLog_ += "Error: Getting kernel ISA code symbol '" + openclKernelName + + "' from AMD HSA Code Object failed. Kernel initialization failed.\n"; + return false; + } + if (!aKernel->init(sym, false)) { + buildLog_ += "Error: Kernel '" + openclKernelName + "' initialization failed.\n"; + return false; + } + buildLog_ += aKernel->buildLog(); + aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); + dynamicParallelism |= aKernel->dynamicParallelism(); + // Find max scratch regs used in the program. 
It's used for scratch buffer preallocation + // with dynamic parallelism, since runtime doesn't know which child kernel will be called + maxScratchRegs_ = std::max(static_cast(aKernel->workGroupInfo()->scratchRegs_), maxScratchRegs_); + } + // Allocate kernel table for device enqueuing + if (!isNull() && dynamicParallelism && !allocKernelTable()) { + return false; + } + } + // Save the binary in the interface class + size_t size = 0; + void *mem = nullptr; + aclWriteToMem(binaryElf_, &mem, &size); + setBinary(static_cast(mem), size); + buildLog_ += aclGetCompilerLog(dev().compiler()); + setType(TYPE_EXECUTABLE); + return true; +} + +bool +HSAILProgram::createBinary(amd::option::Options *options) +{ + return true; +} + +bool +HSAILProgram::initClBinary() +{ + if (clBinary_ == nullptr) { + clBinary_ = new ClBinaryHsa(static_cast(device())); + if (clBinary_ == nullptr) { + return false; + } + } + return true; +} + +void +HSAILProgram::releaseClBinary() +{ + if (clBinary_ != nullptr) { + delete clBinary_; + clBinary_ = nullptr; + } +} + +std::string +HSAILProgram::hsailOptions() +{ + std::string hsailOptions; + // Set options for the standard device specific options + // All our devices support these options now + if (dev().settings().reportFMAF_) { + hsailOptions.append(" -DFP_FAST_FMAF=1"); + } + if (dev().settings().reportFMA_) { + hsailOptions.append(" -DFP_FAST_FMA=1"); + } + if (!dev().settings().singleFpDenorm_) { + hsailOptions.append(" -cl-denorms-are-zero"); + } + + // Check if the host is 64 bit or 32 bit + LP64_ONLY(hsailOptions.append(" -m64")); + + // Append each extension supported by the device + std::string token; + std::istringstream iss(""); + iss.str(device().info().extensions_); + while (getline(iss, token, ' ')) { + if (!token.empty()) { + hsailOptions.append(" -D"); + hsailOptions.append(token); + hsailOptions.append("=1"); + } + } + return hsailOptions; +} + +bool +HSAILProgram::allocKernelTable() +{ + uint size = kernels().size() * 
sizeof(size_t); + + kernels_ = new pal::Memory(dev(), size); + // Initialize kernel table + if ((kernels_ == nullptr) || !kernels_->create(Resource::RemoteUSWC)) { + delete kernels_; + return false; + } + else { + size_t* table = reinterpret_cast( + kernels_->map(nullptr, pal::Resource::WriteOnly)); + for (auto& it : kernels()) { + HSAILKernel* kernel = static_cast(it.second); + table[kernel->index()] = static_cast( + kernel->gpuAqlCode()->vmAddress()); + } + kernels_->unmap(nullptr); + } + return true; +} + +void +HSAILProgram::fillResListWithKernels( + std::vector& memList) const +{ + for (auto& it : kernels()) { + memList.push_back( + static_cast(it.second)->gpuAqlCode()); + } +} + +const aclTargetInfo & +HSAILProgram::info(const char * str) { + acl_error err; + std::string arch = "hsail"; + if (dev().settings().use64BitPtr_) { + arch = "hsail64"; + } + info_ = aclGetTargetInfo(arch.c_str(), ( str && str[0] == '\0' ? + dev().hwInfo()->targetName_ : str ), &err); + if (err != ACL_SUCCESS) { + LogWarning("aclGetTargetInfo failed"); + } + return info_; +} + +hsa_isa_t ORCAHSALoaderContext::IsaFromName(const char *name) { + hsa_isa_t isa = {0}; + if (!strcmp(Gfx700, name)) { isa.handle = gfx700; return isa; } + if (!strcmp(Gfx701, name)) { isa.handle = gfx701; return isa; } + if (!strcmp(Gfx800, name)) { isa.handle = gfx800; return isa; } + if (!strcmp(Gfx801, name)) { isa.handle = gfx801; return isa; } + if (!strcmp(Gfx804, name)) { isa.handle = gfx804; return isa; } + if (!strcmp(Gfx810, name)) { isa.handle = gfx810; return isa; } + if (!strcmp(Gfx900, name)) { isa.handle = gfx900; return isa; } + if (!strcmp(Gfx901, name)) { isa.handle = gfx901; return isa; } + return isa; +} + +bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) { + switch (program_->dev().hwInfo()->gfxipVersion_) { + default: + LogError("Unsupported gfxip version"); + return false; + case gfx700: + case gfx701: + case gfx702: + // gfx701 only differs from gfx700 by 
faster fp operations and can be loaded on either device. + return isa.handle == gfx700 || isa.handle == gfx701; + case gfx800: + switch (program_->dev().properties().revision) { + case Pal::AsicRevision::Iceland: + case Pal::AsicRevision::Tonga: + return isa.handle == gfx800; + case Pal::AsicRevision::Carrizo: + return isa.handle == gfx801; + case Pal::AsicRevision::Fiji: + case Pal::AsicRevision::Ellesmere: + case Pal::AsicRevision::Baffin: + // gfx800 ISA has only sgrps limited and can be loaded. + // gfx801 ISA has XNACK limitations and can be loaded. + return isa.handle == gfx800 || isa.handle == gfx801 || isa.handle == gfx804; + case Pal::AsicRevision::Stoney: + return isa.handle == gfx810; + default: + assert(0); + return false; + } + case gfx900: + switch (program_->dev().properties().revision) { + case 0: +/* case Pal::AsicRevision::Greenland: + return isa.handle == gfx900 || isa.handle == gfx901;*/ + default: + assert(0); + return false; + } + } +} + +void* ORCAHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment, + hsa_agent_t agent, size_t size, size_t align, bool zero) { + assert(size); + assert(align); + switch (segment) { + case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: + case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: + case AMDGPU_HSA_SEGMENT_READONLY_AGENT: + return AgentGlobalAlloc(agent, size, align, zero); + case AMDGPU_HSA_SEGMENT_CODE_AGENT: + return KernelCodeAlloc(agent, size, align, zero); + default: + assert(false); return 0; + } +} + +bool ORCAHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, + hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) { + switch (segment) { + case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: + case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: + case AMDGPU_HSA_SEGMENT_READONLY_AGENT: + return AgentGlobalCopy(dst, offset, src, size); + case AMDGPU_HSA_SEGMENT_CODE_AGENT: + return KernelCodeCopy(dst, offset, src, size); + default: + assert(false); return false; + } +} + +void 
ORCAHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment, + hsa_agent_t agent, void* seg, size_t size) { + switch (segment) { + case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: + case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: + case AMDGPU_HSA_SEGMENT_READONLY_AGENT: AgentGlobalFree(seg, size); break; + case AMDGPU_HSA_SEGMENT_CODE_AGENT: KernelCodeFree(seg, size); break; + default: + assert(false); return; + } +} + +void* ORCAHSALoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment, + hsa_agent_t agent, void* seg, size_t offset) { + assert(seg); + switch (segment) { + case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: + case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: + case AMDGPU_HSA_SEGMENT_READONLY_AGENT: { + pal::Memory *gpuMem = reinterpret_cast(seg); + return reinterpret_cast(gpuMem->vmAddress() + offset); + } + case AMDGPU_HSA_SEGMENT_CODE_AGENT: return (char*) seg + offset; + default: + assert(false); return nullptr; + } +} + +hsa_status_t ORCAHSALoaderContext::SamplerCreate( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler_handle) { + if (!agent.handle) { + return HSA_STATUS_ERROR_INVALID_AGENT; + } + if (!sampler_descriptor || !sampler_handle) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + uint32_t state = 0; + switch (sampler_descriptor->coordinate_mode) { + case HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED: state = amd::Sampler::StateNormalizedCoordsFalse; break; + case HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED: state = amd::Sampler::StateNormalizedCoordsTrue; break; + default: + assert(false); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + switch (sampler_descriptor->filter_mode) { + case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST: state |= amd::Sampler::StateFilterNearest; break; + case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR: state |= amd::Sampler::StateFilterLinear; break; + default: + assert(false); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + } + switch (sampler_descriptor->address_mode) { + case 
HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: state |= amd::Sampler::StateAddressClampToEdge; break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER: state |= amd::Sampler::StateAddressClamp; break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT: state |= amd::Sampler::StateAddressRepeat; break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: state |= amd::Sampler::StateAddressMirroredRepeat; break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED: state |= amd::Sampler::StateAddressNone; break; + default: + assert(false); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + assert(!program_->dev().settings().hsailDirectSRD_); + pal::Sampler* sampler = new pal::Sampler(program_->dev()); + if (!sampler || !sampler->create(state)) { + delete sampler; + return HSA_STATUS_ERROR; + } + program_->addSampler(sampler); + sampler_handle->handle = sampler->hwSrd(); + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ORCAHSALoaderContext::SamplerDestroy( + hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) { + if (!agent.handle) { + return HSA_STATUS_ERROR_INVALID_AGENT; + } + if (!sampler_handle.handle) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + return HSA_STATUS_SUCCESS; +} + +void* ORCAHSALoaderContext::CpuMemAlloc(size_t size, size_t align, bool zero) { + assert(size); + assert(align); + assert(sizeof(void*) == 8 || sizeof(void*) == 4); + void* ptr = amd::Os::alignedMalloc(size, align); + if (zero) { + memset(ptr, 0, size); + } + return ptr; +} + +bool ORCAHSALoaderContext::CpuMemCopy(void *dst, size_t offset, const void* src, size_t size) { + if (!dst || !src || dst == src) { + return false; + } + if (0 == size) { + return true; + } + amd::Os::fastMemcpy((char*)dst + offset, src, size); + return true; +} + +void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) { + assert(size); + assert(align); + assert(sizeof(void*) == 8 || sizeof(void*) == 4); + pal::Memory* mem = new pal::Memory(program_->dev(), amd::alignUp(size, align)); + 
if (!mem || !mem->create(pal::Resource::Local)) { + delete mem; + return nullptr; + } + assert(program_->dev().xferQueue()); + if (zero) { + char pattern = 0; + program_->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern), amd::Coord3D(0), amd::Coord3D(size)); + } + program_->addGlobalStore(mem); + program_->setGlobalVariableTotalSize(program_->globalVariableTotalSize() + size); + return mem; +} + +bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, size_t size) { + if (!dst || !src || dst == src) { + return false; + } + if (0 == size) { + return true; + } + assert(program_->dev().xferQueue()); + pal::Memory* mem = reinterpret_cast(dst); + return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), true); + return true; +} + +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palprogram.hpp b/projects/clr/rocclr/runtime/device/pal/palprogram.hpp new file mode 100644 index 0000000000..e4f72d7bf3 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palprogram.hpp @@ -0,0 +1,292 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. +// +#ifndef PALPROGRAM_HPP_ +#define PALPROGRAM_HPP_ + +#include "device/pal/palkernel.hpp" +#include "device/pal/palbinary.hpp" +#include "amd_hsa_loader.hpp" + +namespace amd { +namespace option { +class Options; +} // option +namespace hsa { +namespace loader { +class Loader; +class Executable; +class Context; +} // loader +} // hsa +} // amd + +//! \namespace pal PAL Device Implementation +namespace pal { + +/*! 
\addtogroup pal PAL Device Implementation + * @{ + */ + +using namespace amd::hsa::loader; +class HSAILProgram; +class ClBinaryHsa; + +class ORCAHSALoaderContext final: public Context { +public: + ORCAHSALoaderContext(HSAILProgram* program): program_(program) {} + + virtual ~ORCAHSALoaderContext() {} + + hsa_isa_t IsaFromName(const char *name) override; + + bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) override; + + void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, + hsa_agent_t agent, size_t size, size_t align, bool zero) override; + + bool SegmentCopy(amdgpu_hsa_elf_segment_t segment, + hsa_agent_t agent, void* dst, size_t offset, + const void* src, size_t size) override; + + void SegmentFree(amdgpu_hsa_elf_segment_t segment, + hsa_agent_t agent, void* seg, size_t size = 0) override; + + void* SegmentAddress(amdgpu_hsa_elf_segment_t segment, + hsa_agent_t agent, void* seg, size_t offset) override; + + bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment, + hsa_agent_t agent, void* seg, size_t size) override { return false; } + + bool ImageExtensionSupported() override { return false; } + + hsa_status_t ImageCreate( + hsa_agent_t agent, + hsa_access_permission_t image_permission, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_ext_image_t *image_handle) override { + // not supported + assert(false); + return HSA_STATUS_ERROR; + } + + hsa_status_t ImageDestroy( + hsa_agent_t agent, hsa_ext_image_t image_handle) override { + // not supported + assert(false); + return HSA_STATUS_ERROR; + } + + hsa_status_t SamplerCreate( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler_handle) override; + + //! All samplers are owned by HSAILProgram and are deleted in its destructor. 
+ hsa_status_t SamplerDestroy( + hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) override; + +private: + + void* AgentGlobalAlloc( + hsa_agent_t agent, size_t size, size_t align, bool zero) { + return GpuMemAlloc(size, align, zero); + } + + bool AgentGlobalCopy(void *dst, size_t offset, const void *src, size_t size) { + return GpuMemCopy(dst, offset, src, size); + } + + void AgentGlobalFree(void *ptr, size_t size) { + GpuMemFree(ptr, size); + } + + void* KernelCodeAlloc( + hsa_agent_t agent, size_t size, size_t align, bool zero) { + return CpuMemAlloc(size, align, zero); + } + + bool KernelCodeCopy(void *dst, size_t offset, const void *src, size_t size) { + return CpuMemCopy(dst, offset, src, size); + } + + void KernelCodeFree(void *ptr, size_t size) { + CpuMemFree(ptr, size); + } + + void* CpuMemAlloc(size_t size, size_t align, bool zero); + + bool CpuMemCopy(void *dst, size_t offset, const void* src, size_t size); + + void CpuMemFree(void *ptr, size_t size) { + amd::Os::alignedFree(ptr); + } + + void* GpuMemAlloc(size_t size, size_t align, bool zero); + + bool GpuMemCopy(void *dst, size_t offset, const void *src, size_t size); + + void GpuMemFree(void *ptr, size_t size = 0) { + delete reinterpret_cast(ptr); + } + + ORCAHSALoaderContext(const ORCAHSALoaderContext &c); + + ORCAHSALoaderContext& operator=(const ORCAHSALoaderContext &c); + + enum gfx_handle { + gfx700 = 700, + gfx701 = 701, + gfx702 = 702, + gfx800 = 800, + gfx801 = 801, + gfx804 = 804, + gfx810 = 810, + gfx900 = 900, + gfx901 = 901 + }; + + pal::HSAILProgram* program_; +}; + +//! \class HSAIL program +class HSAILProgram : public device::Program +{ + friend class ClBinary; +public: + //! Default constructor + HSAILProgram(Device& device); + HSAILProgram(NullDevice& device); + //! Default destructor + ~HSAILProgram(); + + //! 
Returns the aclBinary associated with the progrm + aclBinary* binaryElf() const { + return static_cast(binaryElf_); } + + void addGlobalStore(Memory* mem) { globalStores_.push_back(mem); } + + const std::vector& globalStores() const { return globalStores_; } + + //! Return a typecasted GPU device + pal::Device& dev() + { return const_cast( + static_cast(device())); } + + //! Returns GPU kernel table + const Memory* kernelTable() const { return kernels_; } + + //! Adds all kernels to the mem handle lists + void fillResListWithKernels(std::vector& memList) const; + + //! Returns the maximum number of scratch regs used in the program + uint maxScratchRegs() const { return maxScratchRegs_; } + + //! Add internal static sampler + void addSampler(Sampler* sampler) { staticSamplers_.push_back(sampler); } + + //! Returns TRUE if the program just compiled + bool isNull() const { return isNull_; } + +protected: + //! pre-compile setup for GPU + virtual bool initBuild(amd::option::Options* options); + + //! post-compile setup for GPU + virtual bool finiBuild(bool isBuildGood); + + /*! 
\brief Compiles GPU CL program to LLVM binary (compiler frontend) + * + * \return True if we successefully compiled a GPU program + */ + virtual bool compileImpl( + const std::string& sourceCode, //!< the program's source code + const std::vector& headers, + const char** headerIncludeNames, + amd::option::Options* options //!< compile options's object + ); + + /* \brief Returns the next stage to compile from, based on sections in binary, + * also returns completeStages in a vector, which contains at least ACL_TYPE_DEFAULT, + * sets needOptionsCheck to true if options check is needed to decide whether or not to recompile + */ + aclType getCompilationStagesFromBinary(std::vector& completeStages, bool& needOptionsCheck); + + /* \brief Returns the next stage to compile from, based on sections and options in binary + */ + aclType getNextCompilationStageFromBinary(amd::option::Options* options); + + /*! \brief Compiles LLVM binary to FSAIL code (compiler backend: link+opt+codegen) + * + * \return The build error code + */ + int compileBinaryToFSAIL( + amd::option::Options* options //!< options for compilation + ); + + virtual bool linkImpl(amd::option::Options* options); + + //! Link the device programs. + virtual bool linkImpl (const std::vector& inputPrograms, + amd::option::Options* options, + bool createLibrary); + + virtual bool createBinary(amd::option::Options* options); + + //! Initialize Binary + virtual bool initClBinary(); + + //! Release the Binary + virtual void releaseClBinary(); + + virtual const aclTargetInfo & info(const char * str = ""); + + virtual bool isElf(const char* bin) const { + return amd::isElfMagic(bin); + //return false; + } + + //! 
Returns the binary + // This should ensure that the binary is updated with all the kernels + // ClBinary& clBinary() { return binary_; } + ClBinaryHsa* clBinary() { + return static_cast(device::Program::clBinary()); + } + const ClBinaryHsa* clBinary() const { + return static_cast(device::Program::clBinary()); + } + +private: + //! Disable default copy constructor + HSAILProgram(const HSAILProgram&); + + //! Disable operator= + HSAILProgram& operator=(const HSAILProgram&); + + //! Returns all the options to be appended while passing to the + //compiler library + std::string hsailOptions(); + + //! Allocate kernel table + bool allocKernelTable(); + + std::string openCLSource_; //!< Original OpenCL source + std::string HSAILProgram_; //!< FSAIL program after compilation + std::string llvmBinary_; //!< LLVM IR binary code + aclBinary* binaryElf_; //!< Binary for the new compiler library + void* rawBinary_; //!< Pointer to the raw binary + aclBinaryOptions binOpts_; //!< Binary options to create aclBinary + std::vector globalStores_; //!< Global memory for the program + Memory* kernels_; //!< Table with kernel object pointers + uint maxScratchRegs_; //!< Maximum number of scratch regs used in the program by individual kernel + std::list staticSamplers_; //!< List od internal static samplers + bool isNull_; //!< Null program no memory allocations + amd::hsa::loader::Loader* loader_; //!< Loader object + amd::hsa::loader::Executable* executable_; //!< Executable for HSA Loader + ORCAHSALoaderContext loaderContext_; //!< Context for HSA Loader +}; + +/*@}*/} // namespace pal + +#endif /*PALPROGRAM_HPP_*/ diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.cpp b/projects/clr/rocclr/runtime/device/pal/palresource.cpp new file mode 100644 index 0000000000..6a6a75124e --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palresource.cpp @@ -0,0 +1,2042 @@ +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+// +#include "platform/program.hpp" +#include "platform/kernel.hpp" +#include "os/os.hpp" +#include "device/device.hpp" +#include "utils/flags.hpp" +#include "thread/monitor.hpp" +#include "device/pal/palresource.hpp" +#include "device/pal/paldevice.hpp" +#include "device/pal/palblit.hpp" +#include "device/pal/paltimestamp.hpp" +#include "thread/atomic.hpp" +#include "hsa_ext_image.h" +#ifdef _WIN32 +#include +#include "CL/cl_d3d10.h" +#include "CL/cl_d3d11.h" +#endif // _WIN32 +#include +#include "GL/glATIInternal.h" + +#include +#include +#include +#include +#include + +namespace pal { + +//! Creates a GPU memory object described by createInfo and charges its size +//! against the free memory counters of the device. Returns nullptr on failure. +GpuMemoryReference* +GpuMemoryReference::Create( + const Device& dev, + const Pal::GpuMemoryCreateInfo& createInfo) +{ + Pal::Result result; + size_t gpuMemSize = dev.iDev()->GetGpuMemorySize(createInfo, &result); + if (result != Pal::Result::Success) { + return nullptr; + } + + GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(); + if (memRef == nullptr) { + // Nothing was allocated, so the free memory counters must stay untouched + return nullptr; + } + if (memRef != nullptr) { + result = dev.iDev()->CreateGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_); + if (result != Pal::Result::Success) { + memRef->release(); + return nullptr; + } + } + // Update free memory size counters + const_cast(dev).updateFreeMemory( + createInfo.heaps[0], createInfo.size, false); + return memRef; +} + +//! Creates a pinned GPU memory object on top of the system memory range +//! [sysMem, sysMem + memSize). Returns nullptr on failure. +GpuMemoryReference* +GpuMemoryReference::Create( + const Device& dev, + const void* sysMem, + size_t memSize) +{ + Pal::Result result; + size_t gpuMemSize = dev.iDev()->GetPinnedGpuMemorySize(sysMem, memSize, &result); + if (result != Pal::Result::Success) { + return nullptr; + } + + GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(); + if (memRef == nullptr) { + // Nothing was pinned, so the free memory counters must stay untouched + return nullptr; + } + Pal::VaRange vaRange = Pal::VaRange::Default; + if (memRef != nullptr) { + result = dev.iDev()->CreatePinnedGpuMemory(sysMem, memSize, vaRange, + &memRef[1], &memRef->gpuMem_); + if (result != Pal::Result::Success) { + memRef->release(); + return nullptr; + } + } + // Update free memory size counters + const_cast(dev).updateFreeMemory( +
Pal::GpuHeap::GpuHeapGartCacheable, memSize, false); + return memRef; +} + +//! Opens an externally shared resource as a GPU memory object. +//! Returns nullptr on failure. +GpuMemoryReference* +GpuMemoryReference::Create( + const Device& dev, + const Pal::ExternalResourceOpenInfo& openInfo) +{ + Pal::Result result; + size_t gpuMemSize = dev.iDev()->GetExternalSharedGpuMemorySize(&result); + if (result != Pal::Result::Success) { + return nullptr; + } + + Pal::GpuMemoryCreateInfo createInfo = {}; + GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(); + if (memRef != nullptr) { + result = dev.iDev()->OpenExternalSharedGpuMemory( + openInfo, &memRef[1], &createInfo, &memRef->gpuMem_); + if (result != Pal::Result::Success) { + memRef->release(); + return nullptr; + } + } + + return memRef; +} + +//! Opens an externally shared image. On success *image refers to the PAL image +//! placed into a char array allocated here, and imgCreateInfo receives its description. +//! Returns nullptr on failure; the image storage is released in that case. +GpuMemoryReference* +GpuMemoryReference::Create( + const Device& dev, + const Pal::ExternalImageOpenInfo& openInfo, + Pal::ImageCreateInfo* imgCreateInfo, + Pal::IImage** image) +{ + Pal::Result result; + size_t gpuMemSize = 0; + size_t imageSize = 0; + if (Pal::Result::Success != dev.iDev()->GetExternalSharedImageSizes( + openInfo, &imageSize, &gpuMemSize, imgCreateInfo)) { + return nullptr; + } + + Pal::GpuMemoryCreateInfo createInfo = {}; + GpuMemoryReference* memRef = new (gpuMemSize) GpuMemoryReference(); + char* imgMem = new char [imageSize]; + if (memRef != nullptr) { + result = dev.iDev()->OpenExternalSharedImage( + openInfo, imgMem, &memRef[1], &createInfo, image, &memRef->gpuMem_); + if (result != Pal::Result::Success) { + // The image storage is handed over through *image only on success + delete [] imgMem; + memRef->release(); + return nullptr; + } + } + else { + // No memory reference was created, so nobody would ever own the image storage + delete [] imgMem; + } + + return memRef; +} + +GpuMemoryReference::GpuMemoryReference() + : gpuMem_(nullptr) + , cpuAddress_(nullptr) +{ +} + +//! Unmaps the memory object if it is still mapped and destroys it. +GpuMemoryReference::~GpuMemoryReference() +{ + if (cpuAddress_ != nullptr) { + iMem()->Unmap(); + } + if (0 != iMem()) { + iMem()->Destroy(); + gpuMem_ = nullptr; + } +} + +Resource::Resource( + const Device& gpuDev, + size_t size) + : elementSize_(0) + , gpuDevice_(gpuDev) + , mapCount_(0) + , address_(nullptr) + , offset_(0) + , curRename_(0) + , memRef_(nullptr) +
, viewOwner_(nullptr) + , pinOffset_(0) + , gpu_(nullptr) + , image_(nullptr) + , hwSrd_(0) +{ + // Fill resource descriptor fields + desc_.state_ = 0; + desc_.type_ = Empty; + desc_.width_ = amd::alignUp(size, + Pal::Formats::BytesPerPixel(Pal::ChFmt::R32)) / + Pal::Formats::BytesPerPixel(Pal::ChFmt::R32); + desc_.height_ = 1; + desc_.depth_ = 1; + desc_.mipLevels_ = 1; + desc_.format_.image_channel_order = CL_R; + desc_.format_.image_channel_data_type = CL_FLOAT; + desc_.flags_ = 0; + desc_.pitch_ = 0; + desc_.slice_ = 0; + desc_.cardMemory_ = true; + desc_.dimSize_ = 1; + desc_.buffer_ = true; + desc_.imageArray_ = false; + desc_.topology_ = CL_MEM_OBJECT_BUFFER; + desc_.SVMRes_ = false; + desc_.scratch_ = false; + desc_.isAllocExecute_ = false; +} + +Resource::Resource( + const Device& gpuDev, + size_t width, + size_t height, + size_t depth, + cl_image_format format, + cl_mem_object_type imageType, + uint mipLevels) + : elementSize_(0) + , gpuDevice_(gpuDev) + , mapCount_(0) + , address_(nullptr) + , offset_(0) + , curRename_(0) + , memRef_(nullptr) + , viewOwner_(nullptr) + , pinOffset_(0) + , gpu_(nullptr) + , image_(nullptr) + , hwSrd_(0) +{ + // Fill resource descriptor fields + desc_.state_ = 0; + desc_.type_ = Empty; + desc_.width_ = width; + desc_.height_ = height; + desc_.depth_ = depth; + desc_.mipLevels_ = mipLevels; + desc_.format_ = format; + desc_.flags_ = 0; + desc_.pitch_ = 0; + desc_.slice_ = 0; + desc_.cardMemory_ = true; + desc_.buffer_ = false; + desc_.imageArray_ = false; + desc_.topology_ = imageType; + desc_.SVMRes_ = false; + desc_.scratch_ = false; + desc_.isAllocExecute_ = false; + + switch (imageType) { + case CL_MEM_OBJECT_IMAGE2D: + desc_.dimSize_ = 2; + break; + case CL_MEM_OBJECT_IMAGE3D: + desc_.dimSize_ = 3; + break; + case CL_MEM_OBJECT_IMAGE2D_ARRAY: + desc_.dimSize_ = 3; + desc_.imageArray_ = true; + break; + case CL_MEM_OBJECT_IMAGE1D: + desc_.dimSize_ = 1; + break; + case CL_MEM_OBJECT_IMAGE1D_ARRAY: + desc_.dimSize_ = 2; + 
desc_.imageArray_ = true; + break; + case CL_MEM_OBJECT_IMAGE1D_BUFFER: + desc_.dimSize_ = 1; + break; + default: + desc_.dimSize_ = 1; + LogError("Unknown image type!"); + break; + } +} + +Resource::~Resource() +{ + Pal::GpuHeap heap = Pal::GpuHeapCount; + switch (memoryType()) { + case Persistent: + heap = Pal::GpuHeapLocal; + break; + case RemoteUSWC: + heap = Pal::GpuHeapGartUswc; + break; + case Pinned: + case Remote: + heap = Pal::GpuHeapGartCacheable; + break; + case Shader: + case BusAddressable: + case ExternalPhysical: + // Fall through to process the memory allocation ... + case Local: + heap = Pal::GpuHeapInvisible; + break; + } + if ((memRef_ != nullptr) && (heap != Pal::GpuHeapCount)) { + // Update free memory size counters + const_cast(dev()).updateFreeMemory( + heap, iMem()->Desc().size, true); + } + + free(); + + if ((nullptr != image_) && ((memoryType() != ImageView) || + //! @todo PAL doesn't allow an SRD view creation with different pixel size + (elementSize() != viewOwner_->elementSize()))) { + image_->Destroy(); + delete [] reinterpret_cast(image_); + } +} + +static uint32_t GetHSAILImageFormatType(const cl_image_format& format) +{ + static const uint32_t FormatType[] = { + HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8, + HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, + HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT, + HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24 }; + + uint idx = format.image_channel_data_type - CL_SNORM_INT8; + assert((idx <= (CL_UNORM_INT24 
- CL_SNORM_INT8)) && "Out of range format channel!"); + return FormatType[idx]; +} + +static uint32_t GetHSAILImageOrderType(const cl_image_format& format) +{ + static const uint32_t OrderType[] = { + HSA_EXT_IMAGE_CHANNEL_ORDER_R, + HSA_EXT_IMAGE_CHANNEL_ORDER_A, + HSA_EXT_IMAGE_CHANNEL_ORDER_RG, + HSA_EXT_IMAGE_CHANNEL_ORDER_RA, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGB, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA, + HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA, + HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB, + HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY, + HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE, + HSA_EXT_IMAGE_CHANNEL_ORDER_RX, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGX, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX, + HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH, + HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA, + HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA, + HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR }; + + uint idx = format.image_channel_order - CL_R; + assert((idx <= (CL_ABGR - CL_R)) && "Out of range format order!"); + return OrderType[idx]; +} + +void +Resource::memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo) +{ + createInfo->heapCount = 1; + switch (memoryType()) { + case Persistent: + createInfo->heaps[0] = Pal::GpuHeapLocal; + break; + case RemoteUSWC: + createInfo->heaps[0] = Pal::GpuHeapGartUswc; + desc_.cardMemory_ = false; + break; + case Remote: + createInfo->heaps[0] = Pal::GpuHeapGartCacheable; + desc_.cardMemory_ = false; + break; + case Shader: + case BusAddressable: + case ExternalPhysical: + // Fall through to process the memory allocation ... 
+ case Local: + createInfo->heapCount = 2; + createInfo->heaps[0] = Pal::GpuHeapInvisible; + createInfo->heaps[1] = Pal::GpuHeapLocal; + break; + } +} + +bool +Resource::create(MemoryType memType, CreateParams* params) +{ + static const Pal::gpusize MaxGpuAlignment = 64 * Ki; + const amd::HostMemoryReference* hostMemRef = nullptr; + bool imageCreateView = false; + uint hostMemOffset = 0; + bool foundCalRef = false; + bool viewDefined = false; + uint viewLayer = 0; + uint viewLevel = 0; + uint viewFlags = 0; + Pal::SubresId ImgSubresId = { Pal::ImageAspect::Color, 0, 0 }; + Pal::SubresRange ImgSubresRange = { ImgSubresId, 1, 1 }; + Pal::ChannelMapping channels; + Pal::Format format = dev().getPalFormat(desc().format_, &channels); + + // This is a thread safe operation + const_cast(dev()).initializeHeapResources(); + + amd::ScopedLock lk(dev().lockPAL()); + + if (memType == Shader) { + // force to use remote memory for HW DEBUG or use + // local memory once we determine if FGS is supported + // memType = (!dev().settings().enableHwDebug_) ? Local : RemoteUSWC; + memType = RemoteUSWC; + } + + // Get the element size + elementSize_ = Pal::Formats::BytesPerPixel(format.chFmt); + desc_.type_ = memType; + if (memType == Scratch) { + // use local memory for scratch buffer unless it is using HW DEBUG + desc_.type_ = (!dev().settings().enableHwDebug_) ? 
Local : RemoteUSWC; + desc_.scratch_ = true; + } + + // Force remote allocation if it was requested in the settings + if (dev().settings().remoteAlloc_ && + ((memoryType() == Local) || + (memoryType() == Persistent))) { + if (dev().settings().apuSystem_ && dev().settings().viPlus_) { + desc_.type_ = Remote; + } + else { + desc_.type_ = RemoteUSWC; + } + } + + if (dev().settings().disablePersistent_ && (memoryType() == Persistent)) { + desc_.type_ = RemoteUSWC; + } + + if (params != nullptr) { + gpu_ = params->gpu_; + } + + Pal::Result result; + +#ifdef _WIN32 + if ((memoryType() == OGLInterop) || + (memoryType() == D3D9Interop) || + (memoryType() == D3D10Interop) || + (memoryType() == D3D11Interop)) { + Pal::ExternalResourceOpenInfo openInfo = {}; + uint misc = 0; + uint layer = 0; + uint mipLevel = 0; + InteropType type = InteropTypeless; + + if (memoryType() == OGLInterop) { + OGLInteropParams* oglRes = reinterpret_cast(params); + assert(oglRes->glPlatformContext_ && "We don't have OGL context!"); + switch (oglRes->type_) { + case InteropVertexBuffer: + glType_ = GL_RESOURCE_ATTACH_VERTEXBUFFER_AMD; + break; + case InteropRenderBuffer: + glType_ = GL_RESOURCE_ATTACH_RENDERBUFFER_AMD; + break; + case InteropTexture: + case InteropTextureViewLevel: + case InteropTextureViewCube: + glType_ = GL_RESOURCE_ATTACH_TEXTURE_AMD; + break; + default: + LogError("Unknown OGL interop type!"); + return false; + break; + } + glPlatformContext_ = oglRes->glPlatformContext_; + glDeviceContext_ = oglRes->glDeviceContext_; + layer = oglRes->layer_; + type = oglRes->type_; + mipLevel = oglRes->mipLevel_; + + if (!dev().resGLAssociate(oglRes->glPlatformContext_, oglRes->handle_, + glType_, &openInfo.hExternalResource, &glInteropMbRes_, &offset_)) { + return false; + } + } + else { + D3DInteropParams* d3dRes = reinterpret_cast(params); + openInfo.hExternalResource = d3dRes->handle_; + misc = d3dRes->misc; + layer = d3dRes->layer_; + type = d3dRes->type_; + mipLevel = 
d3dRes->mipLevel_; + } + //! @todo PAL query for image/buffer object doesn't work properly! +#if 0 + bool isImage = false; + if (Pal::Result::Success != + dev().iDev()->DetermineExternalSharedResourceType(openInfo, &isImage)) { + return false; + } +#endif // 0 + if (desc().buffer_ || misc) { + memRef_ = GpuMemoryReference::Create(dev(), openInfo); + if (nullptr == memRef_) { + return false; + } + + if (misc) { + Pal::ImageCreateInfo imgCreateInfo = {}; + Pal::ExternalImageOpenInfo imgOpenInfo = {}; + imgOpenInfo.resourceInfo = openInfo; + imgOpenInfo.format = format; + imgOpenInfo.flags.formatChangeSrd = true; + imgOpenInfo.usage.shaderRead = true; + imgOpenInfo.usage.shaderWrite = true; + Pal::gpusize imageSize; + Pal::gpusize gpuMemSize; + + if (Pal::Result::Success != dev().iDev()->GetExternalSharedImageSizes( + imgOpenInfo, &imageSize, &gpuMemSize, &imgCreateInfo)) { + return false; + } + + Pal::gpusize viewOffset = 0; + imgCreateInfo.flags.shareable = false; + imgCreateInfo.imageType = Pal::ImageType::Tex2d; + imgCreateInfo.extent.width = desc().width_; + imgCreateInfo.extent.height = desc().height_; + imgCreateInfo.extent.depth = desc().depth_; + imgCreateInfo.arraySize = 1; + imgCreateInfo.flags.formatChangeSrd = true; + imgCreateInfo.usageFlags.shaderRead = true; + imgCreateInfo.usageFlags.shaderWrite = true; + imgCreateInfo.format = format; + imgCreateInfo.mipLevels = 1; + imgCreateInfo.samples = 1; + imgCreateInfo.fragments = 1; + imgCreateInfo.tiling = Pal::ImageTiling::Linear; + + switch (misc) { + case 1: // NV12 format + switch (layer) { + case -1: + break; + case 0: + break; + case 1: + // Y - plane size to the offset + // NV12 format. 
UV is 2 times smaller plane Y + viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_; + imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_; + break; + default: + LogError("Unknown Interop View Type"); + return false; + } + break; + case 2: // YV12 format + switch (layer) { + case -1: + break; + case 0: + break; + case 1: + // Y - plane size to the offset + // YV12 format. U is 4 times smaller plane than Y + viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_; + imgCreateInfo.rowPitch >>= 1; + break; + case 2: + // Y + U plane sizes to the offset. + // U plane is 4 times smaller than Y and U == V + viewOffset = 5 * imgCreateInfo.rowPitch * desc().height_ / 2; + imgCreateInfo.rowPitch >>= 1; + break; + default: + LogError("Unknown Interop View Type"); + return false; + } + imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_; + break; + default: + LogError("Unknown Interop View Type"); + return false; + } + + imageSize = dev().iDev()->GetImageSize(imgCreateInfo, &result); + if (result != Pal::Result::Success) { + return false; + } + + char* memImg = new char[imageSize]; + if (memImg != nullptr) { + result = dev().iDev()->CreateImage(imgCreateInfo, memImg, &image_); + if (result != Pal::Result::Success) { + // memImg comes from new[], so it has to be released with delete[]; + // also make sure image_ doesn't refer to the released storage + delete [] memImg; + image_ = nullptr; + return false; + } + } + result = image_->BindGpuMemory(iMem(), viewOffset); + offset_ = static_cast(viewOffset); + hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); + if ((0 == hwSrd_) && (memoryType() != ImageView)) { + return false; + } + Pal::ImageViewInfo viewInfo = {}; + viewInfo.viewType = Pal::ImageViewType::Tex2d; + viewInfo.pImage = image_; + viewInfo.format = format; + viewInfo.channels = channels; + viewInfo.subresRange = ImgSubresRange; + dev().iDev()->CreateImageViewSrds(1, &viewInfo, hwState_); + + hwState_[8] = GetHSAILImageFormatType(desc().format_); + hwState_[9] = GetHSAILImageOrderType(desc().format_); + hwState_[10] = static_cast(desc().width_); + hwState_[11] = 0; // one
extra reserved field in the argument + } + } + else if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) { + memRef_ = GpuMemoryReference::Create(dev(), openInfo); + if (nullptr == memRef_) { + return false; + } + Pal::BufferViewInfo viewInfo = {}; + viewInfo.gpuAddr = memRef_->iMem()->Desc().gpuVirtAddr + offset(); + viewInfo.range = memRef_->iMem()->Desc().size; + viewInfo.stride = elementSize(); + viewInfo.format = format; + hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); + if ((0 == hwSrd_) && (memoryType() != ImageView)) { + return false; + } + + dev().iDev()->CreateTypedBufferViewSrds(1, &viewInfo, hwState_); + hwState_[8] = GetHSAILImageFormatType(desc().format_); + hwState_[9] = GetHSAILImageOrderType(desc().format_); + hwState_[10] = static_cast(desc().width_); + hwState_[11] = 0; // one extra reserved field in the argument + } + else { + Pal::ExternalImageOpenInfo imgOpenInfo = {}; + Pal::ImageCreateInfo imgCreateInfo = {}; + imgOpenInfo.resourceInfo = openInfo; + imgOpenInfo.format = format; + imgOpenInfo.flags.formatChangeSrd = true; + imgOpenInfo.usage.shaderRead = true; + imgOpenInfo.usage.shaderWrite = true; + memRef_ = GpuMemoryReference::Create( + dev(), imgOpenInfo, &imgCreateInfo, &image_); + if (nullptr == memRef_) { + return false; + } + + hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); + if ((0 == hwSrd_) && (memoryType() != ImageView)) { + return false; + } + Pal::ImageViewInfo viewInfo = {}; + viewInfo.viewType = Pal::ImageViewType::Tex2d; + switch (imgCreateInfo.imageType) { + case Pal::ImageType::Tex3d: + viewInfo.viewType = Pal::ImageViewType::Tex3d; + break; + case Pal::ImageType::Tex1d: + viewInfo.viewType = Pal::ImageViewType::Tex1d; + break; + } + viewInfo.pImage = image_; + viewInfo.format = format; + viewInfo.channels = channels; + if ((type == InteropTextureViewLevel) || + (type == InteropTextureViewCube)) { + ImgSubresRange.startSubres.mipLevel = mipLevel; + if (type == InteropTextureViewCube) { 
+ ImgSubresRange.startSubres.arraySlice = layer; + viewInfo.viewType = Pal::ImageViewType::Tex2d; + } + } + if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + ImgSubresRange.numSlices = desc_.height_; + } + if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY) { + ImgSubresRange.numSlices = desc_.depth_; + } + viewInfo.subresRange = ImgSubresRange; + + dev().iDev()->CreateImageViewSrds(1, &viewInfo, hwState_); + + hwState_[8] = GetHSAILImageFormatType(desc().format_); + hwState_[9] = GetHSAILImageOrderType(desc().format_); + hwState_[10] = static_cast(desc().width_); + hwState_[11] = 0; // one extra reserved field in the argument + } + return true; + } +#endif // _WIN32 + + if (!desc_.buffer_) { + if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) { + Pal::GpuMemoryCreateInfo createInfo = {}; + createInfo.size = desc().width_ * elementSize(); + // @todo 64K alignment is too big + createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment); + createInfo.alignment = MaxGpuAlignment; + createInfo.vaRange = Pal::VaRange::Default; + createInfo.priority = Pal::GpuMemPriority::Normal; + memTypeToHeap(&createInfo); + // createInfo.priority; + memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment); + if (nullptr == memRef_) { + memRef_ = GpuMemoryReference::Create(dev(), createInfo); + if (nullptr == memRef_) { + LogError("Failed PAL memory allocation!"); + return false; + } + } + Pal::BufferViewInfo viewInfo = {}; + viewInfo.gpuAddr = memRef_->iMem()->Desc().gpuVirtAddr + offset(); + viewInfo.range = memRef_->iMem()->Desc().size; + viewInfo.stride = elementSize(); + viewInfo.format = format; + hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); + if ((0 == hwSrd_) && (memoryType() != ImageView)) { + return false; + } + + dev().iDev()->CreateTypedBufferViewSrds(1, &viewInfo, hwState_); + hwState_[8] = GetHSAILImageFormatType(desc().format_); + hwState_[9] = GetHSAILImageOrderType(desc().format_); + hwState_[10] 
= static_cast(desc().width_); + hwState_[11] = 0; // one extra reserved field in the argument + return true; + } + + Pal::ImageViewInfo viewInfo = {}; + Pal::ImageCreateInfo imgCreateInfo = {}; + Pal::GpuMemoryRequirements req = {}; + char* memImg; + imgCreateInfo.imageType = Pal::ImageType::Tex2d; + viewInfo.viewType = Pal::ImageViewType::Tex2d; + imgCreateInfo.extent.width = desc_.width_; + imgCreateInfo.extent.height = desc_.height_; + imgCreateInfo.extent.depth = desc_.depth_; + imgCreateInfo.arraySize = 1; + + switch (desc_.topology_) { + case CL_MEM_OBJECT_IMAGE3D: + imgCreateInfo.imageType = Pal::ImageType::Tex3d; + viewInfo.viewType = Pal::ImageViewType::Tex3d; + break; + case CL_MEM_OBJECT_IMAGE1D: + case CL_MEM_OBJECT_IMAGE1D_ARRAY: + case CL_MEM_OBJECT_IMAGE1D_BUFFER: + imgCreateInfo.imageType = Pal::ImageType::Tex1d; + viewInfo.viewType = Pal::ImageViewType::Tex1d; + break; + } + if (desc_.topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + ImgSubresRange.numSlices = imgCreateInfo.arraySize = desc_.height_; + imgCreateInfo.extent.depth = desc_.height_; + imgCreateInfo.extent.height = 1; + } + if (desc_.topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY) { + ImgSubresRange.numSlices = imgCreateInfo.arraySize = desc_.depth_; + } + + if (memoryType() == ImageView) { + ImageViewParams* imageView = reinterpret_cast(params); + ImgSubresRange.startSubres.mipLevel = imageView->level_; + ImgSubresRange.startSubres.arraySlice = imageView->layer_; + viewOwner_ = imageView->resource_; + image_ = viewOwner_->image_; + offset_ = viewOwner_->offset_; + } + else if (memoryType() == ImageBuffer) { + ImageBufferParams* imageBuffer = reinterpret_cast(params); + viewOwner_ = imageBuffer->resource_; + } + + if ((memoryType() != ImageView) || + //! 
@todo PAL doesn't allow an SRD view creation with different pixel size + (elementSize() != viewOwner_->elementSize())) { + imgCreateInfo.flags.formatChangeSrd = true; + imgCreateInfo.usageFlags.shaderRead = true; + imgCreateInfo.usageFlags.shaderWrite = true; + imgCreateInfo.format = format; + imgCreateInfo.mipLevels = (desc_.mipLevels_) ? desc_.mipLevels_ : 1; + imgCreateInfo.samples = 1; + imgCreateInfo.fragments = 1; + Pal::ImageTiling tiling = Pal::ImageTiling::Optimal; + + if (((memoryType() == Persistent) && + dev().settings().linearPersistentImage_) || + (memoryType() == ImageBuffer)) { + tiling = Pal::ImageTiling::Linear; + } + else if (memoryType() == ImageView) { + tiling = viewOwner_->image_->GetImageCreateInfo().tiling; + } + imgCreateInfo.tiling = tiling; + + size_t imageSize = dev().iDev()->GetImageSize(imgCreateInfo, &result); + if (result != Pal::Result::Success) { + return false; + } + + memImg = new char[imageSize]; + if (memImg != nullptr) { + result = dev().iDev()->CreateImage(imgCreateInfo, memImg, &image_); + if (result != Pal::Result::Success) { + delete memImg; + return false; + } + } + image_->GetGpuMemoryRequirements(&req); + // createInfo.priority; + } + + if ((memoryType() != ImageView) && (memoryType() != ImageBuffer)) { + Pal::GpuMemoryCreateInfo createInfo = {}; + createInfo.size = amd::alignUp(req.size, MaxGpuAlignment); + createInfo.alignment = std::max(req.alignment, MaxGpuAlignment); + createInfo.vaRange = Pal::VaRange::Default; + createInfo.priority = Pal::GpuMemPriority::Normal; + memTypeToHeap(&createInfo); + + memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment); + if (nullptr == memRef_) { + memRef_ = GpuMemoryReference::Create(dev(), createInfo); + if (nullptr == memRef_) { + LogError("Failed PAL memory allocation!"); + return false; + } + } + } + else { + memRef_ = viewOwner_->memRef_; + memRef_->retain(); + desc_.cardMemory_ = viewOwner_->desc().cardMemory_; + if (req.size > 
viewOwner_->iMem()->Desc().size) { + LogWarning("Image is bigger than the original mem object!"); + } + } + + result = image_->BindGpuMemory(memRef_->gpuMem_, offset_); + if (result != Pal::Result::Success) { + return false; + } + + hwSrd_ = dev().srds().allocSrdSlot(reinterpret_cast(&hwState_)); + if ((0 == hwSrd_) && (memoryType() != ImageView)) { + return false; + } + viewInfo.pImage = image_; + viewInfo.format = format; + viewInfo.channels = channels; + viewInfo.subresRange = ImgSubresRange; + dev().iDev()->CreateImageViewSrds(1, &viewInfo, hwState_); + + hwState_[8] = GetHSAILImageFormatType(desc().format_); + hwState_[9] = GetHSAILImageOrderType(desc().format_); + hwState_[10] = static_cast(desc().width_); + hwState_[11] = 0; // one extra reserved field in the argument + return true; + } + + if (memoryType() == View) { + // Save the offset in the global heap + ViewParams* view = reinterpret_cast(params); + offset_ = view->offset_; + + // Make sure parent was provided + if (nullptr != view->resource_) { + viewOwner_ = view->resource_; + offset_ += viewOwner_->offset(); + + if (viewOwner_->isMemoryType(Pinned)) { + address_ = viewOwner_->data() + view->offset_; + } + pinOffset_ = viewOwner_->pinOffset(); + memRef_ = viewOwner_->memRef_; + memRef_->retain(); + desc_.cardMemory_ = viewOwner_->desc().cardMemory_; + } + else { + desc_.type_ = Empty; + } + return true; + } + + if (memoryType() == Pinned) { + PinnedParams* pinned = reinterpret_cast(params); + uint allocSize = static_cast(pinned->size_); + void* pinAddress; + hostMemRef = pinned->hostMemRef_; + pinAddress = address_ = hostMemRef->hostMem(); + // assert((allocSize == (desc().width_ * elementSize())) && "Sizes don't match"); + if (desc().topology_ == CL_MEM_OBJECT_BUFFER) { + // Allign offset to 4K boundary (Vista/Win7 limitation) + char* tmpHost = const_cast( + amd::alignDown(reinterpret_cast(address_), + PinnedMemoryAlignment)); + + // Find the partial size for unaligned copy + hostMemOffset = 
static_cast( + reinterpret_cast(address_) - tmpHost); + + pinOffset_ = hostMemOffset; + + pinAddress = tmpHost; + + // Align width to avoid GSL useless assert with a view + if (hostMemOffset != 0) { + allocSize += hostMemOffset; + } + allocSize = amd::alignUp(allocSize, PinnedMemoryAlignment); +// hostMemOffset &= ~(0xff); + } + else if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D) { + //! @todo: Width has to be aligned for 3D. + //! Need to be replaced with a compute copy + // Width aligned by 8 texels + if (((desc().width_ % 0x8) != 0) || + // Pitch aligned by 64 bytes + (((desc().width_ * elementSize()) % 0x40) != 0)) { + return false; + } + } + else { + //! @todo GSL doesn't support pinning with resAlloc_ + return false; + } + + // Ensure page alignment + if ((uint64_t)(pinAddress) & (amd::Os::pageSize() - 1)) { + return false; + } + + memRef_ = GpuMemoryReference::Create(dev(), pinAddress, allocSize); + if (nullptr == memRef_) { + LogError("Failed PAL memory allocation!"); + pinOffset_ = 0; + return false; + } + desc_.cardMemory_ = false; + return true; + } + + Pal::GpuMemoryCreateInfo createInfo = {}; + createInfo.size = desc().width_ * elementSize_; + // @todo 64K alignment is too big + createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment); + createInfo.alignment = MaxGpuAlignment; + createInfo.vaRange = Pal::VaRange::Default; + createInfo.priority = Pal::GpuMemPriority::Normal; + memTypeToHeap(&createInfo); + // createInfo.priority; + memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment); + if (nullptr == memRef_) { + memRef_ = GpuMemoryReference::Create(dev(), createInfo); + if (nullptr == memRef_) { + LogError("Failed PAL memory allocation!"); + return false; + } + } + + return true; +} + +void +Resource::free() +{ + if (memRef_ == nullptr) { + return; + } + + // Sanity check for the map calls + if (mapCount_ != 0) { + LogWarning("Resource wasn't unlocked, but destroyed!"); + } + const bool wait = 
(memoryType() != ImageView) && + (memoryType() != ImageBuffer) && + (memoryType() != View); + + // Check if resource could be used in any queue(thread) + if (gpu_ == nullptr) { + Device::ScopedLockVgpus lock(dev()); + + if (renames_.size() == 0) { + // Destroy GSL resource + if (iMem() != 0) { + // Release all virtual memory objects on all virtual GPUs + for (uint idx = 0; idx < dev().vgpus().size(); ++idx) { + // Ignore the transfer queue, + // since it releases resources after every operation + if (dev().vgpus()[idx] != dev().xferQueue()) { + dev().vgpus()[idx]->releaseMemory(iMem(), wait); + } + } + + //! @note: This is a workaround for bad applications that + //! don't unmap memory + if (mapCount_ != 0) { + unmap(nullptr); + } + + // Add resource to the cache + if (wait && !dev().resourceCache().addGpuMemory(&desc_, memRef_)) { + gslFree(); + } + } + } + else { + renames_[curRename_]->cpuAddress_ = 0; + for (size_t i = 0; i < renames_.size(); ++i) { + memRef_ = renames_[i]; + // Destroy GSL resource + if (iMem() != 0) { + // Release all virtual memory objects on all virtual GPUs + for (uint idx = 0; idx < dev().vgpus().size(); ++idx) { + // Ignore the transfer queue, + // since it releases resources after every operation + if (dev().vgpus()[idx] != dev().xferQueue()) { + dev().vgpus()[idx]->releaseMemory(iMem()); + } + } + gslFree(); + } + } + } + } + else { + if (renames_.size() == 0) { + // Destroy GSL resource + if (wait && (iMem() != 0)) { + // Release virtual memory object on the specified virtual GPU + gpu_->releaseMemory(iMem(), wait); + gslFree(); + } + } + else for (size_t i = 0; i < renames_.size(); ++i) { + memRef_ = renames_[i]; + // Destroy GSL resource + if (iMem() != 0) { + // Release virtual memory object on the specified virtual GPUs + gpu_->releaseMemory(iMem()); + gslFree(); + } + } + } + + // Free SRD for images + if (!desc().buffer_) { + dev().srds().freeSrdSlot(hwSrd_); + } +} + +void +Resource::writeRawData( + VirtualGPU& gpu, + size_t 
size, + const void* data, + bool waitForEvent) const +{ + GpuEvent event; + + // Write data size bytes to surface + // size needs to be DWORD aligned + assert((size & 3) == 0); + gpu.eventBegin(MainEngine); + //! @todo Remove cache flush + //! It's a workaround for a PAL crash with embedded data, allocated before any command + gpu.flushCUCaches(); + gpu.queue(MainEngine).addCmdMemRef(iMem()); + gpu.iCmd()->CmdUpdateMemory(*iMem(), 0, size, reinterpret_cast(data)); + gpu.eventEnd(MainEngine, event); + + setBusy(gpu, event); + // Update the global GPU event + gpu.setGpuEvent(event, false); + + if (waitForEvent) { + // Wait for event to complete + gpu.waitForEvent(&event); + } +} +static const Pal::ChFmt ChannelFmt(uint bytesPerElement) +{ + if (bytesPerElement == 16) { + return Pal::ChFmt::R32G32B32A32; + } + else if (bytesPerElement == 8) { + return Pal::ChFmt::R32G32; + } + else if (bytesPerElement == 4) { + return Pal::ChFmt::R32; + } + else if (bytesPerElement == 2) { + return Pal::ChFmt::R16; + } + else { + return Pal::ChFmt::R8; + } +} + +bool +Resource::partialMemCopyTo( + VirtualGPU& gpu, + const amd::Coord3D& srcOrigin, + const amd::Coord3D& dstOrigin, + const amd::Coord3D& size, + Resource& dstResource, + bool enableCopyRect, + bool flushDMA, + uint bytesPerElement) const +{ + Pal::SubresId ImgSubresId = { Pal::ImageAspect::Color, 0, 0 }; + Pal::SubresRange ImgSubresRange = { ImgSubresId, 1, 1 }; + GpuEvent event; + bool result = true; + EngineType activeEngineID = gpu.engineID_; + static const bool waitOnBusyEngine = true; + // \note timing issues in Linux with sync mode + bool flush = true; + + // Check if runtime can use async memory copy, + // even if a caller didn't request async + if (!desc().cardMemory_ || !dstResource.desc().cardMemory_) { + // Switch to SDMA engine + gpu.engineID_ = SdmaEngine; + flush = false; + } + else { + assert("Unsupported configuraiton!"); + } + + // Wait for the resources, since runtime may use async transfers + wait(gpu, 
waitOnBusyEngine); + dstResource.wait(gpu, waitOnBusyEngine); + + size_t calSrcOrigin[3], calDstOrigin[3], calSize[3]; + calSrcOrigin[0] = srcOrigin[0] + pinOffset(); + calSrcOrigin[1] = srcOrigin[1]; + calSrcOrigin[2] = srcOrigin[2]; + calDstOrigin[0] = dstOrigin[0] + dstResource.pinOffset(); + calDstOrigin[1] = dstOrigin[1]; + calDstOrigin[2] = dstOrigin[2]; + calSize[0] = size[0]; + calSize[1] = size[1]; + calSize[2] = size[2]; + + if (gpu.validateSdmaOverlap(*this, dstResource)) { + gpu.flushDMA(SdmaEngine); + } + + Pal::ImageLayout imgLayout = {}; + gpu.eventBegin(gpu.engineID_); + gpu.queue(gpu.engineID_).addCmdMemRef(iMem()); + gpu.queue(gpu.engineID_).addCmdMemRef(dstResource.iMem()); + if (desc().buffer_ && !dstResource.desc().buffer_) { + Pal::MemoryImageCopyRegion copyRegion = {}; + copyRegion.imageSubres = ImgSubresId; + copyRegion.imageOffset.x = calDstOrigin[0]; + copyRegion.imageOffset.y = calDstOrigin[1]; + copyRegion.imageOffset.z = calDstOrigin[2]; + copyRegion.imageExtent.width = calSize[0]; + copyRegion.imageExtent.height = calSize[1]; + copyRegion.imageExtent.depth = calSize[2]; + copyRegion.numSlices = 1; + copyRegion.gpuMemoryOffset = calSrcOrigin[0] + offset(); + copyRegion.gpuMemoryRowPitch = (calSrcOrigin[1]) ? calSrcOrigin[1] : + calSize[0] * dstResource.elementSize(); + copyRegion.gpuMemoryDepthPitch = (calSrcOrigin[2]) ? calSrcOrigin[2] : + copyRegion.gpuMemoryRowPitch * calSize[1]; + // Make sure linear pitch in bytes is 4 bytes aligned + if (((copyRegion.gpuMemoryRowPitch % 4) != 0) || + // another DRM restriciton... 
SI has 4 pixels + (copyRegion.gpuMemoryOffset % 4 != 0)) { + result = false; + } + else { + gpu.iCmd()->CmdCopyMemoryToImage(*iMem(), *dstResource.image_, + imgLayout, 1, ©Region); + } + } + else if (!desc().buffer_ && dstResource.desc().buffer_) { + Pal::MemoryImageCopyRegion copyRegion = {}; + copyRegion.imageSubres = ImgSubresId; + copyRegion.imageOffset.x = calSrcOrigin[0]; + copyRegion.imageOffset.y = calSrcOrigin[1]; + copyRegion.imageOffset.z = calSrcOrigin[2]; + copyRegion.imageExtent.width = calSize[0]; + copyRegion.imageExtent.height = calSize[1]; + copyRegion.imageExtent.depth = calSize[2]; + copyRegion.numSlices = 1; + copyRegion.gpuMemoryOffset = calDstOrigin[0] + dstResource.offset(); + copyRegion.gpuMemoryRowPitch = (calDstOrigin[1]) ? calDstOrigin[1] : + calSize[0] * elementSize(); + copyRegion.gpuMemoryDepthPitch = (calDstOrigin[2]) ? calDstOrigin[2] : + copyRegion.gpuMemoryRowPitch * calSize[1]; + // Make sure linear pitch in bytes is 4 bytes aligned + if (((copyRegion.gpuMemoryRowPitch % 4) != 0) || + // another DRM restriciton... 
SI has 4 pixels + (copyRegion.gpuMemoryOffset % 4 != 0)) { + result = false; + } + else { + gpu.iCmd()->CmdCopyImageToMemory(*image_, imgLayout, + *dstResource.iMem(), 1, ©Region); + } + } + else { + if (enableCopyRect) { + Pal::TypedBufferCopyRegion copyRegion = {}; + copyRegion.srcBuffer.format.chFmt = ChannelFmt(bytesPerElement); + copyRegion.srcBuffer.format.numFmt = Pal::NumFmt::Uint; + copyRegion.srcBuffer.offset = calSrcOrigin[0] + offset(); + copyRegion.srcBuffer.rowPitch = calSrcOrigin[1]; + copyRegion.srcBuffer.depthPitch = calSrcOrigin[2]; + copyRegion.extent.width = calSize[0]; + copyRegion.extent.height = calSize[1]; + copyRegion.extent.depth = calSize[2]; + copyRegion.dstBuffer.format.chFmt = ChannelFmt(bytesPerElement); + copyRegion.dstBuffer.format.numFmt = Pal::NumFmt::Uint; + copyRegion.dstBuffer.offset = calDstOrigin[0] + dstResource.offset(); + copyRegion.dstBuffer.rowPitch = calDstOrigin[1]; + copyRegion.dstBuffer.depthPitch = calDstOrigin[2]; + gpu.iCmd()->CmdCopyTypedBuffer(*iMem(), *dstResource.iMem(), + 1, ©Region); + } + else { + Pal::MemoryCopyRegion copyRegion = {}; + copyRegion.srcOffset = calSrcOrigin[0] + offset(); + copyRegion.dstOffset = calDstOrigin[0] + dstResource.offset(); + copyRegion.copySize = calSize[0]; + gpu.iCmd()->CmdCopyMemory(*iMem(), *dstResource.iMem(), + 1, ©Region); + } + } + + gpu.eventEnd(gpu.engineID_, event); + + if (result) { + // Mark source and destination as busy + setBusy(gpu, event); + dstResource.setBusy(gpu, event); + + // Update the global GPU event + gpu.setGpuEvent(event, (flush | flushDMA)); + } + + // Restore the original engine + gpu.engineID_ = activeEngineID; + + return result; +} + +void +Resource::setBusy( + VirtualGPU& gpu, + GpuEvent gpuEvent + ) const +{ + gpu.assignGpuEvent(iMem(), gpuEvent); + + // If current resource is a view, then update the parent event as well + if (viewOwner_ != nullptr) { + viewOwner_->setBusy(gpu, gpuEvent); + } +} + +void +Resource::wait(VirtualGPU& gpu, bool 
waitOnBusyEngine) const +{ + GpuEvent* gpuEvent = gpu.getGpuEvent(iMem()); + + // Check if we have to wait unconditionally + if (!waitOnBusyEngine || + // or we have to wait only if another engine was used on this resource + (waitOnBusyEngine && (gpuEvent->engineId_ != gpu.engineID_))) { + gpu.waitForEvent(gpuEvent); + } + + // If current resource is a view and not in the global heap, + // then wait for the parent event as well + if (viewOwner_ != nullptr) { + viewOwner_->wait(gpu, waitOnBusyEngine); + } +} + +bool +Resource::hostWrite( + VirtualGPU* gpu, + const void* hostPtr, + const amd::Coord3D& origin, + const amd::Coord3D& size, + uint flags, + size_t rowPitch, + size_t slicePitch) +{ + void* dst; + + size_t startLayer = origin[2]; + size_t numLayers = size[2]; + if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + startLayer = origin[1]; + numLayers = size[1]; + } + + // Get physical GPU memmory + dst = map(gpu, flags, startLayer, numLayers); + if (nullptr == dst) { + LogError("Couldn't map GPU memory for host write"); + return false; + } + + if (1 == desc().dimSize_) { + size_t copySize = (desc().buffer_) ? 
size[0] : size[0] * elementSize_; + + // Update the pointer + dst = static_cast(static_cast(dst) + origin[0]); + + // Copy memory + amd::Os::fastMemcpy(dst, hostPtr, copySize); + } + else { + size_t srcOffs = 0; + size_t dstOffsBase = origin[0] * elementSize_; + size_t dstOffs; + + // Make sure we use the right pitch if it's not specified + if (rowPitch == 0) { + rowPitch = size[0] * elementSize_; + } + + // Make sure we use the right slice if it's not specified + if (slicePitch == 0) { + slicePitch = size[0] * size[1] * elementSize_; + } + + // Adjust the destination offset with Y dimension + dstOffsBase += desc().pitch_ * origin[1] * elementSize_; + + // Adjust the destination offset with Z dimension + dstOffsBase += desc().slice_ * origin[2] * elementSize_; + + // Copy memory slice by slice + for (size_t slice = 0; slice < size[2]; ++slice) { + dstOffs = dstOffsBase + slice * desc().slice_ * elementSize_; + srcOffs = slice * slicePitch; + + // Copy memory line by line + for (size_t row = 0; row < size[1]; ++row) { + // Copy memory + amd::Os::fastMemcpy( + (reinterpret_cast
(dst) + dstOffs), + (reinterpret_cast(hostPtr) + srcOffs), + size[0] * elementSize_); + + dstOffs += desc().pitch_ * elementSize_; + srcOffs += rowPitch; + } + } + } + + // Unmap GPU memory + unmap(gpu); + + return true; +} + +bool +Resource::hostRead( + VirtualGPU* gpu, + void* hostPtr, + const amd::Coord3D& origin, + const amd::Coord3D& size, + size_t rowPitch, + size_t slicePitch) +{ + void* src; + + size_t startLayer = origin[2]; + size_t numLayers = size[2]; + if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) { + startLayer = origin[1]; + numLayers = size[1]; + } + + // Get physical GPU memmory + src = map(gpu, ReadOnly, startLayer, numLayers); + if (nullptr == src) { + LogError("Couldn't map GPU memory for host read"); + return false; + } + + if (1 == desc().dimSize_) { + size_t copySize = (desc().buffer_) ? size[0] : size[0] * elementSize_; + + // Update the pointer + src = static_cast(static_cast(src) + origin[0]); + + // Copy memory + amd::Os::fastMemcpy(hostPtr, src, copySize); + } + else { + size_t srcOffsBase = origin[0] * elementSize_; + size_t srcOffs; + size_t dstOffs = 0; + + // Make sure we use the right pitch if it's not specified + if (rowPitch == 0) { + rowPitch = size[0] * elementSize_; + } + + // Make sure we use the right slice if it's not specified + if (slicePitch == 0) { + slicePitch = size[0] * size[1] * elementSize_; + } + + // Adjust destination offset with Y dimension + srcOffsBase += desc().pitch_ * origin[1] * elementSize_; + + // Adjust the destination offset with Z dimension + srcOffsBase += desc().slice_ * origin[2] * elementSize_; + + // Copy memory line by line + for (size_t slice = 0; slice < size[2]; ++slice) { + srcOffs = srcOffsBase + slice * desc().slice_ * elementSize_; + dstOffs = slice * slicePitch; + + // Copy memory line by line + for (size_t row = 0; row < size[1]; ++row) { + // Copy memory + amd::Os::fastMemcpy( + (reinterpret_cast
(hostPtr) + dstOffs), + (reinterpret_cast(src) + srcOffs), + size[0] * elementSize_); + + srcOffs += desc().pitch_ * elementSize_; + dstOffs += rowPitch; + } + } + } + + // Unmap GPU memory + unmap(gpu); + + return true; +} + +void* +Resource::gpuMemoryMap(size_t* pitch, uint flags, Pal::IGpuMemory* resource) const +{ + if (desc_.cardMemory_ && !isPersistentDirectMap()) { + // @todo remove const cast + Unimplemented(); + return nullptr; +// return const_cast(dev()).resMapLocal(*pitch, resource, flags); + } + else { + amd::ScopedLock lk(dev().lockPAL()); + void* address; + *pitch = desc().width_ * elementSize(); + if (Pal::Result::Success == resource->Map(&address)) { + return address; + } + else { + LogError("PAL GpuMemory->Map() failed!"); + return nullptr; + } + } +} + +void +Resource::gpuMemoryUnmap(Pal::IGpuMemory* resource) const +{ + if (desc_.cardMemory_) { + // @todo remove const cast + Unimplemented(); +// const_cast(dev()).resUnmapLocal(resource); + } + else { + Pal::Result result = resource->Unmap(); + if (Pal::Result::Success != result) { + LogError("PAL GpuMemory->Unmap() failed!"); + } + } +} + +bool +Resource::gslGLAcquire() +{ + bool retVal = true; + if (desc().type_ == OGLInterop) { + retVal = dev().resGLAcquire(glPlatformContext_, glInteropMbRes_, glType_); + } + return retVal; +} + +bool +Resource::gslGLRelease() +{ + bool retVal = true; + if (desc().type_ == OGLInterop) { + retVal = dev().resGLRelease(glPlatformContext_,glInteropMbRes_, glType_); + } + return retVal; +} +void +Resource::gslFree() const +{ + amd::ScopedLock lk(dev().lockPAL()); + + if (desc().type_ == OGLInterop) { + dev().resGLFree(glPlatformContext_, glInteropMbRes_, glType_); + } + memRef_->release(); +} + +bool +Resource::isMemoryType(MemoryType memType) const +{ + if (memoryType() == memType) { + return true; + } + else if (memoryType() == View) { + return viewOwner_->isMemoryType(memType); + } + + return false; +} + +bool +Resource::isPersistentDirectMap() const +{ + bool 
directMap = ((memoryType() == Resource::Persistent) && + (desc().dimSize_ < 3) && !desc().imageArray_); + + // If direct map is possible, then validate it with the current tiling + if (directMap && desc().tiled_) { + //!@note IOL for Linux doesn't support tiling aperture + // and runtime doesn't force linear images in persistent + directMap = IS_WINDOWS && !dev().settings().linearPersistentImage_; + } + + return directMap; +} + +void* +Resource::map(VirtualGPU* gpu, uint flags, uint startLayer, uint numLayers) +{ + if (isMemoryType(Pinned)) { + // Check if we have to wait + if (!(flags & NoWait)) { + if (gpu != nullptr) { + wait(*gpu); + } + } + return address_; + } + + if (flags & ReadOnly) { + assert(!(flags & Discard) && "We can't use lock discard with read only!"); + } + + if (flags & WriteOnly) { + } + + // Check if use map discard + if (flags & Discard) { + if (gpu != nullptr) { + // If we use a new renamed allocation, then skip the wait + if (rename(*gpu)) { + flags |= NoWait; + } + } + } + + // Check if we have to wait + if (!(flags & NoWait)) { + if (gpu != nullptr) { + wait(*gpu); + } + } + + // Check if memory wasn't mapped yet + if (++mapCount_ == 1) { + if ((desc().dimSize_ == 3) || desc().imageArray_ || + ((desc().type_ == ImageView) && viewOwner_->mipMapped())) { + // Save map info for multilayer map/unmap + startLayer_ = startLayer; + numLayers_ = numLayers; + mapFlags_ = flags; + // Map with layers + address_ = mapLayers(gpu, flags); + } + else { + // Map current resource + address_ = gpuMemoryMap(&desc_.pitch_, flags, iMem()); + if (address_ == nullptr) { + LogError("cal::ResMap failed!"); + --mapCount_; + return nullptr; + } + } + } + + //! 
\note the atomic operation with counter doesn't + // guarantee that the address will be valid, + // since GSL could still process the first map + if (address_ == nullptr) { + amd::Os::sleep(10); + assert((address_ != nullptr) && "Multiple maps failed!"); + } + + return address_; +} + +void* +Resource::mapLayers(VirtualGPU* gpu, uint flags) +{ + size_t srcOffs = 0; + size_t dstOffs = 0; + Pal::IGpuMemory* sliceResource = 0; + PalGpuMemoryType palDim = PAL_TEXTURE_2D; + size_t layers = desc().depth_; + size_t height = desc().height_; + + // Use 1D layers + if (CL_MEM_OBJECT_IMAGE1D_ARRAY == desc().topology_) { + palDim = PAL_TEXTURE_1D; + height = 1; + layers = desc().height_; + } + + desc_.pitch_ = desc().width_; + desc_.slice_ = desc().pitch_ * height; + address_ = new char [desc().slice_ * layers * elementSize()]; + if (nullptr == address_) { + return nullptr; + } + + // Check if map is write only + if (flags & WriteOnly) { + return address_; + } + + if (numLayers_ != 0) { + layers = startLayer_ + numLayers_; + } + + dstOffs = startLayer_ * desc().slice_ * elementSize(); + + // Loop through all layers + for (uint i = startLayer_; i < layers; ++i) { + // gslResource3D gslSize; + size_t calOffset; + void* sliceAddr; + size_t pitch; + Unimplemented(); + // Allocate a layer from the image + // gslSize.width = desc().width_; + //gslSize.height = height; + //gslSize.depth = 1; + calOffset = 0; +/* + sliceResource = dev().resAllocView( + iMem(), gslSize, + calOffset, desc().format_, desc().channelOrder_, palDim, + 0, i, CAL_RESALLOCSLICEVIEW_LEVEL_AND_LAYER); + if (0 == sliceResource) { + LogError("Map layer. resAllocSliceView failed!"); + return nullptr; + } +*/ + // Map 2D layer + sliceAddr = gpuMemoryMap(&pitch, ReadOnly, sliceResource); + if (sliceAddr == nullptr) { + LogError("Map layer. 
CalResMap failed!"); + return nullptr; + } + + srcOffs = 0; + // Copy memory line by line + for (size_t rows = 0; rows < height; ++rows) { + // Copy memory + amd::Os::fastMemcpy( + (reinterpret_cast
(address_) + dstOffs), + (reinterpret_cast(sliceAddr) + srcOffs), + desc().width_ * elementSize_); + + dstOffs += desc().pitch_ * elementSize(); + srcOffs += pitch * elementSize(); + } + + // Unmap a layer + gpuMemoryUnmap(sliceResource); + //dev().resFree(sliceResource); + } + + return address_; +} + +void +Resource::unmap(VirtualGPU* gpu) +{ + if (isMemoryType(Pinned)) { + return; + } + + // Decrement map counter + int count = --mapCount_; + + // Check if it's the last unmap + if (count == 0) { + if ((desc().dimSize_ == 3) || desc().imageArray_ || + ((desc().type_ == ImageView) && viewOwner_->mipMapped())) { + // Unmap layers + unmapLayers(gpu); + } + else { + // Unmap current resource + gpuMemoryUnmap(iMem()); + } + address_ = nullptr; + } + else if (count < 0) { + LogError("dev().serialCalResUnmap failed!"); + ++mapCount_; + return; + } +} + +void +Resource::unmapLayers(VirtualGPU* gpu) +{ + size_t srcOffs = 0; + size_t dstOffs = 0; + PalGpuMemoryType palDim = PAL_TEXTURE_2D; + Pal::IGpuMemory* sliceResource = nullptr; + uint layers = desc().depth_; + uint height = desc().height_; + + // Use 1D layers + if (CL_MEM_OBJECT_IMAGE1D_ARRAY == desc().topology_) { + palDim = PAL_TEXTURE_1D; + height = 1; + layers = desc().height_; + } + + if (numLayers_ != 0) { + layers = startLayer_ + numLayers_; + } + + srcOffs = startLayer_ * desc().slice_ * elementSize(); + + // Check if map is write only + if (!(mapFlags_ & ReadOnly)) { + // Loop through all layers + for (uint i = startLayer_; i < layers; ++i) { + Unimplemented(); +// gslResource3D gslSize; + size_t calOffset; + void* sliceAddr; + size_t pitch; + + // Allocate a layer from the image + //gslSize.width = desc().width_; + //gslSize.height = height; + //gslSize.depth = 1; + calOffset = 0; + /*sliceResource = dev().resAllocView( + iMem(), gslSize, + calOffset, desc().format_, desc().channelOrder_, palDim, + 0, i, CAL_RESALLOCSLICEVIEW_LEVEL_AND_LAYER); + if (0 == sliceResource) { + LogError("Unmap layer. 
resAllocSliceView failed!"); + return; + } +*/ + // Map a layer + sliceAddr = gpuMemoryMap(&pitch, WriteOnly, sliceResource); + if (sliceAddr == nullptr) { + LogError("Unmap layer. CalResMap failed!"); + return; + } + + dstOffs = 0; + // Copy memory line by line + for (size_t rows = 0; rows < height; ++rows) { + // Copy memory + amd::Os::fastMemcpy( + (reinterpret_cast
(sliceAddr) + dstOffs), + (reinterpret_cast(address_) + srcOffs), + desc().width_ * elementSize_); + + dstOffs += pitch * elementSize(); + srcOffs += desc().pitch_ * elementSize(); + } + + // Unmap a layer + gpuMemoryUnmap(sliceResource); + //dev().resFree(sliceResource); + } + } + + // Destroy the mapped memory + delete [] reinterpret_cast(address_); +} + +void +Resource::setActiveRename(VirtualGPU& gpu, GpuMemoryReference* rename) +{ + // Copy the unique GSL data + memRef_ = rename; + address_ = rename->cpuAddress_; +} + +bool +Resource::getActiveRename(VirtualGPU& gpu, GpuMemoryReference** rename) +{ + // Copy the old data to the rename descriptor + *rename = memRef_; + return true; +} + +bool +Resource::rename(VirtualGPU& gpu, bool force) +{ + GpuEvent* gpuEvent = gpu.getGpuEvent(iMem()); + if (!gpuEvent->isValid() && !force) { + return true; + } + + bool useNext = false; + uint resSize = desc().width_ * ((desc().height_) ? desc().height_ : 1) * + elementSize_; + + // Rename will work with real GSL resources + if (((memoryType() != Local) && + (memoryType() != Persistent) && + (memoryType() != Remote) && + (memoryType() != RemoteUSWC)) || + (dev().settings().maxRenames_ == 0)) { + return false; + } + + // If the resource for renaming is too big, then lets check the current status first + // at the cost of an extra flush + if (resSize >= (dev().settings().maxRenameSize_ / dev().settings().maxRenames_)) { + if (gpu.isDone(gpuEvent)) { + return true; + } + } + + // Save the first + if (renames_.size() == 0) { + GpuMemoryReference* rename; + if (mapCount_ > 0) { + memRef_->cpuAddress_ = address_; + } + if (!getActiveRename(gpu, &rename)) { + return false; + } + + curRename_ = renames_.size(); + renames_.push_back(rename); + } + + // Can we use a new rename? 
+ if ((renames_.size() <= dev().settings().maxRenames_) && + ((renames_.size() * resSize) <= dev().settings().maxRenameSize_)) { + GpuMemoryReference* rename; + + // Create a new GSL allocation + if (create(memoryType())) { + if (mapCount_ > 0) { + assert(!desc().cardMemory_ && "Unsupported memory type!"); + memRef_->cpuAddress_ = gpuMemoryMap(&desc_.pitch_, 0, iMem()); + if (memRef_->cpuAddress_ == nullptr) { + LogError("gslMap fails on rename!"); + } + address_ = memRef_->cpuAddress_; + } + if (getActiveRename(gpu, &rename)) { + curRename_ = renames_.size(); + renames_.push_back(rename); + } + else { + memRef_->release(); + useNext = true; + } + } + else { + useNext = true; + } + } + else { + useNext = true; + } + + if (useNext) { + // Get the last submitted + curRename_++; + if (curRename_ >= renames_.size()) { + curRename_ = 0; + } + setActiveRename(gpu, renames_[curRename_]); + return false; + } + + return true; +} + +void +Resource::warmUpRenames(VirtualGPU& gpu) +{ + for (uint i = 0; i < dev().settings().maxRenames_; ++i) { + uint dummy = 0; + const bool NoWait = false; + // Write 0 for the buffer paging by VidMM + writeRawData(gpu, sizeof(dummy), &dummy, NoWait); + const bool Force = true; + rename(gpu, Force); + } +} + +ResourceCache::~ResourceCache() +{ + free(); +} + +//! \note the cache works in FILO mode +bool +ResourceCache::addGpuMemory( + Resource::Descriptor* desc, GpuMemoryReference* ref) +{ + amd::ScopedLock l(&lockCacheOps_); + bool result = false; + size_t size = ref->iMem()->Desc().size; + + // Make sure current allocation isn't bigger than cache + if (((desc->type_ == Resource::Local) || + (desc->type_ == Resource::Persistent) || + (desc->type_ == Resource::Remote) || + (desc->type_ == Resource::RemoteUSWC)) && + (size < cacheSizeLimit_) && + !desc->SVMRes_) { + // Validate the cache size limit. 
Loop until we have enough space + while ((cacheSize_ + size) > cacheSizeLimit_) { + removeLast(); + } + Resource::Descriptor* descCached = new Resource::Descriptor; + if (descCached != nullptr) { + // Copy the original desc to the cached version + memcpy(descCached, desc, sizeof(Resource::Descriptor)); + + // Add the current resource to the cache + resCache_.push_front(std::make_pair(descCached, ref)); + cacheSize_ += size; + result = true; + } + } + + return result; +} + +GpuMemoryReference* +ResourceCache::findGpuMemory( + Resource::Descriptor* desc, Pal::gpusize size, Pal::gpusize alignment) +{ + amd::ScopedLock l(&lockCacheOps_); + GpuMemoryReference* ref = nullptr; + + // Early exit if resource is too big + if (size >= cacheSizeLimit_ || desc->SVMRes_) { + //! \note we may need to free the cache here to reduce memory pressure + return ref; + } + + // Serach the right resource through the cache list + for (const auto& it: resCache_) { + Resource::Descriptor* entry = it.first; + size_t sizeRes = it.second->iMem()->Desc().size; + // Find if we can reuse this entry + if ((entry->type_ == desc->type_) && + (entry->flags_ == desc->flags_) && + (size <= sizeRes) && + (size > (sizeRes >> 2)) && + ((it.second->iMem()->Desc().gpuVirtAddr % alignment) == 0) && + (entry->isAllocExecute_ == desc->isAllocExecute_)) { + ref = it.second; + delete it.first; + // Remove the found etry from the cache + resCache_.remove(it); + cacheSize_ -= sizeRes; + break; + } + } + + return ref; +} + +bool +ResourceCache::free(size_t minCacheEntries) +{ + amd::ScopedLock l(&lockCacheOps_); + bool result = false; + + if (minCacheEntries < resCache_.size()) { + if (static_cast(cacheSize_) > 0) { + result = true; + } + // Clear the cache + while (static_cast(cacheSize_) > 0) { + removeLast(); + } + CondLog((cacheSize_ != 0), "Incorrect size for cache release!"); + } + return result; +} + +void +ResourceCache::removeLast() +{ + std::pair entry; + entry = resCache_.back(); + resCache_.pop_back(); + 
+ size_t size = entry.second->iMem()->Desc().size; + + // Delete Descriptor + delete entry.first; + + // Destroy GSL resource + entry.second->release(); + cacheSize_ -= size; +} + +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.hpp b/projects/clr/rocclr/runtime/device/pal/palresource.hpp new file mode 100644 index 0000000000..8ac50780a6 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palresource.hpp @@ -0,0 +1,508 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. +// +#ifndef PALRESOURCE_HPP_ +#define PALRESOURCE_HPP_ + +#include "platform/command.hpp" +#include "platform/program.hpp" +#include "device/pal/paldefs.hpp" + +//! \namespace pal PAL Resource Implementation +namespace pal { + +class Device; +class VirtualGPU; + +/*! \addtogroup PAL PAL Resource Implementation + * @{ + */ + +class GpuMemoryReference : public amd::ReferenceCountedObject +{ +public: + static GpuMemoryReference* Create( + const Device& dev, + const Pal::GpuMemoryCreateInfo& createInfo); + + static GpuMemoryReference* Create( + const Device& dev, + const void* sysMem, + size_t memSize); + + static GpuMemoryReference* Create( + const Device& dev, + const Pal::ExternalResourceOpenInfo& openInfo); + + static GpuMemoryReference* Create( + const Device& dev, + const Pal::ExternalImageOpenInfo& openInfo, + Pal::ImageCreateInfo* imgCreateInfo, + Pal::IImage** image); + + //! Default constructor + GpuMemoryReference(); + + //! Get PAL memory object + Pal::IGpuMemory* iMem() const { return gpuMem_; } + + Pal::IGpuMemory* gpuMem_; //!< PAL GPU memory object + void* cpuAddress_; //!< CPU address of this memory + +protected: + //! Default destructor + ~GpuMemoryReference(); + +private: + //! Disable copy constructor + GpuMemoryReference(const GpuMemoryReference&); + + //! Disable operator= + GpuMemoryReference& operator=(const GpuMemoryReference&); +}; + +//! 
GPU resource +class Resource : public amd::HeapObject +{ +public: + enum InteropType { + InteropTypeless = 0, + InteropVertexBuffer, + InteropIndexBuffer, + InteropRenderBuffer, + InteropTexture, + InteropTextureViewLevel, + InteropTextureViewCube, + InteropSurface + }; + + struct CreateParams : public amd::StackObject { + amd::Memory* owner_; //!< Resource's owner + VirtualGPU* gpu_; //!< Resource won't be shared between multiple queues + CreateParams(): owner_(NULL), gpu_(NULL) {} + }; + + struct PinnedParams : public CreateParams { + const amd::HostMemoryReference* hostMemRef_;//!< System memory pointer for pinning + size_t size_; //!< System memory size + }; + + struct ViewParams : public CreateParams { + size_t offset_; //!< Alias resource offset + size_t size_; //!< Alias resource size + const Resource* resource_; //!< Parent resource for the view creation + const void* memory_; + }; + + struct ImageViewParams : public CreateParams { + size_t level_; //!< Image mip level for a new view + size_t layer_; //!< Image layer for a new view + const Resource* resource_; //!< Parent resource for the view creation + const void* memory_; + }; + + struct ImageBufferParams : public CreateParams { + const Resource* resource_; //!< Parent resource for the image creation + const void* memory_; + }; + + struct OGLInteropParams : public CreateParams { + InteropType type_; //!< OGL resource type + uint handle_; //!< OGL resource handle + uint mipLevel_; //!< Texture mip level + uint layer_; //!< Texture layer + void* glPlatformContext_; + void* glDeviceContext_; + uint flags_; + }; + +#ifdef _WIN32 + struct D3DInteropParams : public CreateParams { + InteropType type_; //!< D3D resource type + void* iDirect3D_; //!< D3D resource interface object + void* handle_; //!< D3D resource handle + uint mipLevel_; //!< Texture mip level + int layer_; //!< Texture layer + uint misc; //!< miscellaneous cases + }; +#endif // _WIN32 + + //! 
Resource memory + enum MemoryType + { + Empty = 0x0, //!< resource is empty + Local, //!< resource in local memory + Persistent, //!< resource in persistent memory + Remote, //!< resource in nonlocal memory + RemoteUSWC, //!< resource in nonlocal memory + Pinned, //!< resource in pinned system memory + View, //!< resource is an alias + OGLInterop, //!< resource is an OGL memory object + D3D10Interop, //!< resource is a D3D10 memory object + D3D11Interop, //!< resource is a D3D11 memory object + ImageView, //!< resource is a view to some image + ImageBuffer, //!< resource is an image view of a buffer + BusAddressable, //!< resource is a bus addressable memory + ExternalPhysical, //!< resource is an external physical memory + D3D9Interop, //!< resource is a D3D9 memory object + Scratch, //!< resource is scratch memory + Shader, //!< resource is a shader + }; + + //! Resource map flags + enum MapFlags + { + Discard = 0x00000001, //!< discard lock + NoOverwrite = 0x00000002, //!< lock with no overwrite + ReadOnly = 0x00000004, //!< lock for read only operation + WriteOnly = 0x00000008, //!< lock for write only operation + NoWait = 0x00000010, //!< lock with no wait + }; + + //! 
Resource descriptor + struct Descriptor : public amd::HeapObject + { + MemoryType type_; //!< Memory type + size_t width_; //!< Resource width + size_t height_; //!< Resource height + size_t depth_; //!< Resource depth + uint mipLevels_; //!< Number of mip levels + uint flags_; //!< Resource flags, used in creation + size_t pitch_; //!< Resource pitch, valid if locked + size_t slice_; //!< Resource slice, valid if locked + cl_image_format format_; //!< CL image format + cl_mem_object_type topology_;//!< CL mem object type + union { + struct { + uint dimSize_ : 2; //!< Dimension size + uint cardMemory_ : 1; //!< GSL resource is in video memory + uint imageArray_ : 1; //!< GSL resource is an array of images + uint buffer_ : 1; //!< GSL resource is a buffer + uint tiled_ : 1; //!< GSL resource is tiled + uint SVMRes_ : 1; //!< SVM flag to the cal resource + uint scratch_ : 1; //!< Scratch buffer + uint isAllocExecute_ : 1; //!< SVM resource allocation attribute for shader\cmdbuf + }; + uint state_; + }; + }; + + //! Constructor of 1D Resource object + Resource( + const Device& gpuDev, //!< GPU device object + size_t size //!< Resource size + ); + + //! Constructor of Image Resource object + Resource( + const Device& gpuDev, //!< GPU device object + size_t width, //!< resource width + size_t height, //!< resource height + size_t depth, //!< resource depth + cl_image_format format, //!< resource format + cl_mem_object_type imageType, //!< CL image type + uint mipLevels = 1 //!< Number of mip levels + ); + + //! Destructor of the resource + virtual ~Resource(); + + /*! \brief Creates a CAL object, associated with the resource + * + * \return True if we succesfully created a CAL resource + */ + virtual bool create( + MemoryType memType, //!< memory type + CreateParams* params = 0 //!< special parameters for resource allocation + ); + + /*! 
\brief Copies a subregion of memory from one resource to another + * + * This is a general copy from anything to anything (as long as it fits). + * All positions and sizes are given in bytes. Note, however, that only + * a subset of this general interface is currently implemented. + * + * \return true if successful + */ + bool partialMemCopyTo( + VirtualGPU& gpu, //!< Virtual GPU device object + const amd::Coord3D& srcOrigin, //!< Origin of the source region + const amd::Coord3D& dstOrigin, //!< Origin of the destination region + const amd::Coord3D& size, //!< Size of the region to copy + Resource& dstResource, //!< Destination resource + bool enableRectCopy = false, //!< Rectangular DMA support + bool flushDMA = false, //!< Flush DMA if requested + uint bytesPerElement = 1 //!< Bytes Per Element + ) const; + + /*! \brief Copies size/4 DWORD of memory to a surface + * + * This is a raw copy to any surface using a CP packet. + * Size needs to be at least a DWORD or a multiple of DWORDs + * + */ + void writeRawData( + VirtualGPU& gpu, //!< Virtual GPU device object + size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS) + const void* data, //!< Data to be copied + bool waitForEvent //!< Wait for event complete + ) const; + + //! Returns the offset in GPU memory for aliases + size_t offset() const { return offset_; } + + //! Returns the pinned memory offset + uint64_t pinOffset() const { return pinOffset_; } + + //! Returns the GPU device that owns this resource + const Device& dev() const { return gpuDevice_; } + + //! Returns the descriptor for resource + const Descriptor& desc() const { return desc_; } + + //! Returns the PAL memory object + Pal::IGpuMemory* iMem() const { return memRef_->iMem(); } + + //! Returns the GPU virtual address of the resource (memory object base address plus resource offset) + uint64_t vmAddress() const { return iMem()->Desc().gpuVirtAddr + offset_; } + + //! Returns the size of the memory object that remains past the resource offset + uint64_t vmSize() const { return iMem()->Desc().size - offset_; } + + //! 
Returns true if the resource has more than one mip level + bool mipMapped() const { return (desc().mipLevels_ > 1) ? true : false; } + + //! Checks if persistent memory can have a direct map + bool isPersistentDirectMap() const; + + /*! \brief Locks the resource and returns a physical pointer + * + * \note This operation stalls HW pipeline! + * + * \return Pointer to the physical memory + */ + void* map( + VirtualGPU* gpu, //!< Virtual GPU device object + uint flags = 0, //!< flags for the map operation + // Optimization for multilayer map/unmap + uint startLayer = 0, //!< Start layer for multilayer map + uint numLayers = 0 //!< Number of layers for multilayer map (TODO confirm: a count, not an end index) + ); + + //! Unlocks the resource if it was locked + void unmap( + VirtualGPU* gpu //!< Virtual GPU device object + ); + + //! Marks the resource as busy + void setBusy( + VirtualGPU& gpu, //!< Virtual GPU device object + GpuEvent calEvent //!< CAL event + ) const; + + //! Wait for the resource + void wait( + VirtualGPU& gpu, //!< Virtual GPU device object + bool waitOnBusyEngine = false//!< Wait only if engine has changed + ) const; + + //! Performs host write to the resource GPU memory + bool hostWrite( + VirtualGPU* gpu, //!< Virtual GPU device object + const void* hostPtr, //!< Host pointer to the SRC data + const amd::Coord3D& origin, //!< Offsets for the update + const amd::Coord3D& size, //!< The number of bytes to write + uint flags = 0, //!< Map flags + size_t rowPitch = 0, //!< Raw data row pitch + size_t slicePitch = 0 //!< Raw data slice pitch + ); + + //! Performs host read from the resource GPU memory + bool hostRead( + VirtualGPU* gpu, //!< Virtual GPU device object + void* hostPtr, //!< Host pointer to the DST data + const amd::Coord3D& origin, //!< Offsets for the read + const amd::Coord3D& size, //!< The number of bytes to read + size_t rowPitch = 0, //!< Raw data row pitch + size_t slicePitch = 0 //!< Raw data slice pitch + ); + + //! 
Warms up the rename list for this resource + void warmUpRenames(VirtualGPU& gpu); + + //! Gets the resource element size + uint elementSize() const { return elementSize_; } + + //! Get the mapped address of this resource + address data() const { return reinterpret_cast
(address_); } + + //! Frees all allocated CAL memories and resources, + //! associated with this objects. And also destroys all rename structures + //! Note: doesn't destroy the object itself + void free(); + + //! Return memory type + MemoryType memoryType() const { return desc().type_; } + + //! Retunrs true if memory type matches specified + bool isMemoryType(MemoryType memType) const; + + //! Returns TRUE if resource was allocated as cacheable + bool isCacheable() const + { return (isMemoryType(Remote) || isMemoryType(Pinned)) ? true : false; } + + bool gslGLAcquire() ; + bool gslGLRelease() ; + + //! Returns HW state for the resource (used for images only) + const void* hwState() const { return hwState_; } + + //! Returns CPU HW SRD for the resource (used for images only) + uint64_t hwSrd() const { return hwSrd_; } + + uint numComponents() const { + return Pal::Formats::NumComponents(image_->GetImageCreateInfo().format.chFmt); } + +protected: + uint elementSize_; //!< Size of a single element in bytes + +private: + //! Disable copy constructor + Resource(const Resource&); + + //! Disable operator= + Resource& operator=(const Resource&); + + typedef std::vector RenameList; + + //! Rename current resource + bool rename( + VirtualGPU& gpu, //!< Virtual GPU device object + bool force = false //!< Force renaming + ); + + //! Sets the rename as active + void setActiveRename( + VirtualGPU& gpu, //!< Virtual GPU device object + GpuMemoryReference* rename //!< new active rename + ); + + //! Gets the active rename + bool getActiveRename( + VirtualGPU& gpu, //!< Virtual GPU device object + GpuMemoryReference** rename //!< Saved active rename + ); + + /*! \brief Locks the resource with layers and returns a physical pointer + * + * \return Pointer to the physical memory + */ + void* mapLayers( + VirtualGPU* gpu, //!< Virtual GPU device object + uint flags = 0 //!< flags for the map operation + ); + + //! 
Unlocks the resource with layers if it was locked + void unmapLayers( + VirtualGPU* gpu //!< Virtual GPU device object + ); + + //! Calls GSL to map a resource + void* gpuMemoryMap( + size_t* pitch, //!< Pitch value for the image + uint flags, //!< Map flags + Pal::IGpuMemory* resource //!< GSL memory object + ) const; + + //! Uses GSL to unmap a resource + void gpuMemoryUnmap( + Pal::IGpuMemory* resource //!< GSL memory object + ) const; + + //! Fress all GSL resources associated with OCL resource + void gslFree() const; + + //! Converts Resource memory type to the PAL heaps + void memTypeToHeap( + Pal::GpuMemoryCreateInfo* createInfo //!< Memory create info + ); + + const Device& gpuDevice_; //!< GPU device + Descriptor desc_; //!< Descriptor for this resource + amd::Atomic mapCount_; //!< Total number of maps + void* address_; //!< Physical address of this resource + size_t offset_; //!< Resource offset + size_t curRename_; //!< Current active rename in the list + RenameList renames_; //!< Rename resource list + GpuMemoryReference* memRef_; //!< GSL resource reference + const Resource* viewOwner_; //!< GPU resource, which owns this view + uint64_t pinOffset_; //!< Pinned memory offset + void* glInteropMbRes_;//!< Mb Res handle + uint32_t glType_; //!< GL interop type + void* glPlatformContext_; + void* glDeviceContext_; + + // Optimization for multilayer map/unmap + uint startLayer_; //!< Start layer for map/unmapLayer + uint numLayers_; //!< Number of layers for map/unmapLayer + uint mapFlags_; //!< Map flags for map/umapLayer + + //! @note: This field is necessary for the thread safe release only + VirtualGPU* gpu_; //!< Resource will be used only on this queue + Pal::IImage* image_; //!< PAL image object + + uint32_t* hwState_; //!< HW state for image object + uint64_t hwSrd_; //!< GPU pointer to HW SRD +}; + +class ResourceCache : public amd::HeapObject +{ +public: + //! 
Default constructor + ResourceCache(size_t cacheSizeLimit) + : lockCacheOps_("PAL resource cache", true) + , cacheSize_(0) + , cacheSizeLimit_(cacheSizeLimit) + {} + + //! Default destructor + ~ResourceCache(); + + //! Adds a CAL resource to the cache + bool addGpuMemory( + Resource::Descriptor* desc, //!< Resource descriptor - cache key + GpuMemoryReference* ref //!< Resource reference + ); + + //! Finds a CAL resource from the cache + GpuMemoryReference* findGpuMemory( + Resource::Descriptor* desc, //!< Resource descriptor - cache key + Pal::gpusize size, + Pal::gpusize alignment + ); + + //! Destroys cache + bool free(size_t minCacheEntries = 0); + +private: + //! Disable copy constructor + ResourceCache(const ResourceCache&); + + //! Disable operator= + ResourceCache& operator=(const ResourceCache&); + + //! Removes one last entry from the cache + void removeLast(); + + amd::Monitor lockCacheOps_; //!< Lock to serialise cache access + + size_t cacheSize_; //!< Current cache size in bytes + size_t cacheSizeLimit_; //!< Cache size limit in bytes + + //! CAL resource cache + std::list > resCache_; +}; + +/*@}*/} // namespace pal + +#endif /*PALRESOURCE_HPP_*/ diff --git a/projects/clr/rocclr/runtime/device/pal/palsched.hpp b/projects/clr/rocclr/runtime/device/pal/palsched.hpp new file mode 100644 index 0000000000..44038dd2c5 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palsched.hpp @@ -0,0 +1,78 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. +// +#ifndef PALSCHED_HPP_ +#define PALSCHED_HPP_ + +#include "hsa.h" + +namespace pal { + +//! AmdAqlWrap slot state +enum AqlWrapState { + AQL_WRAP_FREE = 0, + AQL_WRAP_RESERVED, + AQL_WRAP_READY, + AQL_WRAP_MARKER, + AQL_WRAP_BUSY, + AQL_WRAP_DONE +}; + +struct AmdVQueueHeader { + uint32_t aql_slot_num; //!< [LRO/SRO] The total number of the AQL slots (multiple of 64). 
+ uint32_t event_slot_num; //!< [LRO] The number of kernel events in the events buffer + uint64_t event_slot_mask; //!< [LRO] A pointer to the allocation bitmask array for the events + uint64_t event_slots; //!< [LRO] Pointer to a buffer for the events. + // Array of event_slot_num entries of AmdEvent + uint64_t aql_slot_mask; //!< [LRO/SRO] A pointer to the allocation bitmask for aql_wrap slots + uint32_t command_counter; //!< [LRW] The global counter for the submitted commands into the queue + uint32_t wait_size; //!< [LRO] The wait list size (in clk_event_t) + uint32_t arg_size; //!< [LRO] The size of argument buffer (in bytes) + uint32_t mask_groups; //!< Processed mask groups by one thread + uint64_t kernel_table; //!< [LRO] Pointer to an array with all kernel objects (ulong for each entry) + uint32_t reserved[2]; //!< For the future usage +}; + +struct AmdAqlWrap { + uint32_t state; //!< [LRW/SRW] The current state of the AQL wrapper: FREE, RESERVED, READY, + // MARKER, BUSY and DONE. The block could be returned back to a free state. + uint32_t enqueue_flags; //!< [LWO/SRO] Contains the flags for the kernel execution start + uint32_t command_id; //!< [LWO/SRO] The unique command ID + uint32_t child_counter; //!< [LRW/SRW] Counter that determines the launches of child kernels. + // It's incremented on the + // start and decremented on the finish. 
The parent kernel can be considered as + // done when the value is 0 and the state is DONE + uint64_t completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t) + uint64_t parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*) + uint64_t wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default) + uint32_t wait_num; //!< [LWO/SRO] The number of cl_event_wait objects + uint32_t reserved[5]; //!< For the future usage + hsa_kernel_dispatch_packet_t aql; //!< [LWO/SRO] AQL packet - 64 bytes AQL packet +}; + +struct AmdEvent { + uint32_t state; //!< [LRO/SRW] Event state: START, END, COMPLETE + uint32_t counter; //!< [LRW] Event retain/release counter. 0 means the event is free + uint64_t timer[3]; //!< [LRO/SWO] Timer values for profiling for each state + uint64_t captureInfo; //!< [LRW/SRO] Profiling capture info for CLK_PROFILING_COMMAND_EXEC_TIME +}; + +struct SchedulerParam { + uint32_t signal; //!< Signal to stop the child queue(address must be 16 bytes aligned) + uint32_t eng_clk; //!< Engine clock in MHz + uint64_t hw_queue; //!< Address to HW queue + uint64_t hsa_queue; //!< Address to HSA dummy queue + uint32_t useATC; //!< GPU access to shader program by ATC. 
+ uint32_t scratchSize; //!< Scratch buffer size + uint64_t scratch; //!< GPU address to the scratch buffer + uint32_t numMaxWaves; //!< The max number of possible waves + uint32_t releaseHostCP; //!< Releases CP on the host queue + uint64_t parentAQL; //!< Host parent AmdAqlWrap packet + uint32_t dedicatedQueue; //!< Scheduler uses a dedicated queue + uint32_t scratchOffset; //!< Scratch buffer offset + uint32_t reserved[2]; //!< Reserved +}; + +} // namespace pal + +#endif diff --git a/projects/clr/rocclr/runtime/device/pal/palschedcl.cpp b/projects/clr/rocclr/runtime/device/pal/palschedcl.cpp new file mode 100644 index 0000000000..25727bc8aa --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palschedcl.cpp @@ -0,0 +1,23 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. +// +namespace pal { + +#define SCHEDULER_KERNEL(...) #__VA_ARGS__ + +const char* SchedulerSourceCode = SCHEDULER_KERNEL( +\n +extern void __amd_scheduler(__global void *, __global void *, uint); +\n +__kernel void +scheduler( + __global void * queue, + __global void * params, + uint paramIdx) +{ + __amd_scheduler(queue, params, paramIdx); +} +\n +); + +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palsettings.cpp b/projects/clr/rocclr/runtime/device/pal/palsettings.cpp new file mode 100644 index 0000000000..66c68f7f18 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palsettings.cpp @@ -0,0 +1,433 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. +// +#include "top.hpp" +#include "os/os.hpp" +#include "device/device.hpp" +#include "device/pal/paldefs.hpp" +#include "device/pal/palsettings.hpp" + +#include + +namespace pal { + +/*! \brief information for adjusting maximum workload time + * + * This structure contains the time and OS minor version for max workload time + * adjustment for Windows 7 or 8. 
+ */ +struct ModifyMaxWorkload +{ + uint32_t time; //!< max workload time (in 0.1 ms units) + uint32_t minorVersion; //!< OS minor version +}; + + +//! Fills every setting with its device-independent default; Settings::create() refines them per ASIC +Settings::Settings() +{ + // Initialize the GPU device default settings + oclVersion_ = OpenCL12; + debugFlags_ = 0; + + // These bit-fields get no other default in this constructor; clear them + // explicitly so they never hold indeterminate values + force32BitOcl20_ = false; + svmFineGrainSystem_ = false; + threadTraceEnable_ = false; + + singleHeap_ = false; + syncObject_ = GPU_USE_SYNC_OBJECTS; + remoteAlloc_ = REMOTE_ALLOC; + + stagedXferRead_ = true; + stagedXferWrite_ = true; + stagedXferSize_ = GPU_STAGING_BUFFER_SIZE * Ki; + + // We will enable staged read/write if we use local memory + disablePersistent_ = false; + + // By Default persistent writes will be disabled. 
+ stagingWritePersistent_ = GPU_STAGING_WRITE_PERSISTENT; + + maxRenames_ = 4; + maxRenameSize_ = 4 * Mi; + + imageSupport_ = false; + hwLDSSize_ = 0; + + // Set this to true when we drop the flag + doublePrecision_ = ::CL_KHR_FP64; + + // Fill workgroup info size + // @todo: revisit the 256 limitation on workgroup size + maxWorkGroupSize_ = 256; + + hostMemDirectAccess_ = HostMemDisable; + + libSelector_ = amd::LibraryUndefined; + + // Enable workload split by default (for 24 bit arithmetic or timeout) + workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT; + + // By default use host blit + blitEngine_ = BlitEngineHost; + const static size_t MaxPinnedXferSize = 32; + pinnedXferSize_ = std::min(GPU_PINNED_XFER_SIZE, MaxPinnedXferSize) * Mi; + pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_); + + // Disable FP_FAST_FMA defines by default + reportFMAF_ = false; + reportFMA_ = false; + + // GPU device by default + apuSystem_ = false; + + // Disable 64 bit pointers support by default + use64BitPtr_ = false; + + // Max alloc size is 16GB + maxAllocSize_ = 16 * static_cast(Gi); + + // Disable memory dependency tracking by default + numMemDependencies_ = 0; + + // By default cache isn't present + cacheLineSize_ = 0; + cacheSize_ = 0; + + // Initialize transfer buffer size to 1MB by default + xferBufSize_ = 1024 * Ki; + + // Use image DMA if requested + imageDMA_ 
= GPU_IMAGE_DMA; + + // Disable ASIC specific features by default + ciPlus_ = false; + viPlus_ = false; + aiPlus_ = false; + + // Number of compute rings. + numComputeRings_ = 0; + + minWorkloadTime_ = 1; // 0.1 ms + maxWorkloadTime_ = 5000; // 500 ms + + // Controls tiled images in persistent + //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS + linearPersistentImage_ = false; + + useSingleScratch_ = GPU_USE_SINGLE_SCRATCH; + + // Device enqueuing settings + numDeviceEvents_ = 1024; + numWaitEvents_ = 8; + + // Disable HSAIL by default + hsail_ = false; + + // Don't support platform atomics by default. + svmAtomics_ = false; + + // Use direct SRD by default + hsailDirectSRD_ = GPU_DIRECT_SRD; + + // Use host queue for device enqueuing by default + useDeviceQueue_ = GPU_USE_DEVICE_QUEUE; + + // Don't support Denormals for single precision by default + singleFpDenorm_ = false; +} + +bool +Settings::create( + const Pal::DeviceProperties& palProp, + const Pal::GpuMemoryHeapProperties* heaps, + bool reportAsOCL12Device +) +{ +// uint target = calAttr.target; + uint32_t osVer = 0x0; + + // Disable thread trace by default for all devices + threadTraceEnable_ = false; + bool doublePrecision = true; + + if (doublePrecision) { + // Report FP_FAST_FMA define if double precision HW + reportFMA_ = true; + // FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper + // Bonaire, Kalindi, Spectre and Spooky so disable + // FP_FMA_FMAF for those parts in switch below + reportFMAF_ = true; + } + + // Update GPU specific settings and info structure if we have any + ModifyMaxWorkload modifyMaxWorkload = {0}; + + switch (palProp.revision) { +/* case Pal::AsicRevision::: + case CAL_TARGET_GREENLAND: + //TODO: specific codes for AI + aiPlus_ = true;*/ + // Fall through to VI ... 
+ case Pal::AsicRevision::Carrizo: + case Pal::AsicRevision::Stoney: + if (!aiPlus_) { + // APU systems for VI + apuSystem_ = true; + } + case Pal::AsicRevision::Iceland: + case Pal::AsicRevision::Tonga: + case Pal::AsicRevision::Fiji: + case Pal::AsicRevision::Ellesmere: + case Pal::AsicRevision::Baffin: + // Disable tiling aperture on VI+ + linearPersistentImage_ = true; + // Keep this false even though we have support + // singleFpDenorm_ = true; + viPlus_ = true; + // Fall through to CI ... + case Pal::AsicRevision::Kalindi: + case Pal::AsicRevision::Spectre: + if (!viPlus_) { + // APU systems for CI + apuSystem_ = true; + // Fix BSOD/TDR issues observed on Kaveri Win7 (EPR#416903) + modifyMaxWorkload.time = 2500; // 250ms + modifyMaxWorkload.minorVersion = 1; // Win 7 + } + // Fall through ... + case Pal::AsicRevision::Bonaire: + case Pal::AsicRevision::Hawaii: + ciPlus_ = true; + hsail_ = true; + threadTraceEnable_ = AMD_THREAD_TRACE_ENABLE; + reportFMAF_ = false; + if (palProp.revision == Pal::AsicRevision::Hawaii) { + reportFMAF_ = true; + } + // Cache line size is 64 bytes + cacheLineSize_ = 64; + // L1 cache size is 16KB + cacheSize_ = 16 * Ki; + + if (ciPlus_) { + libSelector_ = amd::GPU_Library_CI; + if (LP64_SWITCH(WINDOWS_SWITCH(viPlus_, false), true)) { + oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ ? + XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12; + } + if (GPU_FORCE_OCL20_32BIT) { + force32BitOcl20_ = true; + oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ ? + XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12; + } + if (OPENCL_VERSION < 200) { + oclVersion_ = OpenCL12; + } + numComputeRings_ = 8; + } + else { + numComputeRings_ = 2; + libSelector_ = amd::GPU_Library_SI; + } + + // This needs to be cleaned once 64bit addressing is stable + if (oclVersion_ < OpenCL20) { + use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) ? 
LP64_SWITCH(false, + /*calAttr.isWorkstation ||*/ hsail_) : GPU_FORCE_64BIT_PTR; + } + else { + if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_ + || (oclVersion_ >= OpenCL20)))) { + use64BitPtr_ = true; + } + } + + if (oclVersion_ >= OpenCL20) { + supportDepthsRGB_ = true; + } + if (use64BitPtr_) { + if (GPU_ENABLE_LARGE_ALLOCATION /*&& calAttr.isWorkstation*/) { + maxAllocSize_ = 64ULL * Gi; + } + else { + maxAllocSize_ = 4048 * Mi; + } + } + else { + maxAllocSize_ = 3ULL * Gi; + } + + supportRA_ = false; + partialDispatch_ = GPU_PARTIAL_DISPATCH; + numMemDependencies_ = GPU_NUM_MEM_DEPENDENCY; + break; + default: + assert(0 && "Unknown ASIC type!"); + return false; + } + + // Enable atomics support + enableExtension(ClKhrInt64BaseAtomics); + enableExtension(ClKhrInt64ExtendedAtomics); + enableExtension(ClKhrGlobalInt32BaseAtomics); + enableExtension(ClKhrGlobalInt32ExtendedAtomics); + enableExtension(ClKhrLocalInt32BaseAtomics); + enableExtension(ClKhrLocalInt32ExtendedAtomics); + enableExtension(ClKhrByteAddressableStore); + enableExtension(ClKhrGlSharing); + enableExtension(ClKhrGlEvent); + enableExtension(ClAmdMediaOps); + enableExtension(ClAmdMediaOps2); + enableExtension(ClAmdPopcnt); + enableExtension(ClKhr3DImageWrites); + enableExtension(ClAmdVec3); + enableExtension(ClAmdPrintf); + enableExtension(ClKhrImage2dFromBuffer); + + hwLDSSize_ = 32 * Ki; + + imageSupport_ = true; + singleHeap_ = true; + + // Use kernels for blit if appropriate + blitEngine_ = BlitEngineKernel; + + hostMemDirectAccess_ |= HostMemBuffer; + // HW doesn't support untiled image writes + // hostMemDirectAccess_ |= HostMemImage; + + // Make sure device actually supports double precision + doublePrecision_ = (doublePrecision) ? 
doublePrecision_ : false; + if (doublePrecision_) { + // Enable KHR double precision extension + enableExtension(ClKhrFp64); + } + + if (doublePrecision) { + // Enable AMD double precision extension + doublePrecision_ = true; + enableExtension(ClAmdFp64); + } + +//! @todo +/* + if (calAttr.totalSDIHeap > 0) { + //Enable bus addressable memory extension + enableExtension(ClAMDBusAddressableMemory); + } + + if (calAttr.longIdleDetect) { + // KMD is unable to detect if we map the visible memory for CPU access, so + // accessing persistent staged buffer may fail if LongIdleDetct is enabled. + disablePersistent_ = true; + } + + svmFineGrainSystem_ = calAttr.isSVMFineGrainSystem; + + svmAtomics_ = (calAttr.svmAtomics || calAttr.isSVMFineGrainSystem) ? true : false; +*/ + // Enable some platform extensions + enableExtension(ClAmdDeviceAttributeQuery); + + enableExtension(ClKhrSpir); + + // SVM is not currently supported for DX Interop +#if defined(_WIN32) + enableExtension(ClKhrD3d9Sharing); + enableExtension(ClKhrD3d10Sharing); + enableExtension(ClKhrD3d11Sharing); +#endif // _WIN32 + + // Enable some OpenCL 2.0 extensions + if (oclVersion_ >= OpenCL20) { + enableExtension(ClKhrGLDepthImages); + enableExtension(ClKhrSubGroups); + enableExtension(ClKhrDepthImages); + + if (GPU_MIPMAP) { + enableExtension(ClKhrMipMapImage); + enableExtension(ClKhrMipMapImageWrites); + } + + // Enable HW debug + if (GPU_ENABLE_HW_DEBUG) { + enableHwDebug_ = true; + } + } + + + if (apuSystem_ && + ((heaps[Pal::GpuHeapLocal].heapSize + heaps[Pal::GpuHeapInvisible].heapSize) < (150*Mi))) { + remoteAlloc_ = true; + } + + // Save resource cache size +#ifdef ATI_OS_LINUX + // Due to EPR#406216, set the default value for Linux for now + resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi; +#else + if (remoteAlloc_) { + resourceCacheSize_ = std::max((heaps[Pal::GpuHeapGartUswc].heapSize / 8), + GPU_RESOURCE_CACHE_SIZE * Mi); + } + else { + resourceCacheSize_ = 
std::max(((heaps[Pal::GpuHeapLocal].heapSize + + heaps[Pal::GpuHeapInvisible].heapSize) / 8), + GPU_RESOURCE_CACHE_SIZE * Mi); + } + resourceCacheSize_ = std::min(resourceCacheSize_, 512 * Mi); +#endif + + // Override current device settings + override(); + + return true; +} + +//! Applies the runtime flag overrides on top of the settings chosen by Settings::create(). +//! A value is replaced only when its flag differs from the flag's default +//! (or, for the workgroup size and blit engine, from the "unset" value). +void +Settings::override() +{ + // Limit reported workgroup size + if (GPU_MAX_WORKGROUP_SIZE != 0) { + maxWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE; + } + + // Override blit engine type + if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) { + blitEngine_ = GPU_BLIT_ENGINE_TYPE; + } + + // Override the debug flags + if (!flagIsDefault(DEBUG_GPU_FLAGS)) { + debugFlags_ = DEBUG_GPU_FLAGS; + } + + // Override the transfer buffer size (flag value is scaled by Ki) + if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) { + xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki; + } + + // Override the sync object usage + if (!flagIsDefault(GPU_USE_SYNC_OBJECTS)) { + syncObject_ = GPU_USE_SYNC_OBJECTS; + } + + // Override the number of compute rings + if (!flagIsDefault(GPU_NUM_COMPUTE_RINGS)) { + numComputeRings_ = GPU_NUM_COMPUTE_RINGS; + } + + // Override the resource cache size (flag value is scaled by Mi) + if (!flagIsDefault(GPU_RESOURCE_CACHE_SIZE)) { + resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi; + } + + // Force single precision denormals on (1) or off (0); any other value keeps the current setting + if (!flagIsDefault(AMD_GPU_FORCE_SINGLE_FP_DENORM)) { + switch (AMD_GPU_FORCE_SINGLE_FP_DENORM) { + case 0: + singleFpDenorm_ = false; + break; + case 1: + singleFpDenorm_ = true; + break; + default: + break; + } + } +} + +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palsettings.hpp b/projects/clr/rocclr/runtime/device/pal/palsettings.hpp new file mode 100644 index 0000000000..ab66cfb541 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palsettings.hpp @@ -0,0 +1,128 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. +// +#ifndef PALSETTINGS_HPP_ +#define PALSETTINGS_HPP_ + +#include "top.hpp" +#include "library.hpp" +#include "inc/core/palDevice.h" + +/*! \addtogroup pal PAL Resource Implementation + * @{ + */ + +//! PAL Device Implementation +namespace pal { + +//! 
Device settings +class Settings : public device::Settings +{ +public: + //! Debug GPU flags + enum DebugGpuFlags + { + CheckForILSource = 0x00000001, + StubCLPrograms = 0x00000002, //!< Enables OpenCL programs stubbing + LockGlobalMemory = 0x00000004, + }; + + enum BlitEngineType + { + BlitEngineDefault = 0x00000000, + BlitEngineHost = 0x00000001, + BlitEngineCAL = 0x00000002, + BlitEngineKernel = 0x00000003, + }; + + enum HostMemFlags + { + HostMemDisable = 0x00000000, + HostMemBuffer = 0x00000001, + HostMemImage = 0x00000002, + }; + + union { + struct { + uint singleHeap_: 1; //!< Device will use a preallocated heap + uint remoteAlloc_: 1; //!< Allocate remote memory for the heap + uint stagedXferRead_: 1; //!< Uses a staged buffer read + uint stagedXferWrite_: 1; //!< Uses a staged buffer write + uint disablePersistent_: 1; //!< Disables using persistent memory for staging + uint imageSupport_: 1; //!< Report images support + uint doublePrecision_: 1; //!< Enables double precision support + uint reportFMAF_: 1; //!< Report FP_FAST_FMAF define in CL program + uint reportFMA_: 1; //!< Report FP_FAST_FMA define in CL program + uint use64BitPtr_: 1; //!< Use 64bit pointers on GPU + uint force32BitOcl20_: 1; //!< Force 32bit apps to take CLANG/HSAIL path on GPU + uint imageDMA_: 1; //!< Enable direct image DMA transfers + uint syncObject_: 1; //!< Enable syncobject + uint ciPlus_: 1; //!< CI and post CI features + uint viPlus_: 1; //!< VI and post VI features + uint aiPlus_: 1; //!< AI and post AI features + uint threadTraceEnable_: 1; //!< Thread trace enable + uint linearPersistentImage_: 1; //!< Allocates linear images in persistent + uint useSingleScratch_: 1; //!< Allocates single scratch per device + uint hsail_: 1; //!< Enables HSAIL compilation + uint stagingWritePersistent_: 1; //!< Enables persistent writes + uint svmAtomics_: 1; //!< SVM device atomics + uint svmFineGrainSystem_: 1; //!< SVM fine grain system support + uint apuSystem_: 1; //!< Device is 
APU system with shared memory + uint hsailDirectSRD_: 1; //!< Controls direct SRD for HSAIL + uint useDeviceQueue_: 1; //!< Submit to separate device queue + uint singleFpDenorm_: 1; //!< Support Single FP Denorm + uint reserved_: 5; + }; + uint value_; + }; + + uint oclVersion_; //!< Reported OpenCL version support + uint debugFlags_; //!< Debug GPU flags + size_t stagedXferSize_; //!< Staged buffer size + uint maxRenames_; //!< Maximum number of possible renames + uint maxRenameSize_; //!< Maximum size for all renames + uint hwLDSSize_; //!< HW local data store size + uint maxWorkGroupSize_; //!< Requested workgroup size for this device + uint hostMemDirectAccess_; //!< Enables direct access to the host memory + amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler + uint workloadSplitSize_; //!< Workload split size + uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms + uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms + uint blitEngine_; //!< Blit engine type + size_t pinnedXferSize_; //!< Pinned buffer size for transfer + size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer + size_t resourceCacheSize_; //!< Resource cache size in bytes (GPU_RESOURCE_CACHE_SIZE scaled by Mi) + uint64_t maxAllocSize_; //!< Maximum single allocation size + size_t numMemDependencies_;//!< The array size for memory dependencies tracking + uint cacheLineSize_; //!< Cache line size in bytes + uint cacheSize_; //!< L1 cache size in bytes + size_t xferBufSize_; //!< Transfer buffer size for image copy optimization + uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings + uint numDeviceEvents_; //!< The number of device events + uint numWaitEvents_; //!< The number of wait events for device enqueue + + + //! Default constructor + Settings(); + + //! 
Creates settings + bool create( + const Pal::DeviceProperties& palProp, //!< PAL device properties + const Pal::GpuMemoryHeapProperties* heaps, //!< PAL heap settings + bool reportAsOCL12Device = false //!< Report As OpenCL1.2 Device + ); + +private: + //! Disable copy constructor + Settings(const Settings&); + + //! Disable assignment + Settings& operator=(const Settings&); + + //! Overrides current settings based on registry/environment + void override(); +}; + +/*@}*/} // namespace pal + +#endif /*PALSETTINGS_HPP_*/ diff --git a/projects/clr/rocclr/runtime/device/pal/palthreadtrace.cpp b/projects/clr/rocclr/runtime/device/pal/palthreadtrace.cpp new file mode 100644 index 0000000000..871e1de8f5 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palthreadtrace.cpp @@ -0,0 +1,67 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. +// +#include "device/pal/palthreadtrace.hpp" +#include "device/pal/palvirtual.hpp" + +namespace pal { + +CalThreadTraceReference::~CalThreadTraceReference() { + // The thread trace object is always associated with a particular queue, + // so we have to lock just this queue + amd::ScopedLock lock(gpu_.execution()); + + if (0 != threadTrace_) { + //gpu().cs()->destroyQuery(gslThreadTrace()); + } +} + +//! Releases the trace buffer array; the per-buffer teardown below is still unimplemented +ThreadTrace::~ThreadTrace() +{ + // The array is allocated with new[] in the constructor, so free it on every + // path, including the early return taken when create() was never called + delete [] threadTraceBufferObjs_; + threadTraceBufferObjs_ = nullptr; + + if (calRef_ == nullptr) { + return; + } + Unimplemented(); + for(uint i = 0; i < amdThreadTraceMemObjsNum_;++i) { +// threadTraceBufferObjs_[i]->attachMemObject(gpu().cs(), nullptr, 0, 0, 0, i); +// gpu().cs()->destroyShaderTraceBuffer(threadTraceBufferObjs_[i]); + } + + // Release the thread trace reference object + //calRef_->release(); +} + +bool +ThreadTrace::create(CalThreadTraceReference* calRef) +{ + assert(&gpu() == &calRef->gpu()); + + calRef_ = calRef; + threadTrace_ = calRef->gslThreadTrace(); + + return true; +} + +bool +ThreadTrace::info(uint infoType, uint* info, uint infoSize) const +{ + switch (infoType) { + case CL_THREAD_TRACE_BUFFERS_SIZE: { + if (infoSize 
< amdThreadTraceMemObjsNum_) { + LogError("The amount of buffers should be equal to the amount of Shader Engines"); + return false; + } + else { + Unimplemented(); + //gslThreadTrace()->GetResultAll(gpu().cs(), info); + } + break; + } + default: + LogError("Wrong ThreadTrace::getInfo parameter"); + return false; + } + return true; +} + +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palthreadtrace.hpp b/projects/clr/rocclr/runtime/device/pal/palthreadtrace.hpp new file mode 100644 index 0000000000..19cb958ade --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palthreadtrace.hpp @@ -0,0 +1,136 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. +// +#ifndef GPUTHREADTRACE_HPP_ +#define GPUTHREADTRACE_HPP_ + +#include "top.hpp" +#include "device/device.hpp" +#include "device/pal/paldevice.hpp" +#include "palPerfExperiment.h" + +#include +namespace pal { + +class VirtualGPU; + +class CalThreadTraceReference : public amd::ReferenceCountedObject +{ +public: + //! Default constructor + CalThreadTraceReference( + VirtualGPU& gpu, //!< Virtual GPU device object + Pal::IPerfExperiment* gslThreadTrace) //!< GSL query thread trace object + : gpu_(gpu) + , threadTrace_(gslThreadTrace){} + + //! Get GSL thread race object + Pal::IPerfExperiment* gslThreadTrace() const { return threadTrace_; } + + //! Returns the virtual GPU device + const VirtualGPU& gpu() const { return gpu_; } + +protected: + //! Default destructor + ~CalThreadTraceReference(); + +private: + //! Disable copy constructor + CalThreadTraceReference(const CalThreadTraceReference&); + + //! Disable operator= + CalThreadTraceReference& operator=(const CalThreadTraceReference&); + + VirtualGPU& gpu_; //!< The virtual GPU device object + Pal::IPerfExperiment* threadTrace_; //!< GSL thread trace query object +}; + +//! ThreadTrace implementation on GPU +class ThreadTrace : public device::ThreadTrace +{ +public: + + //! 
Destructor for the GPU ThreadTrace object + virtual ~ThreadTrace(); + + //! Creates the current object + bool create( + CalThreadTraceReference* calRef //!< Reference ThreadTrace + ); + + //! Returns the GPU device, associated with the current object + const Device& dev() const { return gpuDevice_; } + + //! Returns the virtual GPU device + const VirtualGPU& gpu() const { return gpu_; } + + //! Constructor for the GPU ThreadTrace object. + //! Every scalar member gets a defined value here (initializers follow the + //! member declaration order), so gslThreadTrace() returns NULL until create() runs + ThreadTrace( + Device& device, //!< A GPU device object + VirtualGPU& gpu, //!< Virtual GPU device object + uint amdThreadTraceMemObjsNum) //!< Number of thread trace buffer objects to allocate + : gpuDevice_(device) + , gpu_(gpu) + , calRef_(NULL) + , index_(0) + , memBufferSizeTT_(0) + , threadTrace_(NULL) + , amdThreadTraceMemObjsNum_(amdThreadTraceMemObjsNum) + , isNewBufferBinded_(false) + , isBufferOnSubmit_(false) + { + threadTraceBufferObjs_ = new Pal::ThreadTraceLayout[amdThreadTraceMemObjsNum]; + Unimplemented(); + for (uint i = 0; i < amdThreadTraceMemObjsNum;++i) { + //threadTraceBufferObjs_[i] = gpu.cs()->createShaderTraceBuffer(); + } + } + + //! Returns the specific information about the thread trace object + bool info( + uint infoType, //!< The type of returned information + uint* info, //!< The returned information + uint infoSize //!< The size of returned information + ) const; + + //! Set the ThreadTrace memory buffer size + void setMemBufferSizeTT(uint memBufferSizeTT) { memBufferSizeTT_ = memBufferSizeTT;} + + //! Set isNewBufferBinded_ to true/false if new buffer was binded/unbinded respectively + void setNewBufferBinded(bool isNewBufferBinded) { isNewBufferBinded_ = isNewBufferBinded; } + + //! Attach Pal::IGpuMemory to the ThreadTrace buffer + void attachMemToThreadTraceBuffer(); + + void setMemObj(size_t memObjSize,std::vector memObj) + { + memObj_ = memObj; + memBufferSizeTT_ = memObjSize; + } + //! Get GSL thread trace object + Pal::IPerfExperiment* gslThreadTrace() const { return threadTrace_; } + + //! Get GSL Thread Trace Buffer objects + Pal::ThreadTraceLayout* getThreadTraceBufferObjects() {return threadTraceBufferObjs_;} +private: + //! 
Disable default copy constructor + ThreadTrace(const ThreadTrace&); + + //! Disable default operator= + ThreadTrace& operator=(const ThreadTrace&); + + const Device& gpuDevice_; //!< The backend device + + VirtualGPU& gpu_; //!< The virtual GPU device object + + CalThreadTraceReference* calRef_; //!< Reference ThreadTrace + Pal::ThreadTraceLayout* threadTraceBufferObjs_; //!< The buffer object for Thread Trace recording + uint index_; //!< ThreadTrace index in the CAL container + uint memBufferSizeTT_; //!< ThreadTrace memory buffer size + std::vector memObj_; //!< ThreadTrace memory object + Pal::IPerfExperiment* threadTrace_; //!< GSL thread trace query object + uint amdThreadTraceMemObjsNum_; //!< ThreadTrace memory object`s number (should be equal to the SE number) + bool isNewBufferBinded_; //!< The indicator if new buffer was binded to the ThreadTrace object + bool isBufferOnSubmit_; //!< The indicator if "new buffer on submit" mode is used +}; + +} // namespace pal + +#endif // PALTHREADTRACE_HPP_ + diff --git a/projects/clr/rocclr/runtime/device/pal/paltimestamp.cpp b/projects/clr/rocclr/runtime/device/pal/paltimestamp.cpp new file mode 100644 index 0000000000..15876345ac --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/paltimestamp.cpp @@ -0,0 +1,123 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+// +#include "os/os.hpp" +#include "platform/perfctr.hpp" +#include "device/pal/paldefs.hpp" +#include "device/pal/paltimestamp.hpp" +#include "device/pal/palvirtual.hpp" +#include "device/pal/palcounters.hpp" + +namespace pal { + +TimeStamp::TimeStamp( + const VirtualGPU& gpu, + Pal::IGpuMemory* iMem, + uint memOffset, + address cpuAddr) + : gpu_(gpu) + , iMem_(iMem) + , memOffset_(memOffset) +{ + values_ = reinterpret_cast(cpuAddr + memOffset); +} + +TimeStamp::~TimeStamp() +{ +} + +void +TimeStamp::begin(bool sdma) +{ + if (!flags_.beginIssued_) { + gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeTop, *iMem_, + memOffset_ + CommandStartTime * sizeof(uint64_t)); + flags_.beginIssued_ = true; + } +} + +void +TimeStamp::end(bool sdma) +{ + CondLog(!flags_.beginIssued_, "We didn't issue a begin operation!"); + gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeBottom, *iMem_, + memOffset_ + CommandEndTime * sizeof(uint64_t)); + flags_.endIssued_ = true; + flags_.sdma_ = sdma; +} + +inline void +SetValue(uint64_t* time, uint64_t val, double nanos) +{ + *time = static_cast(static_cast(val) * nanos); +} + +void +TimeStamp::value(uint64_t* startTime, uint64_t* endTime) +{ + CondLog(!flags_.endIssued_, "We didn't send the counter end operation!"); + //! @todo optimize! 
+ const double NanoSecondsPerTick = 1000000000.0 / (gpu_.dev().properties().timestampFrequency); + + SetValue(startTime, values_[CommandStartTime], NanoSecondsPerTick); + SetValue(endTime, values_[CommandEndTime], NanoSecondsPerTick); +} + +TimeStampCache::~TimeStampCache() +{ + // Release all time stamp objects from the cache + for (uint i = 0; i < freedTS_.size(); ++i) { + delete freedTS_[i]; + } + freedTS_.clear(); + + // Release all memory objects + for (uint i = 0; i < tsBuf_.size(); ++i) { + tsBuf_[i]->unmap(&gpu_); + gpu_.queue(MainEngine).removeMemRef(tsBuf_[i]->iMem()); + gpu_.queue(SdmaEngine).removeMemRef(tsBuf_[i]->iMem()); + delete tsBuf_[i]; + } + tsBuf_.clear(); + +} + +TimeStamp* +TimeStampCache::allocTimeStamp() +{ + TimeStamp* ts = nullptr; + if (0 != freedTS_.size()) { + ts = freedTS_.back(); + freedTS_.pop_back(); + } + + if (nullptr == ts) { + if ((tsBufCpu_ == nullptr) || ((tsOffset_ + TimerSlotSize) > TimerBufSize)) { + Memory* buf = new Memory(gpu_.dev(), TimerBufSize); + if (buf == nullptr || !buf->create(Resource::Remote)) { + return nullptr; + } + gpu_.queue(MainEngine).addMemRef(buf->iMem()); + gpu_.queue(SdmaEngine).addMemRef(buf->iMem()); + tsBufCpu_ = reinterpret_cast
(buf->map(&gpu_)); + memset(tsBufCpu_, 0, TimerBufSize); + tsOffset_ = 0; + tsBuf_.push_back(buf); + } + // Allocate a TimeStamp object + ts = new TimeStamp(gpu_, tsBuf_[(tsBuf_.size() - 1)]->iMem(), + tsOffset_, tsBufCpu_); + // Create a timestamp + if (ts == nullptr) { + return nullptr; + } + tsOffset_ += TimerSlotSize; + } + + // Set this timestamp into DRM profile mode if it was requested + ts->clearStates(); + + return ts; +} + +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp b/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp new file mode 100644 index 0000000000..99294dace1 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/paltimestamp.hpp @@ -0,0 +1,132 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. +// +#ifndef PALTIMESTAMP_HPP_ +#define PALTIMESTAMP_HPP_ + +#include "device/pal/paldefs.hpp" +#include "device/pal/palresource.hpp" + +/*! \addtogroup pal PAL Resource Implementation + * @{ + */ + +//! PAL Device Implementation +namespace pal { + +class Device; +class VirtualGPU; +class Memory; + +class TimeStamp : public amd::HeapObject +{ +public: + //! Enums for the timestamp information + //! \note *4 is the limitaiton of SDMA HW + //! (address has to be aligned by 256 bit) + enum TimeStampValue { + CommandStartTime = 0, + CommandEndTime = 4, + CommandTotal = 8 + }; + + //! The TimeStamp object flags + union Flags + { + struct + { + uint32_t beginIssued_ : 1; + uint32_t endIssued_ : 1; + uint32_t sdma_ : 1; + }; + uint32_t value_; + Flags(): value_(0) {} + }; + + //! Default constructor + TimeStamp( + const VirtualGPU& gpu, //!< Virtual GPU + Pal::IGpuMemory* iMem, //!< Buffer with the timer values + uint memOffset, //!< Offset in the buffer for the current TS + address cpuAddr //!< CPU pointer for the values in memory + ); + + //! Default destructor + ~TimeStamp(); + + //! Starts the timestamp + void begin(bool sdma = false); + + //! 
Ends the timestamp + void end(bool sdma = false); + + //! Returns the timestamp result in nano seconds + void value(uint64_t* startTime, uint64_t* endTime); + + //! Clear all TimeStamp states + void clearStates() + { flags_.value_ = 0; + values_[CommandStartTime] = 0; + values_[CommandEndTime] = 0; + } + + //! Timer commands were submitted to HW + bool isValid() const { return (flags_.endIssued_) ? true : false; } + +private: + //! Disable copy constructor + TimeStamp(const TimeStamp&); + + //! Disable operator= + TimeStamp& operator=(const TimeStamp&); + + //! Returns the GPU device object + const VirtualGPU& gpu() const { return gpu_; } + + const VirtualGPU& gpu_; //!< Virtual GPU + Flags flags_; //!< The time stamp state + Pal::IGpuMemory* iMem_; //!< Buffer with the timer values + uint memOffset_; //!< Offset in the buffer for the current timer + volatile uint64_t* values_; //!< CPU pointer to the timer values +}; + +class TimeStampCache : public amd::HeapObject +{ +public: + //! Default constructor + TimeStampCache( + VirtualGPU& gpu //!< Virtual GPU object + ) + : gpu_(gpu) + , tsBufCpu_(NULL) + , tsOffset_(0) {} + + //! Default destructor + ~TimeStampCache(); + + //! Gets a time stamp object. It will find a freed object or allocate a new one + TimeStamp* allocTimeStamp(); + + //! Frees a time stamp object + void freeTimeStamp(TimeStamp* ts) { freedTS_.push_back(ts); } + +private: + static const uint TimerSlotSize = TimeStamp::CommandTotal * sizeof(uint64_t); + static const uint TimerBufSize = TimerSlotSize * 4096; + + //! Disable copy constructor + TimeStampCache(const TimeStampCache&); + + //! 
Disable operator= + TimeStampCache& operator=(const TimeStampCache&); + + std::vector freedTS_; //!< Array of freed time stamp objects + VirtualGPU& gpu_; //!< Virtual GPU + std::vector tsBuf_; //!< Array of memory objects with the timer value + address tsBufCpu_; //!< CPU pointer for current TS memory + uint tsOffset_; //!< Active offset in the current mem object +}; + +/*@}*/} // namespace pal + +#endif /*PALTIMESTAMP_HPP_*/ diff --git a/projects/clr/rocclr/runtime/device/pal/paltrap.hpp b/projects/clr/rocclr/runtime/device/pal/paltrap.hpp new file mode 100644 index 0000000000..e1eed63243 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/paltrap.hpp @@ -0,0 +1,187 @@ +/******************************************************************************* + * The source of the runtime trap handler, "runtimetraphandler.sp3". + * The binary is created by the SP3 tool with the following command: + * + * sp3.exe runtimetraphandler.sp3 -hex runtimeTrapCode.hex + * + ******************************************************************************* + +shader main + asic(TAHITI) // for SI/CI or asic(VI) for VI + type(CS) + + // clear wave exception state + v_clrexcp + s_waitcnt 0 + //========================================================================== + // Handle the workaround for HW bug that causes the incorrect TMA value. + // Retrieve the TMA values, which are stored at TBA buffer at location + // 256 (0x100). + + // Construct the memory descriptor with TBA as the start address + // we are using the registers ttmp[8:11] for that. 
+ s_mov_b32 ttmp8, tba_lo + s_and_b32 ttmp9, tba_hi, 0xffff + + // 0x100=256 bytes, which is the size of the buffer to + // store all the level 2 trap handler info + s_or_b32 ttmp9, ttmp9, 0x01000000 + s_mov_b32 ttmp10, 0x00002000 + s_mov_b32 ttmp11, 0x00024fac + + // TMA is stored 256 (0x100) bytes before the TBA value + s_sub_u32 ttmp8, ttmp8, 0x100 + + // Backup the s0 since ttmp registers cannot be target of + // buffer read instruction + s_mov_b32 ttmp7, s0 + s_buffer_load_dword s0, ttmp8, 0x0 // VI: offset=0x0 (bytes) + s_waitcnt 0 + s_mov_b32 tma_lo, s0 + s_buffer_load_dword s0, ttmp8, 0x1 // VI: offset=0x4 (bytes) + s_waitcnt 0 + s_mov_b32 tma_hi, s0 + s_mov_b32 s0, ttmp7 + + //=================================================== + // setup the mmeory descriptor for TMA + s_mov_b32 ttmp6, 0x18 + s_add_u32 ttmp8, tma_lo, ttmp6 + s_and_b32 ttmp9, tma_hi, 0xffff + //0x68=104 bytes, which is the size of the buffer to + //store all the level2 trap handler info + s_or_b32 ttmp9, ttmp9, 0x00680000 + s_mov_b32 ttmp10, 0x00002000 + s_mov_b32 ttmp11, 0x00024fac + + //=================================================== + // backup the TMA values to be restored later + // level-one TMA saved in the ttmp6,ttmp7 + s_mov_b32 ttmp6, tma_lo + s_mov_b32 ttmp7, tma_hi + + //=================================================== + // setup the TMA for the level-two trap handler + // level-two TMA saved in tma_hi, tma_lo + s_mov_b32 ttmp3, s0 + s_buffer_load_dword s0, ttmp8, 0x2 // VI: offset=0x8 (bytes) + s_waitcnt 0x0000 + s_mov_b32 tma_lo, s0 + + s_buffer_load_dword s0, ttmp8, 0x3 // VI: offset=0xc (bytes) + s_waitcnt 0x0000 + s_mov_b32 tma_hi, s0 + + //=================================================== + // setup the TBA for the level-two trap handler + // level-two TBA saved in ttmp9, ttmp8 + s_buffer_load_dword s0, ttmp8, 0x0 // VI: offset=0x0 (bytes) + s_waitcnt 0x0000 + s_mov_b32 ttmp2, s0 + + s_buffer_load_dword s0, ttmp8, 0x1 // VI: offset=0x4 (bytes) + s_waitcnt 0x0000 
+ + //swap the values of s0 and ttmp3 without using other registers + s_xor_b32 ttmp3, s0, ttmp3 + s_xor_b32 s0, s0, ttmp3 + s_xor_b32 ttmp3, s0, ttmp3 + + //store the debug trap handler start address in ttmp8,9 + s_mov_b32 ttmp8, ttmp2 + s_mov_b32 ttmp9, ttmp3 + + //=================================================== + // get the pc value to resume execution + s_getpc_b64 [ttmp2, ttmp3] + s_add_u32 ttmp2, ttmp2, 0x8 + + //=================================================== + //set the pc value to jump to the debug trap handler + s_setpc_b64 [ttmp8, ttmp9] + + //=================================================== + // restore the tamp values + s_mov_b32 tma_hi, ttmp7 + s_mov_b32 tma_lo, ttmp6 + + label_return: + //=================================================== + // return from the trap handler to the saved PC + s_and_b32 ttmp1, ttmp1, 0xffff + s_rfe_b64 [ttmp0,ttmp1] + +end + +*******************************************************************************/ + +/// shader codes with "asic(TAHITI)" instruction +static const uint32_t RuntimeTrapCode [] = { + 0x7e008200, 0xbf8c0000, + 0xbef8036c, 0x8779ff6d, + 0x0000ffff, 0x8879ff79, + 0x01000000, 0xbefa03ff, + 0x00002000, 0xbefb03ff, + 0x00024fac, 0x80f8ff78, + 0x00000100, 0xbef70300, + 0xc2007900, 0xbf8c0000, + 0xbeee0300, 0xc2007901, + 0xbf8c0000, 0xbeef0300, + 0xbe800377, 0xbef60398, + 0x8078766e, 0x8779ff6f, + 0x0000ffff, 0x8879ff79, + 0x00680000, 0xbefa03ff, + 0x00002000, 0xbefb03ff, + 0x00024fac, 0xbef6036e, + 0xbef7036f, 0xbef30300, + 0xc2007902, 0xbf8c0000, + 0xbeee0300, 0xc2007903, + 0xbf8c0000, 0xbeef0300, + 0xc2007900, 0xbf8c0000, + 0xbef20300, 0xc2007901, + 0xbf8c0000, 0x89737300, + 0x89007300, 0x89737300, + 0xbef80372, 0xbef90373, + 0xbef21f00, 0x80728872, + 0xbe802078, 0xbeef0377, + 0xbeee0376, 0x8771ff71, + 0x0000ffff, 0xbe802270 +}; + + +/// shader codes with "asic(VI)" instruction +static const uint32_t RuntimeTrapCodeVi [] = { + 0x7e006a00, 0xbf8c0000, + 0xbef8006c, 0x8679ff6d, + 0x0000ffff, 
0x8779ff79, + 0x01000000, 0xbefa00ff, + 0x00002000, 0xbefb00ff, + 0x00024fac, 0x80f8ff78, + 0x00000100, 0xbef70000, + 0xc022003c, 0x00000000, + 0xbf8c0000, 0xbeee0000, + 0xc022003c, 0x00000004, + 0xbf8c0000, 0xbeef0000, + 0xbe800077, 0xbef60098, + 0x8078766e, 0x8679ff6f, + 0x0000ffff, 0x8779ff79, + 0x00680000, 0xbefa00ff, + 0x00002000, 0xbefb00ff, + 0x00024fac, 0xbef6006e, + 0xbef7006f, 0xbef30000, + 0xc022003c, 0x00000008, + 0xbf8c0000, 0xbeee0000, + 0xc022003c, 0x0000000c, + 0xbf8c0000, 0xbeef0000, + 0xc022003c, 0x00000000, + 0xbf8c0000, 0xbef20000, + 0xc022003c, 0x00000004, + 0xbf8c0000, 0x88737300, + 0x88007300, 0x88737300, + 0xbef80072, 0xbef90073, + 0xbef21c00, 0x80728872, + 0xbe801d78, 0xbeef0077, + 0xbeee0076, 0x8671ff71, + 0x0000ffff, 0xbe801f70 +}; + diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp new file mode 100644 index 0000000000..ae642e1dc7 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -0,0 +1,3435 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+// +#include "platform/perfctr.hpp" +#include "platform/threadtrace.hpp" +#include "platform/kernel.hpp" +#include "platform/commandqueue.hpp" +#include "device/pal/palconstbuf.hpp" +#include "device/pal/palvirtual.hpp" +#include "device/pal/palkernel.hpp" +#include "device/pal/palprogram.hpp" +#include "device/pal/palcounters.hpp" +#include "device/pal/palthreadtrace.hpp" +#include "device/pal/paltimestamp.hpp" +#include "device/pal/palblit.hpp" +#include "device/pal/paldebugger.hpp" +#include "hsa.h" +#include "amd_hsa_kernel_code.h" +#include "amd_hsa_queue.h" +#include +#include +#include +#include "palQueue.h" +#include "palFence.h" + +#ifdef _WIN32 +#include +#include "amdocl/cl_d3d9_amd.hpp" +#include "amdocl/cl_d3d10_amd.hpp" +#include "amdocl/cl_d3d11_amd.hpp" +#endif // _WIN32 + +namespace pal { + +VirtualGPU::Queue* +VirtualGPU::Queue::Create( + Pal::IDevice* palDev, + Pal::QueueType queueType, + uint engineIdx, + Pal::ICmdAllocator* cmdAllocator) +{ + Pal::Result result; + Pal::QueueCreateInfo qCreateInfo = {}; + qCreateInfo.engineType = queueType; + qCreateInfo.engineIndex = engineIdx; + + // Find queue object size + size_t qSize = palDev->GetQueueSize(qCreateInfo, &result); + if (result != Pal::Result::Success) { + return nullptr; + } + + Pal::CmdBufferCreateInfo cmdCreateInfo = {}; + cmdCreateInfo.pCmdAllocator = cmdAllocator; + cmdCreateInfo.queueType = queueType; + + // Find command buffer object size + size_t cmdSize = palDev->GetCmdBufferSize(cmdCreateInfo, &result); + if (result != Pal::Result::Success) { + return nullptr; + } + + // Find fence object size + size_t fSize = palDev->GetFenceSize(&result); + if (result != Pal::Result::Success) { + return nullptr; + } + + size_t allocSize = qSize + MaxCmdBuffers * (cmdSize + fSize); + VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(palDev); + if (queue != nullptr) { + address addrQ = reinterpret_cast
(&queue[1]); + // Create PAL queue object + result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_); + if (result != Pal::Result::Success) { + delete queue; + return nullptr; + } + + address addrCmd = addrQ + qSize; + address addrF = addrCmd + MaxCmdBuffers * cmdSize; + Pal::CmdBufferBuildInfo cmdBuildInfo = {}; + + for (uint i = 0; i < MaxCmdBuffers; ++i) { + result = palDev->CreateCmdBuffer(cmdCreateInfo, + &addrCmd[i*cmdSize], &queue->iCmdBuffs_[i]); + if (result != Pal::Result::Success) { + delete queue; + return nullptr; + } + static const bool InitiallySignaled = false; + result = palDev->CreateFence(InitiallySignaled, &addrF[i*fSize], + &queue->iCmdFences_[i]); + if (result != Pal::Result::Success) { + delete queue; + return nullptr; + } + if (i == StartCmdBufIdx) { + result = queue->iCmdBuffs_[i]->Begin(cmdBuildInfo); + if (result != Pal::Result::Success) { + delete queue; + return nullptr; + } + } + } + } + return queue; +} + +VirtualGPU::Queue::~Queue() +{ + std::vector memRef; + // Remove all memory references + for (auto it: memReferences_) { + memRef.push_back(it.first); + } + if (memRef.size() != 0) { + iDev_->RemoveGpuMemoryReferences(memRef.size(), &memRef[0], NULL); + } + memReferences_.clear(); + + for (uint i = 0; i < MaxCmdBuffers; ++i) { + if (nullptr != iCmdBuffs_[i]) { + iCmdBuffs_[i]->Destroy(); + } + if (nullptr != iCmdFences_[i]) { + iCmdFences_[i]->Destroy(); + } + } + + if (nullptr != iQueue_) { + iQueue_->Destroy(); + } +} + +void +VirtualGPU::Queue::addCmdMemRef(Pal::IGpuMemory* iMem) +{ + auto it = memReferences_.find(iMem); + if (it != memReferences_.end()) { + it->second = (it->second & FirstMemoryReference) | cmdBufIdSlot_; + } + else { + memReferences_[iMem] = FirstMemoryReference | cmdBufIdSlot_; + } +} + +void +VirtualGPU::Queue::removeCmdMemRef(Pal::IGpuMemory* iMem) +{ + memReferences_.erase(iMem); + iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_); +} + +uint +VirtualGPU::Queue::submit() +{ + cmdCnt_++; + uint id 
= cmdBufIdCurrent_; + if ((cmdCnt_ > MaxCommands) || GPU_FLUSH_ON_EXECUTION) { + if (!flush()) { + return GpuEvent::InvalidID; + } + } + return id; +} + +bool +VirtualGPU::Queue::flush() +{ + std::vector memRef; + // Stop commands building + if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->End()) { + LogError("PAL failed to finalize a command buffer!"); + return false; + } + // Add memory references + for (auto it = memReferences_.begin(); it != memReferences_.end(); ++it) { + if (it->second & FirstMemoryReference) { + it->second &= ~FirstMemoryReference; + memRef.push_back(it->first); + } + } + if (memRef.size() != 0) { + iDev_->AddGpuMemoryReferences(memRef.size(), &memRef[0], iQueue_); + } + + // Submit command buffer to OS + if (Pal::Result::Success != iQueue_->Submit( + 1, &iCmdBuffs_[cmdBufIdSlot_], 0, nullptr, iCmdFences_[cmdBufIdSlot_])) { + LogError("PAL failed to submit CMD!"); + return false; + } + if (GPU_FLUSH_ON_EXECUTION) { + if (Pal::Result::Success != + iDev_->WaitForFences(1, &iCmdFences_[cmdBufIdSlot_], true, 100.f)) { + LogError("PAL wait for a fence failed!"); + return false; + } + } + + // Reset the counter of commands + cmdCnt_ = 0; + + // Find the next command buffer + cmdBufIdCurrent_++; + + if (cmdBufIdCurrent_ == GpuEvent::InvalidID) { + ///@todo handle wrapping + cmdBufIdCurrent_ = 1; + cmbBufIdRetired_ = 0; + } + + // Wrap current slot + cmdBufIdSlot_ = cmdBufIdCurrent_ % MaxCmdBuffers; + + // Make sure the slot isn't busy + if (Pal::Result::NotReady == iCmdFences_[cmdBufIdSlot_]->GetStatus()) { + if (Pal::Result::Success != + iDev_->WaitForFences(1, &iCmdFences_[cmdBufIdSlot_], true, 100.f)) { + LogError("PAL wait for a fence failed!"); + return false; + } + } + // Progress retired TS + if ((cmdBufIdCurrent_ > MaxCmdBuffers) && + (cmbBufIdRetired_ < (cmdBufIdCurrent_ - MaxCmdBuffers))) { + cmbBufIdRetired_ = cmdBufIdCurrent_ - MaxCmdBuffers; + } + + if (Pal::Result::Success != + iDev_->ResetFences(1, &iCmdFences_[cmdBufIdSlot_])) 
{ + LogError("PAL failed to reset a fence!"); + return false; + } + + // Start command buffer building + Pal::CmdBufferBuildInfo cmdBuildInfo = {}; + if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo)) { + LogError("PAL failed CB building initialization!"); + return false; + } + + memRef.clear(); + // Remove old memory references + for (auto it = memReferences_.begin(); it != memReferences_.end();) { + if (it->second == cmdBufIdSlot_) { + memRef.push_back(it->first); + it = memReferences_.erase(it); + } + else { + ++it; + } + } + if (memRef.size() != 0) { + iDev_->RemoveGpuMemoryReferences(memRef.size(), &memRef[0], iQueue_); + } + + return true; +} + +bool +VirtualGPU::Queue::waitForEvent(uint id) +{ + if (isDone(id)) { + return true; + } + + uint slotId = id % MaxCmdBuffers; + + // Wait for the specified fence + if (Pal::Result::Success != iCmdFences_[slotId]->GetStatus()) { + if (Pal::Result::Success != + iDev_->WaitForFences(1, &iCmdFences_[slotId], true, 100.f)) { + LogError("PAL wait for a fence failed!"); + return false; + } + } + cmbBufIdRetired_ = id; + return true; +} + +bool +VirtualGPU::Queue::isDone(uint id) +{ + if ((id <= cmbBufIdRetired_) || (id > cmdBufIdCurrent_)) { + return true; + } + + if (id == cmdBufIdCurrent_) { + // Flush the current command buffer + flush(); + } + + if (Pal::Result::Success != iCmdFences_[id % MaxCmdBuffers]->GetStatus()) { + return false; + } + cmbBufIdRetired_ = id; + return true; +} + +bool +VirtualGPU::MemoryDependency::create(size_t numMemObj) +{ + if (numMemObj > 0) { + // Allocate the array of memory objects for dependency tracking + memObjectsInQueue_ = new MemoryState[numMemObj]; + if (nullptr == memObjectsInQueue_) { + return false; + } + memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj); + maxMemObjectsInQueue_ = numMemObj; + } + + return true; +} + +void +VirtualGPU::MemoryDependency::validate( + VirtualGPU& gpu, + const Memory* memory, + bool readOnly) +{ + bool 
flushL1Cache = false; + + if (maxMemObjectsInQueue_ == 0) { + // Flush cache + gpu.flushCUCaches(); + return; + } + + uint64_t curStart = memory->vmAddress(); + uint64_t curEnd = curStart + memory->vmSize(); + + // Loop through all memory objects in the queue and find dependency + // @note don't include objects from the current kernel + for (size_t j = 0; j < endMemObjectsInQueue_; ++j) { + // Check if the queue already contains this mem object and + // GPU operations aren't readonly + uint64_t busyStart = memObjectsInQueue_[j].start_; + uint64_t busyEnd = memObjectsInQueue_[j].end_; + + // Check if the start inside the busy region + if ((((curStart >= busyStart) && (curStart < busyEnd)) || + // Check if the end inside the busy region + ((curEnd > busyStart) && (curEnd <= busyEnd)) || + // Check if the start/end cover the busy region + ((curStart <= busyStart) && (curEnd >= busyEnd))) && + // If the buys region was written or the current one is for write + (!memObjectsInQueue_[j].readOnly_ || !readOnly)) { + flushL1Cache = true; + break; + } + } + + // Did we reach the limit? 
+ if (maxMemObjectsInQueue_ <= (numMemObjectsInQueue_ + 1)) { + flushL1Cache = true; + } + + if (flushL1Cache) { + // Flush cache + gpu.flushCUCaches(); + + // Clear memory dependency state + const static bool All = true; + clear(!All); + } + + // Insert current memory object into the queue always, + // since runtime calls flush before kernel execution and it has to keep + // current kernel in tracking + memObjectsInQueue_ + [numMemObjectsInQueue_].start_ = curStart; + memObjectsInQueue_ + [numMemObjectsInQueue_].end_ = curEnd; + memObjectsInQueue_ + [numMemObjectsInQueue_].readOnly_ = readOnly; + numMemObjectsInQueue_++; +} + +void +VirtualGPU::MemoryDependency::clear(bool all) +{ + if (numMemObjectsInQueue_ > 0) { + size_t i, j; + if (all) { + endMemObjectsInQueue_ = numMemObjectsInQueue_; + } + + // Preserve all objects from the current kernel + for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) { + memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_; + memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_; + memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_; + } + // Clear all objects except current kernel + memset(&memObjectsInQueue_[i], 0, sizeof(amd::Memory*) * numMemObjectsInQueue_); + numMemObjectsInQueue_ -= endMemObjectsInQueue_; + endMemObjectsInQueue_ = 0; + } +} + +VirtualGPU::DmaFlushMgmt::DmaFlushMgmt(const Device& dev) + : cbWorkload_(0) + , dispatchSplitSize_(0) +{ + aluCnt_ = dev.info().simdPerCU_ * dev.info().simdWidth_ * dev.info().maxComputeUnits_; + maxDispatchWorkload_ = static_cast(dev.info().maxClockFrequency_) * + // find time in us + 100 * dev.settings().maxWorkloadTime_ * + aluCnt_; + resetCbWorkload(dev); +} + +void +VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev) +{ + cbWorkload_ = 0; + maxCbWorkload_ = static_cast(dev.info().maxClockFrequency_) * + // find time in us + 100 * dev.settings().minWorkloadTime_ * aluCnt_; +} + +void +VirtualGPU::DmaFlushMgmt::findSplitSize( + 
const Device& dev, uint64_t threads, uint instructions) +{ + uint64_t workload = threads * instructions; + if (maxDispatchWorkload_ < workload) { + dispatchSplitSize_ = static_cast(maxDispatchWorkload_ / instructions); + uint fullLoad = dev.info().maxComputeUnits_ * dev.info().maxWorkGroupSize_; + if ((dispatchSplitSize_ % fullLoad) != 0) { + dispatchSplitSize_ = (dispatchSplitSize_ / fullLoad + 1) * fullLoad; + } + } + else { + dispatchSplitSize_ = (threads > dev.settings().workloadSplitSize_) ? + dev.settings().workloadSplitSize_ : 0; + } +} + +bool +VirtualGPU::DmaFlushMgmt::isCbReady( + VirtualGPU& gpu, uint64_t threads, uint instructions) +{ + bool cbReady = false; + uint64_t workload = amd::alignUp(threads, 4 * aluCnt_) * instructions; + // Add current workload to the overall workload in the current DMA + cbWorkload_ += workload; + // Did it exceed maximum? + if (cbWorkload_ > maxCbWorkload_) { + // Reset DMA workload + cbWorkload_ = 0; + // Increase workload of the next DMA buffer by 50% + maxCbWorkload_ = maxCbWorkload_ * 3 / 2; + if (maxCbWorkload_ > maxDispatchWorkload_) { + maxCbWorkload_ = maxDispatchWorkload_; + } + cbReady = true; + } + return cbReady; +} + +void +VirtualGPU::addXferWrite(Memory& memory) +{ + if (xferWriteBuffers_.size() > 7) { + dev().xferWrite().release(*this, *xferWriteBuffers_.front()); + xferWriteBuffers_.pop_front(); + } + + // Delay destruction + xferWriteBuffers_.push_back(&memory); +} + +void +VirtualGPU::releaseXferWrite() +{ + for (auto& memory : xferWriteBuffers_) { + dev().xferWrite().release(*this, *memory); + } + xferWriteBuffers_.clear(); +} + +void +VirtualGPU::addPinnedMem(amd::Memory* mem) +{ + if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) { + if (pinnedMems_.size() > 7) { + pinnedMems_.front()->release(); + pinnedMems_.pop_front(); + } + + // Start operation, since we should release mem object + flushDMA(getGpuEvent(dev().getGpuMemory(mem)->iMem())->engineId_); + + // Delay destruction + 
pinnedMems_.push_back(mem); + } +} + +void +VirtualGPU::releasePinnedMem() +{ + for (auto& amdMemory : pinnedMems_) { + amdMemory->release(); + } + pinnedMems_.clear(); +} + +amd::Memory* +VirtualGPU::findPinnedMem(void* addr, size_t size) +{ + for (auto& amdMemory : pinnedMems_) { + if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) { + return amdMemory; + } + } + return nullptr; +} + +bool +VirtualGPU::createVirtualQueue(uint deviceQueueSize) +{ + uint MinDeviceQueueSize = 16 * 1024; + deviceQueueSize = std::max(deviceQueueSize, MinDeviceQueueSize); + + maskGroups_ = deviceQueueSize / (512 * Ki); + maskGroups_ = (maskGroups_== 0) ? 1 : maskGroups_; + + // Align the queue size for the multiple dispatch scheduler. + // Each thread works with 32 entries * maskGroups + uint extra = deviceQueueSize % (sizeof(AmdAqlWrap) * + DeviceQueueMaskSize * maskGroups_); + if (extra != 0) { + deviceQueueSize += (sizeof(AmdAqlWrap) * + DeviceQueueMaskSize * maskGroups_) - extra; + } + + if (deviceQueueSize_ == deviceQueueSize) { + return true; + } + else { + //! 
@todo Temporarily keep the buffer mapped for debug purpose + if (nullptr != schedParams_) { + schedParams_->unmap(this); + } + delete vqHeader_; + delete virtualQueue_; + delete schedParams_; + vqHeader_ = nullptr; + virtualQueue_ = nullptr; + schedParams_ = nullptr; + schedParamIdx_ = 0; + deviceQueueSize_ = 0; + } + uint numSlots = deviceQueueSize / sizeof(AmdAqlWrap); + uint allocSize = deviceQueueSize; + + // Add the virtual queue header + allocSize += sizeof(AmdVQueueHeader); + allocSize = amd::alignUp(allocSize, sizeof(AmdAqlWrap)); + + uint argOffs = allocSize; + + // Add the kernel arguments and wait events + uint singleArgSize = amd::alignUp(dev().info().maxParameterSize_ + 64 + + dev().settings().numWaitEvents_ * sizeof(uint64_t), sizeof(AmdAqlWrap)); + allocSize += singleArgSize * numSlots; + + uint eventsOffs = allocSize; + // Add the device events + allocSize += dev().settings().numDeviceEvents_ * sizeof(AmdEvent); + + uint eventMaskOffs = allocSize; + // Add mask array for events + allocSize += amd::alignUp(dev().settings().numDeviceEvents_, DeviceQueueMaskSize) / 8; + + uint slotMaskOffs = allocSize; + // Add mask array for AmdAqlWrap slots + allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8; + + virtualQueue_ = new Memory(dev(), allocSize); + Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ? + Resource::Local : Resource::Remote; + if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) { + return false; + } + address ptr = reinterpret_cast
( + virtualQueue_->map(this, Resource::WriteOnly)); + if (nullptr == ptr) { + return false; + } + // Clear memory + memset(ptr, 0, allocSize); + uint64_t vaBase = virtualQueue_->vmAddress(); + AmdVQueueHeader* header = reinterpret_cast(ptr); + + // Initialize the virtual queue header + header->aql_slot_num = numSlots; + header->event_slot_num = dev().settings().numDeviceEvents_; + header->event_slot_mask = vaBase + eventMaskOffs; + header->event_slots = vaBase + eventsOffs; + header->aql_slot_mask = vaBase + slotMaskOffs; + header->wait_size = dev().settings().numWaitEvents_; + header->arg_size = dev().info().maxParameterSize_ + 64; + header->mask_groups = maskGroups_; + vqHeader_ = new AmdVQueueHeader; + if (nullptr == vqHeader_) { + return false; + } + *vqHeader_ = *header; + + // Go over all slots and perform initialization + AmdAqlWrap* slots = reinterpret_cast(&header[1]); + for (uint i = 0; i < numSlots; ++i) { + uint64_t argStart = vaBase + argOffs + i * singleArgSize; + slots[i].aql.kernarg_address = reinterpret_cast(argStart); + slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64; + } + // Upload data back to local memory + if (GPU_PRINT_CHILD_KERNEL == 0) { + virtualQueue_->unmap(this); + } + + schedParams_ = new Memory(dev(), 64 * Ki); + if ((schedParams_ == nullptr) || !schedParams_->create(Resource::RemoteUSWC)) { + return false; + } + + ptr = reinterpret_cast
(schedParams_->map(this)); + + deviceQueueSize_ = deviceQueueSize; + + return true; +} + +VirtualGPU::VirtualGPU( + Device& device) + : device::VirtualDevice(device) + , engineID_(MainEngine) + , gpuDevice_(static_cast(device)) + , execution_("Virtual GPU execution lock", true) + , printfDbg_(nullptr) + , printfDbgHSA_(nullptr) + , tsCache_(nullptr) + , dmaFlushMgmt_(device) + , hwRing_(0) + , readjustTimeGPU_(0) + , currTs_(nullptr) + , vqHeader_(nullptr) + , virtualQueue_(nullptr) + , schedParams_(nullptr) + , schedParamIdx_(0) + , deviceQueueSize_(0) + , maskGroups_(1) + , hsaQueueMem_(nullptr) + , cmdAllocator_(nullptr) +{ + memset(&cal_, 0, sizeof(CalVirtualDesc)); + for (uint i = 0; i < AllEngines; ++i) { + cal_.events_[i].invalidate(); + } + + // Note: Virtual GPU device creation must be a thread safe operation + index_ = gpuDevice_.numOfVgpus_++; + gpuDevice_.vgpus_.resize(gpuDevice_.numOfVgpus()); + gpuDevice_.vgpus_[index()] = this; + queues_[MainEngine] = nullptr; + queues_[SdmaEngine] = nullptr; +} + +bool +VirtualGPU::create(bool profiling, uint deviceQueueSize) +{ + device::BlitManager::Setup blitSetup; + + if (index() >= GPU_MAX_COMMAND_QUEUES) { + // Cap the maximum number of concurrent Virtual GPUs + return false; + } + + // Virtual GPU will have profiling enabled + state_.profiling_ = profiling; + + Pal::CmdAllocatorCreateInfo createInfo = {}; + // \todo forces PAL to reuse CBs, but requires postamble + createInfo.flags.autoMemoryReuse = true; + createInfo.flags.threadSafe = false; + createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = + Pal::GpuHeapGartCacheable; + createInfo.allocInfo[Pal::CommandDataAlloc].allocSize = 128 * Ki; + createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize = 128 * Ki; + + createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocHeap = + Pal::GpuHeapGartCacheable; + createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocSize = 64 * Ki; + createInfo.allocInfo[Pal::EmbeddedDataAlloc].suballocSize = 64 * Ki; + + Pal::Result 
result; + size_t cmdAllocSize = dev().iDev()->GetCmdAllocatorSize(createInfo, &result); + if (Pal::Result::Success != result) { + return false; + } + char* addr = new char [cmdAllocSize]; + if (Pal::Result::Success != + dev().iDev()->CreateCmdAllocator(createInfo, addr, &cmdAllocator_)) { + return false; + } + + if (dev().numComputeEngines()) { + uint idx = index() % dev().numComputeEngines(); + + // hwRing_ should be set 0 if forced to have single scratch buffer + hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx; + + queues_[MainEngine] = Queue::Create( + dev().iDev(), Pal::QueueTypeCompute, idx, cmdAllocator_); + if (nullptr == queues_[MainEngine]) { + return false; + } + + // Check if device has SDMA engines + if (dev().numDMAEngines() != 0) { + queues_[SdmaEngine] = Queue::Create( + dev().iDev(), Pal::QueueTypeDma, + idx % dev().numDMAEngines(), cmdAllocator_); + if (nullptr == queues_[SdmaEngine]) { + return false; + } + } + else { + Unimplemented(); + } + } + else { + Unimplemented(); + } + + // Diable double copy optimization, + // since UAV read from nonlocal is fast enough + blitSetup.disableCopyBufferToImageOpt_ = true; + if (!allocConstantBuffers()) { + return false; + } + + // Create Printf class + printfDbg_ = new PrintfDbg(gpuDevice_); + if ((nullptr == printfDbg_) || !printfDbg_->create()) { + delete printfDbg_; + LogError("Could not allocate debug buffer for printf()!"); + return false; + } + + // Create HSAILPrintf class + printfDbgHSA_ = new PrintfDbgHSA(gpuDevice_); + if (nullptr == printfDbgHSA_) { + delete printfDbgHSA_; + LogError("Could not create PrintfDbgHSA class!"); + return false; + } + + // Choose the appropriate class for blit engine + switch (dev().settings().blitEngine_) { + default: + // Fall through ... + case Settings::BlitEngineHost: + blitSetup.disableAll(); + // Fall through ... 
+ case Settings::BlitEngineCAL: + case Settings::BlitEngineKernel: + // use host blit for HW debug + if (dev().settings().enableHwDebug_) { + blitSetup.disableCopyImageToBuffer_ = true; + blitSetup.disableCopyBufferToImage_ = true; + } + blitMgr_ = new KernelBlitManager(*this, blitSetup); + break; + } + if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) { + LogError("Could not create BlitManager!"); + return false; + } + + tsCache_ = new TimeStampCache(*this); + if (nullptr == tsCache_) { + LogError("Could not create TimeStamp cache!"); + return false; + } + + if (!memoryDependency().create(dev().settings().numMemDependencies_)) { + LogError("Could not create the array of memory objects!"); + return false; + } + + if(!allocHsaQueueMem()) { + LogError("Could not create hsaQueueMem object!"); + return false; + } + + // Check if the app requested a device queue creation + if (dev().settings().useDeviceQueue_ && + (0 != deviceQueueSize) && !createVirtualQueue(deviceQueueSize)) { + LogError("Could not create a virtual queue!"); + return false; + } + + return true; +} + +bool +VirtualGPU::allocHsaQueueMem() +{ + // Allocate a dummy HSA queue + hsaQueueMem_ = new Memory(dev(), sizeof(amd_queue_t)); + if ((hsaQueueMem_ == nullptr) || + (!hsaQueueMem_->create(Resource::RemoteUSWC))) { + delete hsaQueueMem_; + return false; + } + amd_queue_t* queue = reinterpret_cast + (hsaQueueMem_->map(nullptr, Resource::WriteOnly)); + if (nullptr == queue) { + delete hsaQueueMem_; + return false; + } + memset(queue, 0, sizeof(amd_queue_t)); + + // Provide private and local heap addresses + const static uint addressShift = LP64_SWITCH(0, 32); + LogWarning("Private/Shared aperture isn't set"); +/* queue->private_segment_aperture_base_hi = + static_cast(dev().gslCtx()->getPrivateApertureBase()>>addressShift); + queue->group_segment_aperture_base_hi = + static_cast(dev().gslCtx()->getSharedApertureBase()>>addressShift); +*/ + hsaQueueMem_->unmap(nullptr); + return true; +} + 
+VirtualGPU::~VirtualGPU() +{ + // Not safe to remove a queue. So lock the device + amd::ScopedLock k(dev().lockAsyncOps()); + amd::ScopedLock lock(dev().vgpusAccess()); + + // Destroy all memories + static const bool SkipScratch = false; + releaseMemObjects(SkipScratch); + + // Destroy printf object + delete printfDbg_; + + // Destroy printfHSA object + delete printfDbgHSA_; + + // Destroy BlitManager object + delete blitMgr_; + + // Destroy TimeStamp cache + delete tsCache_; + + // Destroy resource list with the constant buffers + for (uint i = 0; i < constBufs_.size(); ++i) { + delete constBufs_[i]; + } + + // Destroy queues + if (nullptr != queues_[MainEngine]) { + // Make sure the queues are idle + // It's unclear why PAL could still have a busy queue + queues_[MainEngine]->iQueue_->WaitIdle(); + delete queues_[MainEngine]; + } + + if (nullptr != queues_[SdmaEngine]) { + queues_[SdmaEngine]->iQueue_->WaitIdle(); + delete queues_[SdmaEngine]; + } + + if (nullptr != cmdAllocator_) { + cmdAllocator_->Destroy(); + delete [] reinterpret_cast(cmdAllocator_); + } + + gpuDevice_.numOfVgpus_--; + gpuDevice_.vgpus_.erase(gpuDevice_.vgpus_.begin() + index()); + for (uint idx = index(); idx < dev().vgpus().size(); ++idx) { + dev().vgpus()[idx]->index_--; + } + + // Release scratch buffer memory to reduce memory pressure + //!@note OCLtst uses single device with multiple tests + //! Release memory only if it's the last command queue. + //! The first queue is reserved for the transfers on device + if (gpuDevice_.numOfVgpus_ <= 1) { + gpuDevice_.destroyScratchBuffers(); + } + + //! 
@todo Temporarily keep the buffer mapped for debug purpose + if (nullptr != schedParams_) { + schedParams_->unmap(this); + } + delete vqHeader_; + delete virtualQueue_; + delete schedParams_; + delete hsaQueueMem_; +} + +void +VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + // Translate memory references and ensure cache up-to-date + pal::Memory* memory = dev().getGpuMemory(&vcmd.source()); + + size_t offset = 0; + // Find if virtual address is a CL allocation + pal::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset); + + profilingBegin(vcmd, true); + + memory->syncCacheFromHost(*this); + cl_command_type type = vcmd.type(); + bool result = false; + amd::Memory* bufferFromImage = nullptr; + + // Force buffer read for IMAGE1D_BUFFER + if ((type == CL_COMMAND_READ_IMAGE) && + (vcmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + bufferFromImage = createBufferFromImage(vcmd.source()); + if (nullptr == bufferFromImage) { + LogError("We should not fail buffer creation from image_buffer!"); + } + else { + type = CL_COMMAND_READ_BUFFER; + bufferFromImage->setVirtualDevice(this); + memory = dev().getGpuMemory(bufferFromImage); + } + } + + // Process different write commands + switch (type) { + case CL_COMMAND_READ_BUFFER: { + amd::Coord3D origin(vcmd.origin()[0]); + amd::Coord3D size(vcmd.size()[0]); + if (nullptr != bufferFromImage) { + size_t elemSize = + vcmd.source().asImage()->getImageFormat().getElementSize(); + origin.c[0] *= elemSize; + size.c[0] *= elemSize; + } + if (hostMemory != nullptr) { + // Accelerated transfer without pinning + amd::Coord3D dstOrigin(offset); + result = blitMgr().copyBuffer(*memory, *hostMemory, + origin, dstOrigin, size, vcmd.isEntireMemory()); + } + else { + result = blitMgr().readBuffer( + *memory, vcmd.destination(), + origin, size, vcmd.isEntireMemory()); + } + if (nullptr != 
bufferFromImage) { + bufferFromImage->release(); + } + } + break; + case CL_COMMAND_READ_BUFFER_RECT: { + amd::BufferRect hostbufferRect; + amd::Coord3D region(0); + amd::Coord3D hostOrigin(vcmd.hostRect().start_+ offset); + hostbufferRect.create(hostOrigin.c, vcmd.size().c , vcmd.hostRect().rowPitch_, vcmd.hostRect().slicePitch_); + if (hostMemory != nullptr) { + result = blitMgr().copyBufferRect(*memory, *hostMemory, + vcmd.bufRect(), hostbufferRect, vcmd.size(), + vcmd.isEntireMemory()); + } + else { + result = blitMgr().readBufferRect(*memory, + vcmd.destination(), vcmd.bufRect(), vcmd.hostRect(), vcmd.size(), + vcmd.isEntireMemory()); + } + } + break; + case CL_COMMAND_READ_IMAGE: + if (hostMemory != nullptr) { + // Accelerated image to buffer transfer without pinning + amd::Coord3D dstOrigin(offset); + result = blitMgr().copyImageToBuffer(*memory, *hostMemory, + vcmd.origin(), dstOrigin, vcmd.size(), + vcmd.isEntireMemory(), + vcmd.rowPitch(), vcmd.slicePitch()); + } + else { + result = blitMgr().readImage(*memory, vcmd.destination(), + vcmd.origin(), vcmd.size(), vcmd.rowPitch(), vcmd.slicePitch(), + vcmd.isEntireMemory()); + } + break; + default: + LogError("Unsupported type for the read command"); + break; + } + + if (!result) { + LogError("submitReadMemory failed!"); + vcmd.setStatus(CL_INVALID_OPERATION); + } + + profilingEnd(vcmd); +} + +void +VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + // Translate memory references and ensure cache up to date + pal::Memory* memory = dev().getGpuMemory(&vcmd.destination()); + size_t offset = 0; + // Find if virtual address is a CL allocation + pal::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset); + + profilingBegin(vcmd, true); + + bool entire = vcmd.isEntireMemory(); + + // Synchronize memory from host if necessary + device::Memory::SyncFlags syncFlags; + 
syncFlags.skipEntire_ = entire; + memory->syncCacheFromHost(*this, syncFlags); + + cl_command_type type = vcmd.type(); + bool result = false; + amd::Memory* bufferFromImage = nullptr; + + // Force buffer write for IMAGE1D_BUFFER + if ((type == CL_COMMAND_WRITE_IMAGE) && + (vcmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + bufferFromImage = createBufferFromImage(vcmd.destination()); + if (nullptr == bufferFromImage) { + LogError("We should not fail buffer creation from image_buffer!"); + } + else { + type = CL_COMMAND_WRITE_BUFFER; + bufferFromImage->setVirtualDevice(this); + memory = dev().getGpuMemory(bufferFromImage); + } + } + + // Process different write commands + switch (type) { + case CL_COMMAND_WRITE_BUFFER: { + amd::Coord3D origin(vcmd.origin()[0]); + amd::Coord3D size(vcmd.size()[0]); + if (nullptr != bufferFromImage) { + size_t elemSize = + vcmd.destination().asImage()->getImageFormat().getElementSize(); + origin.c[0] *= elemSize; + size.c[0] *= elemSize; + } + if (hostMemory != nullptr) { + // Accelerated transfer without pinning + amd::Coord3D srcOrigin(offset); + result = blitMgr().copyBuffer(*hostMemory, *memory, + srcOrigin, origin, size, vcmd.isEntireMemory()); + } + else { + result = blitMgr().writeBuffer(vcmd.source(), *memory, + origin, size, vcmd.isEntireMemory()); + } + if (nullptr != bufferFromImage) { + bufferFromImage->release(); + } + } + break; + case CL_COMMAND_WRITE_BUFFER_RECT: { + amd::BufferRect hostbufferRect; + amd::Coord3D region(0); + amd::Coord3D hostOrigin(vcmd.hostRect().start_+ offset); + hostbufferRect.create(hostOrigin.c, vcmd.size().c , vcmd.hostRect().rowPitch_, vcmd.hostRect().slicePitch_); + if (hostMemory != nullptr) { + result = blitMgr().copyBufferRect(*hostMemory, *memory, + hostbufferRect, vcmd.bufRect(), vcmd.size(), + vcmd.isEntireMemory()); + } + else { + result = blitMgr().writeBufferRect(vcmd.source(), *memory, + vcmd.hostRect(), vcmd.bufRect(), vcmd.size(), + vcmd.isEntireMemory()); + } + } + 
break; + case CL_COMMAND_WRITE_IMAGE: + if (hostMemory != nullptr) { + // Accelerated buffer to image transfer without pinning + amd::Coord3D srcOrigin(offset); + result = blitMgr().copyBufferToImage(*hostMemory, *memory, + srcOrigin, vcmd.origin(), vcmd.size(), + vcmd.isEntireMemory(), + vcmd.rowPitch(), vcmd.slicePitch()); + } + else { + result = blitMgr().writeImage(vcmd.source(), *memory, + vcmd.origin(), vcmd.size(), vcmd.rowPitch(), vcmd.slicePitch(), + vcmd.isEntireMemory()); + } + break; + default: + LogError("Unsupported type for the write command"); + break; + } + + if (!result) { + LogError("submitWriteMemory failed!"); + vcmd.setStatus(CL_INVALID_OPERATION); + } + else { + // Mark this as the most-recently written cache of the destination + vcmd.destination().signalWrite(&gpuDevice_); + } + profilingEnd(vcmd); +} + +bool +VirtualGPU::copyMemory(cl_command_type type + , amd::Memory& srcMem + , amd::Memory& dstMem + , bool entire + , const amd::Coord3D& srcOrigin + , const amd::Coord3D& dstOrigin + , const amd::Coord3D& size + , const amd::BufferRect& srcRect + , const amd::BufferRect& dstRect + ) +{ + // Translate memory references and ensure cache up-to-date + pal::Memory* dstMemory = dev().getGpuMemory(&dstMem); + pal::Memory* srcMemory = dev().getGpuMemory(&srcMem); + + // Synchronize source and destination memory + device::Memory::SyncFlags syncFlags; + syncFlags.skipEntire_ = entire; + dstMemory->syncCacheFromHost(*this, syncFlags); + srcMemory->syncCacheFromHost(*this); + + amd::Memory* bufferFromImageSrc = nullptr; + amd::Memory* bufferFromImageDst = nullptr; + + // Force buffer read for IMAGE1D_BUFFER + if ((srcMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + bufferFromImageSrc = createBufferFromImage(srcMem); + if (nullptr == bufferFromImageSrc) { + LogError("We should not fail buffer creation from image_buffer!"); + } + else { + type = CL_COMMAND_COPY_BUFFER; + bufferFromImageSrc->setVirtualDevice(this); + srcMemory = 
dev().getGpuMemory(bufferFromImageSrc); + } + } + // Force buffer write for IMAGE1D_BUFFER + if ((dstMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + bufferFromImageDst = createBufferFromImage(dstMem); + if (nullptr == bufferFromImageDst) { + LogError("We should not fail buffer creation from image_buffer!"); + } + else { + type = CL_COMMAND_COPY_BUFFER; + bufferFromImageDst->setVirtualDevice(this); + dstMemory = dev().getGpuMemory(bufferFromImageDst); + } + } + + bool result = false; + + // Check if HW can be used for memory copy + switch (type) { + case CL_COMMAND_SVM_MEMCPY: + case CL_COMMAND_COPY_BUFFER: { + amd::Coord3D realSrcOrigin(srcOrigin[0]); + amd::Coord3D realDstOrigin(dstOrigin[0]); + amd::Coord3D realSize(size.c[0],size.c[1],size.c[2]); + + if (nullptr != bufferFromImageSrc) { + size_t elemSize = + srcMem.asImage()->getImageFormat().getElementSize(); + realSrcOrigin.c[0] *= elemSize; + if (nullptr != bufferFromImageDst) { + realDstOrigin.c[0] *= elemSize; + } + realSize.c[0] *= elemSize; + } + else if (nullptr != bufferFromImageDst) { + size_t elemSize = + dstMem.asImage()->getImageFormat().getElementSize(); + realDstOrigin.c[0] *= elemSize; + realSize.c[0] *= elemSize; + } + + result = blitMgr().copyBuffer(*srcMemory, *dstMemory, + realSrcOrigin, realDstOrigin, realSize, entire); + + if (nullptr != bufferFromImageSrc) { + bufferFromImageSrc->release(); + } + if (nullptr != bufferFromImageDst) { + bufferFromImageDst->release(); + } + } + break; + case CL_COMMAND_COPY_BUFFER_RECT: + result = blitMgr().copyBufferRect(*srcMemory, *dstMemory, + srcRect, dstRect, size, entire); + break; + case CL_COMMAND_COPY_IMAGE_TO_BUFFER: + result = blitMgr().copyImageToBuffer(*srcMemory, *dstMemory, + srcOrigin, dstOrigin, size, entire); + break; + case CL_COMMAND_COPY_BUFFER_TO_IMAGE: + result = blitMgr().copyBufferToImage(*srcMemory, *dstMemory, + srcOrigin, dstOrigin, size, entire); + break; + case CL_COMMAND_COPY_IMAGE: + result = 
blitMgr().copyImage(*srcMemory, *dstMemory, + srcOrigin, dstOrigin, size, entire); + break; + default: + LogError("Unsupported command type for memory copy!"); + break; + } + + if (!result) { + LogError("submitCopyMemory failed!"); + return false; + } + else { + // Mark this as the most-recently written cache of the destination + dstMem.signalWrite(&gpuDevice_); + } + return true; +} + +void +VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& vcmd) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd); + + cl_command_type type = vcmd.type(); + bool entire = vcmd.isEntireMemory(); + + if (!copyMemory(type, vcmd.source(), vcmd.destination(), entire, + vcmd.srcOrigin(), vcmd.dstOrigin(), vcmd.size(), vcmd.srcRect(), + vcmd.dstRect())) { + vcmd.setStatus(CL_INVALID_OPERATION); + } + + profilingEnd(vcmd); +} + +void +VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& vcmd) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + profilingBegin(vcmd); + + cl_command_type type = vcmd.type(); + //no op for FGS supported device + if (!dev().isFineGrainedSystem()) { + + amd::Memory* srcMem = amd::SvmManager::FindSvmBuffer(vcmd.src()); + amd::Memory* dstMem = amd::SvmManager::FindSvmBuffer(vcmd.dst()); + if (nullptr == srcMem || nullptr == dstMem) { + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + + amd::Coord3D srcOrigin(0, 0, 0); + amd::Coord3D dstOrigin(0, 0, 0); + amd::Coord3D size(vcmd.srcSize(), 1, 1); + amd::BufferRect srcRect; + amd::BufferRect dstRect; + + srcOrigin.c[0] = static_cast(vcmd.src()) - static_cast
(srcMem->getSvmPtr()); + dstOrigin.c[0] = static_cast(vcmd.dst()) - static_cast
(dstMem->getSvmPtr()); + + if (!(srcMem->validateRegion(srcOrigin, size)) || !(dstMem->validateRegion(dstOrigin, size))) { + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + + bool entire = srcMem->isEntirelyCovered(srcOrigin, size) && + dstMem->isEntirelyCovered(dstOrigin, size); + + if (!copyMemory(type, *srcMem, *dstMem, entire, + srcOrigin, dstOrigin, size, srcRect, dstRect)) { + vcmd.setStatus(CL_INVALID_OPERATION); + } + } + else { + //direct memcpy for FGS enabled system + amd::SvmBuffer::memFill(vcmd.dst(), vcmd.src(), vcmd.srcSize(), 1); + } + profilingEnd(vcmd); +} + +void +VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd, true); + + pal::Memory* memory = dev().getGpuMemory(&vcmd.memory()); + + // Save map info for unmap operation + memory->saveMapInfo(vcmd.origin(), vcmd.size(), + vcmd.mapFlags(), vcmd.isEntireMemory()); + + // If we have host memory, use it + if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) { + if (!memory->isHostMemDirectAccess()) { + // Make sure GPU finished operation before + // synchronization with the backing store + memory->wait(*this); + } + + // Target is the backing store, so just ensure that owner is up-to-date + memory->owner()->cacheWriteBack(); + + // Add memory to VA cache, so rutnime can detect direct access to VA + dev().addVACache(memory); + } + else if (memory->isPersistentDirectMap()) { + // Nothing to do here + } + else if (memory->mapMemory() != nullptr) { + // Target is a remote resource, so copy + assert(memory->mapMemory() != nullptr); + if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) { + amd::Coord3D dstOrigin(0, 0, 0); + if (memory->desc().buffer_) { + if (!blitMgr().copyBuffer(*memory, + *memory->mapMemory(), vcmd.origin(), dstOrigin, + vcmd.size(), vcmd.isEntireMemory())) { + LogError("submitMapMemory() - copy failed"); + 
vcmd.setStatus(CL_MAP_FAILURE); + } + } + else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + amd::Memory* bufferFromImage = nullptr; + Memory* memoryBuf = memory; + amd::Coord3D origin(vcmd.origin()[0]); + amd::Coord3D size(vcmd.size()[0]); + size_t elemSize = + vcmd.memory().asImage()->getImageFormat().getElementSize(); + origin.c[0] *= elemSize; + size.c[0] *= elemSize; + + bufferFromImage = createBufferFromImage(vcmd.memory()); + if (nullptr == bufferFromImage) { + LogError("We should not fail buffer creation from image_buffer!"); + } + else { + bufferFromImage->setVirtualDevice(this); + memoryBuf = dev().getGpuMemory(bufferFromImage); + } + if (!blitMgr().copyBuffer(*memoryBuf, + *memory->mapMemory(), origin, dstOrigin, + size, vcmd.isEntireMemory())) { + LogError("submitMapMemory() - copy failed"); + vcmd.setStatus(CL_MAP_FAILURE); + } + if (nullptr != bufferFromImage) { + bufferFromImage->release(); + } + } + else { + // Validate if it's a view for a map of mip level + if (vcmd.memory().parent() != nullptr) { + amd::Image* amdImage = vcmd.memory().parent()->asImage(); + if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1)) { + // Save map write info in the parent object + dev().getGpuMemory(amdImage)->saveMapInfo( + vcmd.origin(), vcmd.size(), + vcmd.mapFlags(), vcmd.isEntireMemory(), + vcmd.memory().asImage()); + } + } + if (!blitMgr().copyImageToBuffer(*memory, + *memory->mapMemory(), vcmd.origin(), dstOrigin, + vcmd.size(), vcmd.isEntireMemory())) { + LogError("submitMapMemory() - copy failed"); + vcmd.setStatus(CL_MAP_FAILURE); + } + } + } + } + else { + LogError("Unhandled map!"); + } + + profilingEnd(vcmd); +} + +void +VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd, true); + pal::Memory* memory = dev().getGpuMemory(&vcmd.memory()); + amd::Memory* owner = memory->owner(); + bool 
unmapMip = false; + + // Check if image is a mipmap and assign a saved view + amd::Image* amdImage = owner->asImage(); + if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) && + (memory->writeMapInfo()->baseMip_ != nullptr)) { + // Clear unmap flags from the parent image + memory->clearUnmapFlags(); + // Assign mip level view + amdImage = memory->writeMapInfo()->baseMip_; + memory = dev().getGpuMemory(amdImage); + unmapMip = true; + } + + // We used host memory + if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) { + if (memory->isUnmapWrite() && !owner->usesSvmPointer()) { + // Target is the backing store, so sync + owner->signalWrite(nullptr); + memory->syncCacheFromHost(*this); + } + // Remove memory from VA cache + dev().removeVACache(memory); + } + // data check was added for persistent memory that failed to get aperture + // and therefore are treated like a remote resource + else if (memory->isPersistentDirectMap() && (memory->data() != nullptr)) { + memory->unmap(this); + } + else if (memory->mapMemory() != nullptr) { + if (memory->isUnmapWrite()) { + amd::Coord3D srcOrigin(0, 0, 0); + // Target is a remote resource, so copy + assert(memory->mapMemory() != nullptr); + if (memory->desc().buffer_) { + if (!blitMgr().copyBuffer( + *memory->mapMemory(), *memory, + srcOrigin, + memory->writeMapInfo()->origin_, + memory->writeMapInfo()->region_, + memory->writeMapInfo()->entire_)) { + LogError("submitUnmapMemory() - copy failed"); + vcmd.setStatus(CL_OUT_OF_RESOURCES); + } + } + else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + amd::Memory* bufferFromImage = nullptr; + Memory* memoryBuf = memory; + amd::Coord3D origin(memory->writeMapInfo()->origin_[0]); + amd::Coord3D size(memory->writeMapInfo()->region_[0]); + size_t elemSize = + vcmd.memory().asImage()->getImageFormat().getElementSize(); + origin.c[0] *= elemSize; + size.c[0] *= elemSize; + + bufferFromImage = createBufferFromImage(vcmd.memory()); + if (nullptr == 
bufferFromImage) { + LogError("We should not fail buffer creation from image_buffer!"); + } + else { + bufferFromImage->setVirtualDevice(this); + memoryBuf = dev().getGpuMemory(bufferFromImage); + } + if (!blitMgr().copyBuffer( + *memory->mapMemory(), *memoryBuf, + srcOrigin, origin, size, + memory->writeMapInfo()->entire_)) { + LogError("submitUnmapMemory() - copy failed"); + vcmd.setStatus(CL_OUT_OF_RESOURCES); + } + if (nullptr != bufferFromImage) { + bufferFromImage->release(); + } + } + else { + if (!blitMgr().copyBufferToImage( + *memory->mapMemory(), *memory, + srcOrigin, + memory->writeMapInfo()->origin_, + memory->writeMapInfo()->region_, + memory->writeMapInfo()->entire_)) { + LogError("submitUnmapMemory() - copy failed"); + vcmd.setStatus(CL_OUT_OF_RESOURCES); + } + } + } + } + else { + LogError("Unhandled unmap!"); + vcmd.setStatus(CL_INVALID_VALUE); + } + + // Clear unmap flags + memory->clearUnmapFlags(); + + // Release a view for a mipmap map + if (unmapMip) { + amdImage->release(); + } + profilingEnd(vcmd); +} + +bool +VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern, + size_t patternSize, const amd::Coord3D& origin, const amd::Coord3D& size) +{ + pal::Memory* memory = dev().getGpuMemory(amdMemory); + bool entire = amdMemory->isEntirelyCovered(origin, size); + + // Synchronize memory from host if necessary + device::Memory::SyncFlags syncFlags; + syncFlags.skipEntire_ = entire; + memory->syncCacheFromHost(*this, syncFlags); + + bool result = false; + amd::Memory* bufferFromImage = nullptr; + float fillValue[4]; + + // Force fill buffer for IMAGE1D_BUFFER + if ((type == CL_COMMAND_FILL_IMAGE) && + (amdMemory->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { + bufferFromImage = createBufferFromImage(*amdMemory); + if (nullptr == bufferFromImage) { + LogError("We should not fail buffer creation from image_buffer!"); + } + else { + type = CL_COMMAND_FILL_BUFFER; + bufferFromImage->setVirtualDevice(this); + memory = 
dev().getGpuMemory(bufferFromImage); + } + } + + // Find the the right fill operation + switch (type) { + case CL_COMMAND_FILL_BUFFER : + case CL_COMMAND_SVM_MEMFILL : { + amd::Coord3D realOrigin(origin[0]); + amd::Coord3D realSize(size[0]); + // Reprogram fill parameters if it's an IMAGE1D_BUFFER object + if (nullptr != bufferFromImage) { + size_t elemSize = + amdMemory->asImage()->getImageFormat().getElementSize(); + realOrigin.c[0] *= elemSize; + realSize.c[0] *= elemSize; + memset(fillValue, 0, sizeof(fillValue)); + amdMemory->asImage()->getImageFormat().formatColor(pattern, fillValue); + pattern = fillValue; + patternSize = elemSize; + } + result = blitMgr().fillBuffer(*memory, pattern, + patternSize, realOrigin, realSize, amdMemory->isEntirelyCovered(origin, size)); + if (nullptr != bufferFromImage) { + bufferFromImage->release(); + } + } + break; + case CL_COMMAND_FILL_IMAGE: + result = blitMgr().fillImage(*memory, pattern, + origin, size, amdMemory->isEntirelyCovered(origin, size)); + break; + default: + LogError("Unsupported command type for FillMemory!"); + break; + } + + if (!result) { + LogError("fillMemory failed!"); + return false; + } + + // Mark this as the most-recently written cache of the destination + amdMemory->signalWrite(&gpuDevice_); + return true; +} + +void +VirtualGPU::submitFillMemory(amd::FillMemoryCommand& vcmd) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd, true); + + if (!fillMemory(vcmd.type(), &vcmd.memory(),vcmd.pattern(), + vcmd.patternSize(), vcmd.origin(), vcmd.size())) { + vcmd.setStatus(CL_INVALID_OPERATION); + } + + profilingEnd(vcmd); +} + +void +VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd, true); + + //no op for FGS supported device + if (!dev().isFineGrainedSystem()) { + // Make sure we have 
memory for the command execution + pal::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem()); + + memory->saveMapInfo(vcmd.origin(), vcmd.size(), + vcmd.mapFlags(), vcmd.isEntireMemory()); + + if (memory->mapMemory() != nullptr) { + if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) { + amd::Coord3D dstOrigin(0, 0, 0); + assert(memory->desc().buffer_ && "SVM memory can't be an image"); + if (!blitMgr().copyBuffer(*memory, *memory->mapMemory(), + vcmd.origin(), dstOrigin, vcmd.size(), vcmd.isEntireMemory())) { + LogError("submitSVMMapMemory() - copy failed"); + vcmd.setStatus(CL_MAP_FAILURE); + } + } + } + else { + LogError("Unhandled svm map!"); + } + } + + profilingEnd(vcmd); +} + +void +VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& vcmd) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + profilingBegin(vcmd, true); + + //no op for FGS supported device + if (!dev().isFineGrainedSystem()) { + + pal::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem()); + if (memory->mapMemory() != nullptr) { + if (memory->isUnmapWrite()) { + amd::Coord3D srcOrigin(0, 0, 0); + // Target is a remote resource, so copy + assert(memory->desc().buffer_ && "SVM memory can't be an image"); + if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, srcOrigin, + memory->writeMapInfo()->origin_, memory->writeMapInfo()->region_, + memory->writeMapInfo()->entire_)) { + LogError("submitSvmUnmapMemory() - copy failed"); + vcmd.setStatus(CL_OUT_OF_RESOURCES); + } + } + } + } + + profilingEnd(vcmd); +} + +void +VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd, true); + + if (!dev().isFineGrainedSystem()) { + size_t patternSize = vcmd.patternSize(); + size_t fillSize = patternSize * vcmd.times(); + size_t offset = 0; + amd::Memory* dstMemory = 
amd::SvmManager::FindSvmBuffer(vcmd.dst()); + assert(dstMemory&&"No svm Buffer to fill with!"); + offset = reinterpret_cast(vcmd.dst()) + - reinterpret_cast(dstMemory->getSvmPtr()); + assert((offset >= 0) && "wrong svm ptr to fill with!"); + + pal::Memory* memory = dev().getGpuMemory(dstMemory); + + amd::Coord3D origin(offset, 0, 0); + amd::Coord3D size(fillSize, 1, 1); + assert((dstMemory->validateRegion(origin, size)) && "The incorrect fill size!"); + + if (!fillMemory(vcmd.type(), dstMemory, vcmd.pattern(), + vcmd.patternSize(), origin, size)) { + vcmd.setStatus(CL_INVALID_OPERATION); + } + } + else { + // for FGS capable device, fill CPU memory directly + amd::SvmBuffer::memFill(vcmd.dst(), vcmd.pattern(), vcmd.patternSize(), vcmd.times()); + } + + profilingEnd(vcmd); +} + +void +VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd, true); + + std::vector::const_iterator itr; + for (itr = vcmd.memObjects().begin(); itr != vcmd.memObjects().end(); itr++) { + // Find device memory + pal::Memory* memory = dev().getGpuMemory(*itr); + + if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) { + memory->mgpuCacheWriteBack(); + } + else if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) { + // Synchronize memory from host if necessary. 
+ // The sync function will perform memory migration from + // another device if necessary + device::Memory::SyncFlags syncFlags; + memory->syncCacheFromHost(*this, syncFlags); + } + else { + LogWarning("Unknown operation for memory migration!"); + } + } + + profilingEnd(vcmd); +} + +void +VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) +{ + // in-order semantics: previous commands need to be done before we start + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd); + std::vector& svmPointers = vcmd.svmPointers(); + if (vcmd.pfnFreeFunc() == nullptr) { + // pointers allocated using clSVMAlloc + for (cl_uint i = 0; i < svmPointers.size(); i++) { + dev().svmFree(svmPointers[i]); + } + } + else { + vcmd.pfnFreeFunc()(as_cl(vcmd.queue()->asCommandQueue()), svmPointers.size(), + static_cast(&(svmPointers[0])), vcmd.userData()); + } + profilingEnd(vcmd); +} + +void +VirtualGPU::findIterations( + const amd::NDRangeContainer& sizes, + const amd::NDRange& local, + amd::NDRange& groups, + amd::NDRange& remainder, + size_t& extra) +{ + size_t dimensions = sizes.dimensions(); + + if (cal()->iterations_ > 1) { + size_t iterations = cal()->iterations_; + cal_.iterations_ = 1; + + // Find the total amount of all groups + groups = sizes.global() / local; + if (dev().settings().partialDispatch_) { + for (uint j = 0; j < dimensions; ++j) { + if ((sizes.global()[j] % local[j]) != 0) { + groups[j]++; + } + } + } + + // Calculate the real number of required iterations and + // the workgroup size of each iteration + for (int j = (dimensions - 1); j >= 0; --j) { + // Find possible size of each iteration + size_t tmp = (groups[j] / iterations); + // Make sure the group size is more than 1 + if (tmp > 0) { + remainder = groups; + remainder[j] = (groups[j] % tmp); + + extra = ((groups[j] / tmp) + + // Check for the remainder + ((remainder[j] != 0) ? 
1 : 0)); + // Recalculate the number of iterations + cal_.iterations_ *= extra; + if (remainder[j] == 0) { + extra = 0; + } + groups[j] = tmp; + break; + } + else { + iterations = ((iterations / groups[j]) + + (((iterations % groups[j]) != 0) ? 1 : 0)); + cal_.iterations_ *= groups[j]; + groups[j] = 1; + } + } + } +} + +void +VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd); + + // Submit kernel to HW + if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, + &vcmd.event())) { + vcmd.setStatus(CL_INVALID_OPERATION); + } + + profilingEnd(vcmd); +} + +bool +VirtualGPU::submitKernelInternal( + const amd::NDRangeContainer& sizes, + const amd::Kernel& kernel, + const_address parameters, + bool nativeMem, + amd::Event* enqueueEvent) +{ + uint64_t vmParentWrap = 0; + uint64_t vmDefQueue = 0; + amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev()); + VirtualGPU* gpuDefQueue = nullptr; + amd::HwDebugManager * dbgManager = dev().hwDebugMgr(); + + // Get the HSA kernel object + const HSAILKernel& hsaKernel = + static_cast(*(kernel.getDeviceKernel(dev()))); + std::vector memList; + + bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? 
true:false; + if (!printfDbgHSA().init(*this, printfEnabled )) { + LogError( "Printf debug buffer initialization failed!"); + return false; + } + + // Check memory dependency and SVM objects + if (!processMemObjectsHSA(kernel, parameters, nativeMem, &memList)) { + LogError("Wrong memory objects!"); + return false; + } + + if (hsaKernel.dynamicParallelism()) { + if (nullptr == defQueue) { + LogError("Default device queue wasn't allocated"); + return false; + } + else { + if (dev().settings().useDeviceQueue_) { + gpuDefQueue = static_cast(defQueue->vDev()); + if (gpuDefQueue->hwRing() == hwRing()) { + LogError("Can't submit the child kernels to the same HW ring as the host queue!"); + return false; + } + } + else { + createVirtualQueue(defQueue->size()); + gpuDefQueue = this; + } + } + vmDefQueue = gpuDefQueue->virtualQueue_->vmAddress(); + + // Add memory handles before the actual dispatch + memList.push_back(gpuDefQueue->virtualQueue_); + memList.push_back(gpuDefQueue->schedParams_); + memList.push_back(hsaKernel.prog().kernelTable()); + gpuDefQueue->writeVQueueHeader(*this, + hsaKernel.prog().kernelTable()->vmAddress()); + } + + // setup the storage for the memory pointers of the kernel parameters + uint numParams = kernel.signature().numParameters(); + if (dbgManager) { + dbgManager->allocParamMemList(numParams); + } + + size_t newOffset[3] = {0, 0, 0}; + size_t newGlobalSize[3] = {0, 0, 0}; + + int dim = -1; + int iteration = 1; + size_t globalStep = 0; + for (uint i = 0; i < sizes.dimensions(); i++) { + newGlobalSize[i] = sizes.global()[i]; + newOffset[i] = sizes.offset()[i]; + } + // Check if it is blit kernel. If it is, then check if split is needed. + if (hsaKernel.isInternalKernel()) { + // Calculate new group size for each submission + for (uint i = 0; i < sizes.dimensions(); i++) { + if (sizes.global()[i] > static_cast(0xffffffff)) { + dim = i; + iteration = sizes.global()[i] / 0xC0000000 + + ((sizes.global()[i] % 0xC0000000) ? 
1: 0); + globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration + * sizes.local()[dim]; + break; + } + } + } + + for (int j = 0; j < iteration; j++) { + // Reset global size for dimension dim if split is needed + if (dim != -1) { + newOffset[dim] = sizes.offset()[dim] + globalStep * j; + if (((newOffset[dim] + globalStep) < sizes.global()[dim]) && + (j != (iteration - 1))) { + newGlobalSize[dim] = globalStep; + } + else { + newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim]; + } + } + + amd::NDRangeContainer tmpSizes(sizes.dimensions(), + &newOffset[0], &newGlobalSize[0], + &(const_cast(sizes).local()[0])); + + // Program the kernel arguments for the GPU execution + hsa_kernel_dispatch_packet_t* aqlPkt = + hsaKernel.loadArguments(*this, kernel, tmpSizes, parameters, nativeMem, + vmDefQueue, &vmParentWrap, memList); + if (nullptr == aqlPkt) { + LogError("Couldn't load kernel arguments"); + return false; + } + + const Device::ScratchBuffer* scratch = nullptr; + // Check if the device allocated more registers than the old setup + if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) { + scratch = dev().scratch(hwRing()); + memList.push_back(scratch->memObj_); + } + + // Add GSL handle to the memory list for VidMM + for (uint i = 0; i < memList.size(); ++i) { + addVmMemory(memList[i]); + } + + // HW Debug for the kernel? 
+ HwDbgKernelInfo kernelInfo; + HwDbgKernelInfo *pKernelInfo = nullptr; + + if (dbgManager) { + buildKernelInfo(hsaKernel, aqlPkt, kernelInfo, enqueueEvent); + pKernelInfo = &kernelInfo; + } + + GpuEvent gpuEvent; + + // Run AQL dispatch in HW + eventBegin(MainEngine); + if (nullptr == scratch) { + iCmd()->CmdDispatchAql(aqlPkt, 0, 0, 0, + hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo, 0x3ff); + } + else { + iCmd()->CmdDispatchAql(aqlPkt, scratch->memObj_->vmAddress(), + scratch->size_, scratch->offset_, + hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo, 0x3ff); + } + eventEnd(MainEngine, gpuEvent); + + if (dbgManager && (nullptr != dbgManager->postDispatchCallBackFunc())) { + dbgManager->executePostDispatchCallBack(); + } + + if (hsaKernel.dynamicParallelism()) { + // Make sure exculsive access to the device queue + amd::ScopedLock(defQueue->lock()); + + if (GPU_PRINT_CHILD_KERNEL != 0) { + waitForEvent(&gpuEvent); + + AmdAqlWrap* wraps = (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]); + uint p = 0; + for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) { + if (wraps[i].state != 0) { + uint j; + if (p == GPU_PRINT_CHILD_KERNEL) { + break; + } + p++; + std::stringstream print; + print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase); + print << "Slot#: " << i << "\n"; + print << "\tenqueue_flags: " << wraps[i].enqueue_flags << "\n"; + print << "\tcommand_id: " << wraps[i].command_id << "\n"; + print << "\tchild_counter: " << wraps[i].child_counter << "\n"; + print << "\tcompletion: " << wraps[i].completion << "\n"; + print << "\tparent_wrap: " << wraps[i].parent_wrap << "\n"; + print << "\twait_list: " << wraps[i].wait_list << "\n"; + print << "\twait_num: " << wraps[i].wait_num << "\n"; + uint offsEvents = wraps[i].wait_list - + gpuDefQueue->virtualQueue_->vmAddress(); + size_t* events = reinterpret_cast( + gpuDefQueue->virtualQueue_->data() + offsEvents); + for (j = 0; j < 
wraps[i].wait_num; ++j) { + uint offs = static_cast(events[j]) - + gpuDefQueue->virtualQueue_->vmAddress(); + AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs); + print << "Wait Event#: " << j << "\n"; + print << "\tState: " << eventD->state << + "; Counter: " << eventD->counter << "\n"; + } + print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", "; + print << wraps[i].aql.workgroup_size_y << ", "; + print << wraps[i].aql.workgroup_size_z << "]\n"; + print << "GridSize[ " << wraps[i].aql.grid_size_x << ", "; + print << wraps[i].aql.grid_size_y << ", "; + print << wraps[i].aql.grid_size_z << "]\n"; + + uint64_t* kernels = (uint64_t*)( + const_cast(hsaKernel.prog().kernelTable())->map(this)); + for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) { + if (kernels[j] == wraps[i].aql.kernel_object) { + break; + } + } + const_cast(hsaKernel.prog().kernelTable())->unmap(this); + HSAILKernel* child = nullptr; + for (auto it = hsaKernel.prog().kernels().begin(); + it != hsaKernel.prog().kernels().end(); ++it) { + if (j == static_cast(it->second)->index()) { + child = static_cast(it->second); + } + } + if (child == nullptr) { + printf("Error: couldn't find child kernel!\n"); + continue; + } + const uint64_t kernarg_address = + static_cast(reinterpret_cast(wraps[i].aql.kernarg_address)); + uint offsArg = kernarg_address - + gpuDefQueue->virtualQueue_->vmAddress(); + address argum = gpuDefQueue->virtualQueue_->data() + offsArg; + print << "Kernel: " << child->name() << "\n"; + static const char* Names[HSAILKernel::MaxExtraArgumentsNum] = { + "Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "}; + for (j = 0; j < child->extraArgumentsNum(); ++j) { + print << "\t" << Names[j] << *(size_t*)argum; + print << "\n"; + argum += sizeof(size_t); + } + for (j = 0; j < child->numArguments(); ++j) { + print << "\t" << child->argument(j)->name_ << ": "; + for (int s = child->argument(j)->size_ - 1; s >= 0; --s) { + 
print.width(2); + print.fill('0'); + print << (uint32_t)(argum[s]); + } + argum += child->argument(j)->size_; + print << "\n"; + } + printf("%s", print.str().c_str()); + } + } + } + + if (!dev().settings().useDeviceQueue_) { + Unimplemented(); +/* + // Add the termination handshake to the host queue + eventBegin(MainEngine); + cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(), + vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, + vmParentWrap + offsetof(AmdAqlWrap, child_counter), + 0, dev().settings().useDeviceQueue_); + eventEnd(MainEngine, gpuEvent); +*/ + } + + // Get the global loop start before the scheduler + Unimplemented(); +/* + mcaddr loopStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart(); + static_cast(gpuDefQueue->blitMgr()).runScheduler( + *gpuDefQueue->virtualQueue_, + *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_, + gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); + const static bool FlushL2 = true; + gpuDefQueue->flushCUCaches(FlushL2); + + // Get the address of PM4 template and add write it to params + //! 
@note DMA flush must not occur between patch and the scheduler + mcaddr patchStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart(); +*/ + Pal::gpusize patchStart = 0; + // Program parameters for the scheduler + SchedulerParam* param = &reinterpret_cast + (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_]; + param->signal = 1; + // Scale clock to 1024 to avoid 64 bit div in the scheduler + param->eng_clk = (1000 * 1024) / dev().info().maxClockFrequency_; + param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/; + param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress(); + param->releaseHostCP = 0; + param->parentAQL = vmParentWrap; + param->dedicatedQueue = dev().settings().useDeviceQueue_; + param->useATC = dev().settings().svmFineGrainSystem_; + + // Fill the scratch buffer information + if (hsaKernel.prog().maxScratchRegs() > 0) { + pal::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObj_; + param->scratchSize = scratchBuf->size(); + param->scratch = scratchBuf->vmAddress(); + param->numMaxWaves = 32 * dev().info().maxComputeUnits_; + param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_; + memList.push_back(scratchBuf); + } + else { + param->numMaxWaves = 0; + param->scratchSize = 0; + param->scratch = 0; + param->scratchOffset = 0; + } + + // Add all kernels in the program to the mem list. + //! 
\note Runtime doesn't know which one will be called + hsaKernel.prog().fillResListWithKernels(memList); + + // Add GPU memory handle to the memory list for VidMM + for (uint i = 0; i < memList.size(); ++i) { + gpuDefQueue->addVmMemory(memList[i]); + } + + Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress() + + gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam); + Unimplemented(); +/* + gpuDefQueue->eventBegin(MainEngine); + gpuDefQueue->cs()->VirtualQueueDispatcherEnd( + gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_, + signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / + (DeviceQueueMaskSize * maskGroups_)); + gpuDefQueue->eventEnd(MainEngine, gpuEvent); +*/ + // Set GPU event for the used resources + for (uint i = 0; i < memList.size(); ++i) { + memList[i]->setBusy(*gpuDefQueue, gpuEvent); + } + + if (dev().settings().useDeviceQueue_) { + Unimplemented(); +/* + // Add the termination handshake to the host queue + eventBegin(MainEngine); + cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(), + vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, + vmParentWrap + offsetof(AmdAqlWrap, child_counter), + signalAddr, dev().settings().useDeviceQueue_); + eventEnd(MainEngine, gpuEvent); +*/ + } + + ++gpuDefQueue->schedParamIdx_ %= + gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam); + //! 
\todo optimize the wrap around + if (gpuDefQueue->schedParamIdx_ == 0) { + gpuDefQueue->schedParams_->wait(*gpuDefQueue); + } + } + + // Set GPU event for the used resources + for (uint i = 0; i < memList.size(); ++i) { + memList[i]->setBusy(*this, gpuEvent); + } + + // Update the global GPU event + setGpuEvent(gpuEvent); + + if (!printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) { + LogError("Couldn't read printf data from the buffer!\n"); + return false; + } + } + + return true; +} + +void +VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + Unimplemented(); //!< @todo: Unimplemented +} + +void +VirtualGPU::submitMarker(amd::Marker& vcmd) +{ + //!@note runtime doesn't need to lock this command on execution + + if (vcmd.waitingEvent() != nullptr) { + bool foundEvent = false; + + // Loop through all outstanding command batches + while (!cbList_.empty()) { + CommandBatchList::const_iterator it = cbList_.begin(); + // Wait for completion + foundEvent = awaitCompletion(*it, vcmd.waitingEvent()); + // Release a command batch + delete *it; + // Remove command batch from the list + cbList_.pop_front(); + // Early exit if we found a command + if (foundEvent) break; + } + + // Event should be in the current command batch + if (!foundEvent) { + state_.forceWait_ = true; + } + // If we don't have any more batches, then assume GPU is idle + else if (cbList_.empty()) { + dmaFlushMgmt_.resetCbWorkload(dev()); + } + } +} + +GpuEvent* +VirtualGPU::getGpuEvent(Pal::IGpuMemory* iMem) +{ + GpuEvents::iterator it = gpuEvents_.find(iMem); + if (it == gpuEvents_.end()) { +// queue(MainEngine).addMemRef(iMem); +// queue(SdmaEngine).addMemRef(iMem); + } + return &gpuEvents_[iMem]; +} + +void +VirtualGPU::assignGpuEvent(Pal::IGpuMemory* iMem, GpuEvent gpuEvent) +{ + GpuEvents::iterator it = gpuEvents_.find(iMem); + if (it != gpuEvents_.end()) { + it->second 
= gpuEvent; + } + else { +// queue(gpuEvent.engineId_).addMemRef(iMem); + gpuEvents_[iMem] = gpuEvent; + } +// queues_[gpuEvent.engineId_]->addCmdMemRef(iMem); +} + +void +VirtualGPU::releaseMemory(Pal::IGpuMemory* iMem, bool wait) +{ + //! @note if there is no wait, then it's a view release + if (wait) { + waitForEvent(&gpuEvents_[iMem]); + //queue(MainEngine).removeMemRef(iMem); + //queue(SdmaEngine).removeMemRef(iMem); + queues_[MainEngine]->removeCmdMemRef(iMem); + queues_[SdmaEngine]->removeCmdMemRef(iMem); + gpuEvents_.erase(iMem); + } +} + +void +VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + const amd::PerfCounterCommand::PerfCounterList counters = vcmd.getCounters(); + + // Create performance experiment + Pal::PerfExperimentCreateInfo createInfo = {}; + createInfo.optionValues.sqShaderMask = Pal::PerfShaderMaskCs; + + PalCounterReference* palRef = PalCounterReference::Create(*this, createInfo); + if (palRef == nullptr) { + LogError("We failed to allocate memory for the GPU perfcounter"); + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + + bool newExperiment = false; + + for (uint i = 0; i < vcmd.getNumCounters(); ++i) { + amd::PerfCounter* amdCounter = + static_cast(counters[i]); + const PerfCounter* counter = + static_cast(amdCounter->getDeviceCounter()); + + // Make sure we have a valid gpu performance counter + if (nullptr == counter) { + amd::PerfCounter::Properties prop = amdCounter->properties(); + PerfCounter* gpuCounter = new PerfCounter( + gpuDevice_, + *this, + prop[CL_PERFCOUNTER_GPU_BLOCK_INDEX], + prop[CL_PERFCOUNTER_GPU_COUNTER_INDEX], + prop[CL_PERFCOUNTER_GPU_EVENT_INDEX]); + if (nullptr == gpuCounter) { + LogError("We failed to allocate memory for the GPU perfcounter"); + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + else if (gpuCounter->create(palRef)) { + amdCounter->setDeviceCounter(gpuCounter); + 
newExperiment = true; + } + else { + LogPrintfError("We failed to allocate a perfcounter in CAL.\ + Block: %d, counter: #d, event: %d", + gpuCounter->info()->blockIndex_, + gpuCounter->info()->counterIndex_, + gpuCounter->info()->eventIndex_); + delete gpuCounter; + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + counter = gpuCounter; + } + } + + if (newExperiment) { + palRef->finalize(); + } + + palRef->release(); + + Pal::IPerfExperiment* palPerf = nullptr; + for (uint i = 0; i < vcmd.getNumCounters(); ++i) { + amd::PerfCounter* amdCounter = + static_cast(counters[i]); + const PerfCounter* counter = + static_cast(amdCounter->getDeviceCounter()); + + if (palPerf != counter->iPerf()) { + palPerf = counter->iPerf(); + // Find the state and sends the command to PAL + if (vcmd.getState() == amd::PerfCounterCommand::Begin) { + iCmd()->CmdBeginPerfExperiment(palPerf); + } + else if (vcmd.getState() == amd::PerfCounterCommand::End) { + GpuEvent event; + eventBegin(MainEngine); + iCmd()->CmdEndPerfExperiment(palPerf); + eventEnd(MainEngine, event); + setGpuEvent(event); + } + else { + LogError("Unsupported performance counter state"); + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + } + } +} + +void +VirtualGPU::submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(cmd); + + switch(cmd.type()) { + case CL_COMMAND_THREAD_TRACE_MEM: + { + amd::ThreadTrace* amdThreadTrace = &cmd.getThreadTrace(); + ThreadTrace* threadTrace = + static_cast(amdThreadTrace->getDeviceThreadTrace()); + Unimplemented(); +/* + if (threadTrace == nullptr) { + gslQueryObject gslThreadTrace; + // Create a HW thread trace query object + gslThreadTrace = cs()->createQuery(GSL_SHADER_TRACE_BYTES_WRITTEN); + if (0 == gslThreadTrace) { + LogError("Failure in memory allocation for the GPU threadtrace"); + cmd.setStatus(CL_INVALID_OPERATION); + return; + } + 
CalThreadTraceReference* palRef = new CalThreadTraceReference(*this,gslThreadTrace); + if (palRef == nullptr) { + LogError("Failure in memory allocation for the GPU threadtrace"); + cmd.setStatus(CL_INVALID_OPERATION); + return; + } + size_t seNum = amdThreadTrace->deviceSeNumThreadTrace(); + ThreadTrace* gpuThreadTrace = new ThreadTrace( + gpuDevice_, + *this, + seNum); + if (nullptr == gpuThreadTrace) { + LogError("Failure in memory allocation for the GPU threadtrace"); + cmd.setStatus(CL_INVALID_OPERATION); + return; + } + if (gpuThreadTrace->create(palRef)) { + amdThreadTrace->setDeviceThreadTrace(gpuThreadTrace); + } + else { + LogError("Failure in memory allocation for the GPU threadtrace"); + delete gpuThreadTrace; + cmd.setStatus(CL_INVALID_OPERATION); + return; + } + threadTrace = gpuThreadTrace; + palRef->release(); + } + gslShaderTraceBufferObject* threadTraceBufferObjects = threadTrace->getThreadTraceBufferObjects(); + const size_t memObjSize = cmd.getMemoryObjectSize(); + const std::vector& memObj = cmd.getMemList(); + size_t se = 0; + for (std::vector::const_iterator itMemObj = memObj.begin();itMemObj != memObj.end();++itMemObj,++se) { + // Find GSL Mem Object + Pal::IGpuMemory* gslMemObj = dev().getGpuMemory(*itMemObj)->iMem(); + + // Bind GSL MemObject to the appropriate SE Thread Trace Buffer Object + threadTraceBufferObjects[se]->attachMemObject(cs(), gslMemObj, 0, 0, memObjSize, se); + } +*/ + break; + } + default: + LogError("Unsupported command type for ThreadTraceMemObjects!"); + break; + } +} + +void +VirtualGPU::submitThreadTrace(amd::ThreadTraceCommand& cmd) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(cmd); + + switch(cmd.type()) { + case CL_COMMAND_THREAD_TRACE: + { + amd::ThreadTrace* amdThreadTrace = + static_cast(&cmd.getThreadTrace()); + ThreadTrace* threadTrace = + static_cast(amdThreadTrace->getDeviceThreadTrace()); + + // gpu thread trace object had to 
be generated prior to begin/end/pause/resume due + // to ThreadTraceMemObjectsCommand execution + if (threadTrace == nullptr) { + return; + } + else { + Unimplemented(); +/* + gslQueryObject gslThreadTrace; + gslThreadTrace = threadTrace->gslThreadTrace(); + uint32_t seNum = amdThreadTrace->deviceSeNumThreadTrace(); + + // Find the state and sends the commands to GSL + if (cmd.getState() == amd::ThreadTraceCommand::Begin) { + amd::ThreadTrace::ThreadTraceConfig* traceCfg = + static_cast(cmd.threadTraceConfig()); + const gslErrorCode ec = gslThreadTrace->BeginQuery(cs(), + GSL_SHADER_TRACE_BYTES_WRITTEN, 0); + assert(ec == GSL_NO_ERROR); + + for (uint32_t idx = 0; idx < seNum; ++idx) { + rs()->enableShaderTrace(cs(), idx, true); + rs()->setShaderTraceComputeUnit (idx, traceCfg->cu_); + rs()->setShaderTraceShaderArray (idx, traceCfg->sh_); + rs()->setShaderTraceSIMDMask (idx, traceCfg->simdMask_); + rs()->setShaderTraceVmIdMask (idx, traceCfg->vmIdMask_); + rs()->setShaderTraceTokenMask (idx, traceCfg->tokenMask_); + rs()->setShaderTraceRegisterMask(idx, traceCfg->regMask_); + rs()->setShaderTraceIssueMask (idx, traceCfg->instMask_); + rs()->setShaderTraceRandomSeed (idx, traceCfg->randomSeed_); + rs()->setShaderTraceCaptureMode (idx, traceCfg->captureMode_); + rs()->setShaderTraceWrap (idx, traceCfg->isWrapped_); + rs()->setShaderTraceUserData (idx, + (traceCfg->isUserData_) ? 
traceCfg->userData_ : 0); + } + } + else if (cmd.getState() == amd::ThreadTraceCommand::End) { + for (uint32_t idx = 0; idx < seNum; ++idx) { + rs()->enableShaderTrace(cs(), idx, false); + } + gslThreadTrace->EndQuery(cs(), 0); + } + else if (cmd.getState() == amd::ThreadTraceCommand::Pause) { + for (uint32_t idx = 0; idx < seNum; ++idx) { + rs()->setShaderTraceIsPaused(cs(), idx, true); + } + } + else if (cmd.getState() == amd::ThreadTraceCommand::Resume) { + for (uint32_t idx = 0; idx < seNum; ++idx) { + rs()->setShaderTraceIsPaused(cs(), idx, false); + } + } +*/ + } + break; + } + default: + LogError("Unsupported command type for ThreadTrace!"); + break; + } +} + +void +VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd); + + for (std::vector::const_iterator it = vcmd.getMemList().begin(); + it != vcmd.getMemList().end(); it++) { + // amd::Memory object should never be nullptr + assert(*it && "Memory object for interop is nullptr"); + pal::Memory* memory = dev().getGpuMemory(*it); + + // If resource is a shared copy of original resource, then + // runtime needs to copy data from original resource + (*it)->getInteropObj()->copyOrigToShared(); + + // Check if OpenCL has direct access to the interop memory + if (memory->interopType() == Memory::InteropDirectAccess) { + continue; + } + + // Does interop use HW emulation? 
+ if (memory->interopType() == Memory::InteropHwEmulation) { + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + amd::Coord3D region(memory->size()); + + // Synchronize the object + if (!blitMgr().copyBuffer(*memory->interop(), + *memory, origin, origin, region, Entire)) { + LogError("submitAcquireExtObjects - Interop synchronization failed!"); + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + } + } + + profilingEnd(vcmd); +} + +void +VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd); + + for (std::vector::const_iterator it = vcmd.getMemList().begin(); + it != vcmd.getMemList().end(); it++) { + // amd::Memory object should never be nullptr + assert(*it && "Memory object for interop is nullptr"); + pal::Memory* memory = dev().getGpuMemory(*it); + + // Check if we can use HW interop + if (memory->interopType() == Memory::InteropHwEmulation) { + static const bool Entire = true; + amd::Coord3D origin(0, 0, 0); + amd::Coord3D region(memory->size()); + + // Synchronize the object + if (!blitMgr().copyBuffer(*memory, *memory->interop(), + origin, origin, region, Entire)) { + LogError("submitReleaseExtObjects interop synchronization failed!"); + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } + } + else { + if (memory->interopType() != Memory::InteropDirectAccess) { + LogError("None interop release!"); + } + } + + // If resource is a shared copy of original resource, then + // runtime needs to copy data back to original resource + (*it)->getInteropObj()->copySharedToOrig(); + } + + profilingEnd(vcmd); +} + +void +VirtualGPU::submitSignal(amd::SignalCommand & vcmd) +{ + amd::ScopedLock lock(execution()); + profilingBegin(vcmd); + pal::Memory* gpuMemory = dev().getGpuMemory(&vcmd.memory()); + Unimplemented(); +/* + if (vcmd.type() == CL_COMMAND_WAIT_SIGNAL_AMD) { + uint64_t surfAddr = 
gpuMemory->iMem()->getPhysicalAddress(cs()); + uint64_t markerAddr = gpuMemory->iMem()->getMarkerAddress(cs()); + uint64_t markerOffset = markerAddr - surfAddr; + cs()->p2pMarkerOp(gpuMemory->iMem(), vcmd.markerValue(), + markerOffset, false); + } + else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) { + GpuEvent gpuEvent; + eventBegin(MainEngine); + cs()->p2pMarkerOp(gpuMemory->iMem(), vcmd.markerValue(), vcmd.markerOffset(), true); + //! @todo We don't need flush if an event is tracked. + cs()->Flush(); + eventEnd(MainEngine, gpuEvent); + gpuMemory->setBusy(*this, gpuEvent); + // Update the global GPU event + setGpuEvent(gpuEvent); + } +*/ + profilingEnd(vcmd); +} + +void +VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand & vcmd) +{ + amd::ScopedLock lock(execution()); + profilingBegin(vcmd); + std::vector memObjects = vcmd.memObjects(); + cl_uint numObjects = memObjects.size(); + Pal::IGpuMemory** pGpuMemObjects = new Pal::IGpuMemory*[numObjects]; + + for(cl_uint i = 0; i < numObjects; ++i) + { + pal::Memory* gpuMemory = dev().getGpuMemory(memObjects[i]); + pGpuMemObjects[i] = gpuMemory->iMem(); + gpuMemory->syncCacheFromHost(*this); + } + + uint64_t* surfBusAddr = new uint64_t[numObjects]; + uint64_t* markerBusAddr = new uint64_t[numObjects]; + Unimplemented(); +/* + gslErrorCode res = cs()->makeBuffersResident(numObjects, pGpuMemObjects, + surfBusAddr, markerBusAddr); + if(res != GSL_NO_ERROR) { + LogError("MakeBuffersResident failed"); + vcmd.setStatus(CL_INVALID_OPERATION); + } + else { + cl_bus_address_amd* busAddr = vcmd.busAddress(); + for(cl_uint i = 0; i < numObjects; ++i) + { + busAddr[i].surface_bus_address = surfBusAddr[i]; + busAddr[i].marker_bus_address = markerBusAddr[i]; + } + } +*/ + delete[] pGpuMemObjects; + delete[] surfBusAddr; + delete[] markerBusAddr; + profilingEnd(vcmd); +} + + +bool +VirtualGPU::awaitCompletion(CommandBatch* cb, const amd::Event* waitingEvent) +{ + bool found = false; + amd::Command* current; + 
amd::Command* head = cb->head_; + + // Make sure that profiling is enabled + if (state_.profileEnabled_) { + return profilingCollectResults(cb, waitingEvent); + } + // Mark the first command in the batch as running + if (head != nullptr) { + head->setStatus(CL_RUNNING); + } + else { + return found; + } + + // Wait for the last known GPU event + waitEventLock(cb); + + while (nullptr != head) { + current = head->getNext(); + if (head->status() == CL_SUBMITTED) { + head->setStatus(CL_RUNNING); + head->setStatus(CL_COMPLETE); + } + else if (head->status() == CL_RUNNING) { + head->setStatus(CL_COMPLETE); + } + else if ((head->status() != CL_COMPLETE) && (current != nullptr)) { + LogPrintfError("Unexpected command status - %d!", head->status()); + } + + // Check if it's a waiting command + if (head == waitingEvent) { + found = true; + } + + head->release(); + head = current; + } + + return found; +} + +void +VirtualGPU::flush(amd::Command* list, bool wait) +{ + CommandBatch* cb = nullptr; + bool gpuCommand = false; + + for (uint i = 0; i < AllEngines; ++i) { + if (cal_.events_[i].isValid()) { + gpuCommand = true; + } + } + + // If the batch doesn't have any GPU command and the list is empty + if (!gpuCommand && cbList_.empty()) { + state_.forceWait_ = true; + } + + // Insert the current batch into a list + if (nullptr != list) { + cb = new CommandBatch(list, cal()->events_, cal()->lastTS_); + } + + { + //! @todo: Check if really need a lock + amd::ScopedLock lock(execution()); + for (uint i = 0; i < AllEngines; ++i) { + flushDMA(i); + // Reset event so we won't try to wait again, + // if runtime didn't submit any commands + //! @note: it's safe to invalidate events, since + //! 
we already saved them with the batch creation step above + cal_.events_[i].invalidate(); + } + } + + // Mark last TS as nullptr, so runtime won't process empty batches with the old TS + cal_.lastTS_ = nullptr; + if (nullptr != cb) { + cbList_.push_back(cb); + } + + wait |= state_.forceWait_; + // Loop through all outstanding command batches + while (!cbList_.empty()) { + CommandBatchList::const_iterator it = cbList_.begin(); + // Check if command batch finished without a wait + bool finished = true; + for (uint i = 0; i < AllEngines; ++i) { + finished &= isDone(&(*it)->events_[i]); + } + if (finished || wait) { + // Wait for completion + awaitCompletion(*it); + // Release a command batch + delete *it; + // Remove command batch from the list + cbList_.pop_front(); + } + else { + // Early exit if no finished + break; + } + } + state_.forceWait_ = false; +} + +void +VirtualGPU::enableSyncedBlit() const +{ + return blitMgr_->enableSynchronization(); +} + +void +VirtualGPU::releaseMemObjects(bool scratch) +{ + for (GpuEvents::const_iterator it = gpuEvents_.begin(); + it != gpuEvents_.end(); ++it) { + GpuEvent event = it->second; + waitForEvent(&event); + queues_[MainEngine]->removeCmdMemRef(const_cast(it->first)); + queues_[SdmaEngine]->removeCmdMemRef(const_cast(it->first)); + } + + gpuEvents_.clear(); +} + +void +VirtualGPU::setGpuEvent( + GpuEvent gpuEvent, + bool flush) +{ + cal_.events_[engineID_] = gpuEvent; + + // Flush current DMA buffer if requested + if (flush) { + flushDMA(engineID_); + } +} + +void +VirtualGPU::flushDMA(uint engineID) +{ + if (engineID == MainEngine) { + // Clear memory dependency state, since runtime flushes compute + // memoryDependency().clear(); + //!@todo Keep memory dependency alive even if we flush DMA, + //! since only L2 cache is flushed in KMD frame, + //! but L1 still has to be invalidated. 
+ } + + isDone(&cal_.events_[engineID]); +} + +bool +VirtualGPU::waitAllEngines(CommandBatch* cb) +{ + uint i; + GpuEvent* events; //!< GPU events for the batch + + // If command batch is nullptr then wait for the current + if (nullptr == cb) { + events = cal_.events_; + } + else { + events = cb->events_; + } + + bool earlyDone = true; + // The first loop is to flush all engines and/or check if + // engines are idle already + for (i = 0; i < AllEngines; ++i) { + earlyDone &= isDone(&events[i]); + } + + // Release all transfer buffers on this command queue + releaseXferWrite(); + + // Rlease all pinned memory + releasePinnedMem(); + + // The second loop is to wait all engines + for (i = 0; i < AllEngines; ++i) { + waitForEvent(&events[i]); + } + + return earlyDone; +} + +void +VirtualGPU::waitEventLock(CommandBatch* cb) +{ + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + bool earlyDone = waitAllEngines(cb); + + // Free resource cache if we have too many entries + //! \note we do it here, when all engines are idle, + // because Vista/Win7 idles GPU on a resource destruction + static const size_t MinCacheEntries = 4096; + dev().resourceCache().free(MinCacheEntries); + + // Find the timestamp object of the last command in the batch + if (cb->lastTS_ != nullptr) { + // If earlyDone is TRUE, then CPU didn't wait for GPU. + // Thus the sync point between CPU and GPU is unclear and runtime + // will use an older adjustment value to maintain the same timeline + if (!earlyDone || + //! \note Workaround for APU(s). + //! GPU-CPU timelines may go off too much, thus always + //! 
force calibration with the last batch in the list + (cbList_.size() <= 1) || + (readjustTimeGPU_ == 0)) { + uint64_t startTimeStampGPU = 0; + uint64_t endTimeStampGPU = 0; + + // Get the timestamp value of the last command in the batch + cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU); + + uint64_t endTimeStampCPU = amd::Os::timeNanos(); + // Make sure the command batch has a valid GPU TS + if (!GPU_RAW_TIMESTAMP) { + // Adjust the base time by the execution time + readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; + } + } + } +} + +bool +VirtualGPU::allocConstantBuffers() +{ + // Allocate/reallocate constant buffers + size_t minCbSize; + // GCN doesn't really have a limit + minCbSize = 256 * Ki; + uint i; + + // Create/reallocate constant buffer resources + for (i = 0; i < MaxConstBuffersArguments; ++i) { + ConstBuffer* constBuf = new ConstBuffer(*this, ((minCbSize + + ConstBuffer::VectorSize - 1) / ConstBuffer::VectorSize)); + + if ((constBuf != nullptr) && constBuf->create()) { + addConstBuffer(constBuf); + } + else { + // We failed to create a constant buffer + delete constBuf; + return false; + } + } + + return true; +} + +void +VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) +{ + // Is profiling enabled? 
+ if (command.profilingInfo().enabled_) { + // Allocate a timestamp object from the cache + TimeStamp* ts = tsCache_->allocTimeStamp(); + if (nullptr == ts) { + return; + } + // Save the TimeStamp object in the current OCL event + command.setData(ts); + currTs_ = ts; + state_.profileEnabled_ = true; + } +} + +void +VirtualGPU::profilingEnd(amd::Command& command) +{ + // Get the TimeStamp object associated witht the current command + TimeStamp* ts = reinterpret_cast(command.data()); + if (ts != nullptr) { + // Check if the command actually did any GPU submission + if (ts->isValid()) { + cal_.lastTS_ = ts; + } + else { + // Destroy the TimeStamp object + tsCache_->freeTimeStamp(ts); + command.setData(nullptr); + } + } +} + +bool +VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* waitingEvent) +{ + bool found = false; + amd::Command* current; + amd::Command* first = cb->head_; + + // If the command list is, empty then exit + if (nullptr == first) { + return found; + } + + // Wait for the last known GPU events on all engines + waitEventLock(cb); + + // Find the CPU base time of the entire command batch execution + uint64_t endTimeStamp = amd::Os::timeNanos(); + uint64_t startTimeStamp = endTimeStamp; + + // First step, walk the command list to find the first valid command + //! \note The batch may have empty markers at the beginning. + //! So the start/end of the empty commands is equal to + //! the start of the first valid command in the batch. 
+ first = cb->head_; + while (nullptr != first) { + // Get the TimeStamp object associated witht the current command + TimeStamp* ts = reinterpret_cast(first->data()); + + if (ts != nullptr) { + ts->value(&startTimeStamp, &endTimeStamp); + endTimeStamp -= readjustTimeGPU_; + startTimeStamp -= readjustTimeGPU_; + // Assign to endTimeStamp the start of the first valid command + endTimeStamp = startTimeStamp; + break; + } + first = first->getNext(); + } + + // Second step, walk the command list to construct the time line + first = cb->head_; + while (nullptr != first) { + // Get the TimeStamp object associated witht the current command + TimeStamp* ts = reinterpret_cast(first->data()); + + current = first->getNext(); + + if (ts != nullptr) { + ts->value(&startTimeStamp, &endTimeStamp); + endTimeStamp -= readjustTimeGPU_; + startTimeStamp -= readjustTimeGPU_; + // Destroy the TimeStamp object + tsCache_->freeTimeStamp(ts); + first->setData(nullptr); + } + else { + // For empty commands start/end is equal to + // the end of the last valid command + startTimeStamp = endTimeStamp; + } + + // Update the command status with the proper timestamps + if (first->status() == CL_SUBMITTED) { + first->setStatus(CL_RUNNING, startTimeStamp); + first->setStatus(CL_COMPLETE, endTimeStamp); + } + else if (first->status() == CL_RUNNING) { + first->setStatus(CL_COMPLETE, endTimeStamp); + } + else if ((first->status() != CL_COMPLETE) && (current != nullptr)) { + LogPrintfError("Unexpected command status - %d!", first->status()); + } + + // Do we wait this event? + if (first == waitingEvent) { + found = true; + } + + first->release(); + first = current; + } + + return found; +} + +bool +VirtualGPU::addVmMemory(const Memory* memory) +{ + queues_[MainEngine]->addCmdMemRef(memory->iMem()); + return true; +} + +void +VirtualGPU::profileEvent(EngineType engine, bool type) const +{ + if (nullptr == currTs_) { + return; + } + if (type) { + currTs_->begin((engine == SdmaEngine) ? 
true : false); + } + else { + currTs_->end((engine == SdmaEngine) ? true : false); + } +} + +bool +VirtualGPU::processMemObjectsHSA( + const amd::Kernel& kernel, + const_address params, + bool nativeMem, + std::vector* memList) +{ + static const bool NoAlias = true; + const HSAILKernel& hsaKernel = static_cast + (*(kernel.getDeviceKernel(dev(), NoAlias))); + const amd::KernelSignature& signature = kernel.signature(); + const amd::KernelParameters& kernelParams = kernel.parameters(); + + // Mark the tracker with a new kernel, + // so we can avoid checks of the aliased objects + memoryDependency().newKernel(); + + bool deviceSupportFGS = 0 != (dev().info().svmCapabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM); + bool supportFineGrainedSystem = deviceSupportFGS; + FGSStatus status = kernelParams.getSvmSystemPointersSupport(); + switch (status) { + case FGS_YES: + if (!deviceSupportFGS) { + return false; + } + supportFineGrainedSystem = true; + break; + case FGS_NO: + supportFineGrainedSystem = false; + break; + case FGS_DEFAULT: + default: + break; + } + + size_t count = kernelParams.getNumberOfSvmPtr(); + size_t execInfoOffset = kernelParams.getExecInfoOffset(); + bool sync = true; + + amd::Memory* memory = nullptr; + //get svm non arugment information + void* const* svmPtrArray = + reinterpret_cast(params + execInfoOffset); + for (size_t i = 0; i < count; i++) { + memory = amd::SvmManager::FindSvmBuffer(svmPtrArray[i]); + if (nullptr == memory) { + if (!supportFineGrainedSystem) { + return false; + } + else if (sync) { + Unimplemented(); + //flushCUCaches(); + // Clear memory dependency state + const static bool All = true; + memoryDependency().clear(!All); + } + } + else { + Memory* gpuMemory = dev().getGpuMemory(memory); + if (nullptr != gpuMemory) { + // Synchronize data with other memory instances if necessary + gpuMemory->syncCacheFromHost(*this); + + const static bool IsReadOnly = false; + // Validate SVM passed in the non argument list + 
memoryDependency().validate(*this, gpuMemory, IsReadOnly); + + memList->push_back(gpuMemory); + } + else { + return false; + } + } + } + + // Check all parameters for the current kernel + for (size_t i = 0; i < signature.numParameters(); ++i) { + const amd::KernelParameterDescriptor& desc = signature.at(i); + const HSAILKernel::Argument* arg = hsaKernel.argument(i); + Memory* memory = nullptr; + bool readOnly = false; + amd::Memory* svmMem = nullptr; + + // Find if current argument is a buffer + if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) { + if (kernelParams.boundToSvmPointer(dev(), params, i)) { + svmMem = amd::SvmManager::FindSvmBuffer( + *reinterpret_cast(params + desc.offset_)); + if (!svmMem) { + Unimplemented(); + //flushCUCaches(); + // Clear memory dependency state + const static bool All = true; + memoryDependency().clear(!All); + } + } + + if (nativeMem) { + memory = *reinterpret_cast(params + desc.offset_); + } + else if (*reinterpret_cast + (params + desc.offset_) != nullptr) { + if (nullptr == svmMem) { + memory = dev().getGpuMemory(*reinterpret_cast + (params + desc.offset_)); + } + else { + memory = dev().getGpuMemory(svmMem); + } + // Synchronize data with other memory instances if necessary + memory->syncCacheFromHost(*this); + } + + if (memory != nullptr) { + // Check image + readOnly = (desc.accessQualifier_ == + CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false; + // Check buffer + readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? 
true : false; + // Validate memory for a dependency in the queue + memoryDependency().validate(*this, memory, readOnly); + } + } + } + + for (pal::Memory* mem : hsaKernel.prog().globalStores()) { + const static bool IsReadOnly = false; + // Validate global store for a dependency in the queue + memoryDependency().validate(*this, mem, IsReadOnly); + } + + return true; +} + +amd::Memory* +VirtualGPU::createBufferFromImage(amd::Memory& amdImage) const +{ + amd::Memory* mem = new(amdImage.getContext()) + amd::Buffer(amdImage, 0, 0, amdImage.getSize()); + + if ((mem != nullptr) && !mem->create()) { + mem->release(); + } + + return mem; +} + +void +VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable) +{ + const static bool Wait = true; + vqHeader_->kernel_table = kernelTable; + virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait); +} + +void +VirtualGPU::flushCuCaches(HwDbgGpuCacheMask cache_mask) +{ + Unimplemented(); +/* + //! @todo: fix issue of no event available for the flush/invalidate cache command + InvalidateSqCaches(cache_mask.sqICache_, + cache_mask.sqKCache_, + cache_mask.tcL1_, + cache_mask.tcL2_); +*/ + flushDMA(engineID_); + + return; +} + +void +VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, + hsa_kernel_dispatch_packet_t* aqlPkt, + HwDbgKernelInfo& kernelInfo, + amd::Event* enqueueEvent) +{ + amd::HwDebugManager * dbgManager = dev().hwDebugMgr(); + assert (dbgManager && "No HW Debug Manager!"); + + // Initialize structure with default values + + if (hsaKernel.prog().maxScratchRegs() > 0) { + pal::Memory* scratchBuf = dev().scratch(hwRing())->memObj_; + kernelInfo.scratchBufAddr = scratchBuf->vmAddress(); + kernelInfo.scratchBufferSizeInBytes = scratchBuf->size(); + + // Get the address of the scratch buffer and its size for CPU access + address scratchRingAddr = nullptr; + scratchRingAddr = static_cast
(scratchBuf->map(nullptr, 0)); + dbgManager->setScratchRing(scratchRingAddr,scratchBuf->size()); + scratchBuf->unmap(nullptr); + } + else { + kernelInfo.scratchBufAddr = 0; + kernelInfo.scratchBufferSizeInBytes = 0; + dbgManager->setScratchRing(nullptr, 0); + } + + //! @todo: need to verify what is wanted for the global memory + Unimplemented(); + kernelInfo.heapBufAddr = 0; + + kernelInfo.pAqlDispatchPacket = aqlPkt; + kernelInfo.pAqlQueuePtr = reinterpret_cast(hsaQueueMem_->vmAddress()); + + // Get the address of the kernel code and its size for CPU access + pal::Memory* aqlCode = hsaKernel.gpuAqlCode(); + if (nullptr != aqlCode) { + address aqlCodeAddr = static_cast
(aqlCode->map(nullptr, 0)); + dbgManager->setKernelCodeInfo(aqlCodeAddr, hsaKernel.aqlCodeSize()); + aqlCode->unmap(nullptr); + } + else { + dbgManager->setKernelCodeInfo(nullptr, 0); + } + + kernelInfo.trapPresent = false; + kernelInfo.trapHandler = nullptr; + kernelInfo.trapHandlerBuffer = nullptr; + + kernelInfo.excpEn = 0; + kernelInfo.cacheDisableMask = 0; + kernelInfo.sqDebugMode = 0; + + kernelInfo.mgmtSe0Mask = 0xFFFFFFFF; + kernelInfo.mgmtSe1Mask = 0xFFFFFFFF; + + // set kernel info for HW debug and call the callback function + if (nullptr != dbgManager->preDispatchCallBackFunc()) { + DebugToolInfo dbgSetting = {0}; + dbgSetting.scratchAddress_ = kernelInfo.scratchBufAddr; + dbgSetting.scratchSize_ = kernelInfo.scratchBufferSizeInBytes; + dbgSetting.globalAddress_ = kernelInfo.heapBufAddr; + dbgSetting.aclBinary_ = hsaKernel.prog().binaryElf(); + dbgSetting.event_ = enqueueEvent; + + // Call the predispatch callback function & set the trap info + AqlCodeInfo aqlCodeInfo; + aqlCodeInfo.aqlCode_ = (amd_kernel_code_t *) hsaKernel.cpuAqlCode(); + aqlCodeInfo.aqlCodeSize_ = hsaKernel.aqlCodeSize(); + + // Execute the pre-dispatch call back function + dbgManager->executePreDispatchCallBack(reinterpret_cast(aqlPkt), &dbgSetting); + + // assign the debug TMA and TBA for kernel dispatch + if (nullptr != dbgSetting.trapHandler_ && nullptr != dbgSetting.trapBuffer_) { + assignDebugTrapHandler(dbgSetting, kernelInfo); + } + + kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false; + + // Execption policy + kernelInfo.excpEn = dbgSetting.exceptionMask_; + kernelInfo.cacheDisableMask = dbgSetting.cacheDisableMask_; + kernelInfo.sqDebugMode = dbgSetting.gpuSingleStepMode_; + + // Compute the mask for reserved CUs. These two dwords correspond to + // two registers used for reserving CUs for display. In the current + // implementation, the number of CUs reserved can be 0 to 7, and it + // is set by debugger users. 
+ if (dbgSetting.monitorMode_) { + uint32_t i = dbgSetting.reservedCuNum_ / 2; + kernelInfo.mgmtSe0Mask <<= i; + i = dbgSetting.reservedCuNum_ - i; + kernelInfo.mgmtSe1Mask <<= i; + } + Unimplemented(); +/* + // flush/invalidate the instruction, data, L1 and L2 caches + InvalidateSqCaches(); +*/ + } +} + +void +VirtualGPU::assignDebugTrapHandler(const DebugToolInfo& dbgSetting, + HwDbgKernelInfo& kernelInfo) +{ + // setup the runtime trap handler code and trap buffer to be assigned before kernel dispatching + // + Memory* rtTrapHandlerMem = static_cast(dev().hwDebugMgr()->runtimeTBA()); + Memory* rtTrapBufferMem = static_cast(dev().hwDebugMgr()->runtimeTMA()); + + kernelInfo.trapHandler = reinterpret_cast(rtTrapHandlerMem->vmAddress() + TbaStartOffset); + // With the TMA corruption hw bug workaround, the trap handler buffer can be set to zero. + // However, by setting the runtime trap buffer (TMA) correct, the runtime trap hander + // without the workaround can still function correctly. + kernelInfo.trapHandlerBuffer = reinterpret_cast(rtTrapBufferMem->vmAddress()); + + address rtTrapBufferAddress = static_cast
(rtTrapBufferMem->map(this)); + + Memory* trapHandlerMem = dev().getGpuMemory(dbgSetting.trapHandler_); + Memory* trapBufferMem = dev().getGpuMemory(dbgSetting.trapBuffer_); + + // Address of the trap handler code/buffer should be 256-byte aligned + uint64_t tbaAddress = trapHandlerMem->vmAddress(); + uint64_t tmaAddress = trapBufferMem->vmAddress(); + if ((tbaAddress & 0xFF) != 0 || (tmaAddress & 0xFF) != 0) { + assert(false && "Trap handler/buffer is not 256-byte aligned"); + } + + // The addresses of the debug trap handler code (TBA) and buffer (TMA) are + // stored in the runtime trap handler buffer with offset location of 0x18-19 + // and 0x20-21, respectively. + uint64_t * rtTmaPtr = reinterpret_cast(rtTrapBufferAddress + 0x18); + rtTmaPtr[0] = tbaAddress; + rtTmaPtr[1] = tmaAddress; + + rtTrapBufferMem->unmap(nullptr); + + // Add GPU mem handles to the memory list for VidMM + addVmMemory(trapHandlerMem); + addVmMemory(trapBufferMem); + addVmMemory(rtTrapHandlerMem); + addVmMemory(rtTrapBufferMem); + +} + +bool +VirtualGPU::validateSdmaOverlap(const Resource& src, const Resource& dst) +{ + uint64_t srcVmEnd = src.vmAddress() + src.vmSize(); + if (((src.vmAddress() >= sdmaRange_.start_) && + (src.vmAddress() <= sdmaRange_.end_)) || + ((srcVmEnd >= sdmaRange_.start_) && + (srcVmEnd <= sdmaRange_.end_)) || + ((src.vmAddress() <= sdmaRange_.start_) && + (srcVmEnd >= sdmaRange_.end_))) { + sdmaRange_.start_ = dst.vmAddress(); + sdmaRange_.end_ = dst.vmAddress() + dst.vmSize(); + return true; + } + + sdmaRange_.start_ = std::min(sdmaRange_.start_, dst.vmAddress()); + sdmaRange_.end_ = std::max(sdmaRange_.end_, dst.vmAddress() + dst.vmSize()); + return false; +} + +} // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp new file mode 100644 index 0000000000..1f7ca1307b --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp @@ -0,0 +1,576 @@ +// +// Copyright (c) 
2015 Advanced Micro Devices, Inc. All rights reserved. +// + +#ifndef PALVIRTUAL_HPP_ +#define PALVIRTUAL_HPP_ + +#include "device/pal/paldefs.hpp" +#include "device/pal/palconstbuf.hpp" +#include "device/pal/palprintf.hpp" +#include "device/pal/paltimestamp.hpp" +#include "device/pal/palsched.hpp" +#include "device/pal/paldebugger.hpp" +#include "device/blit.hpp" +#include "palCmdBuffer.h" +#include "palCmdAllocator.h" +#include "palQueue.h" + +/*! \addtogroup PAL PAL Resource Implementation + * @{ + */ + +//! PAL Device Implementation +namespace pal { + +class Device; +class Kernel; +class Memory; +class CalCounterReference; +class VirtualGPU; +class Program; +class BlitManager; +class ThreadTrace; +class HSAILKernel; + +//! Virtual GPU +class VirtualGPU : public device::VirtualDevice +{ +public: + class Queue : public amd::HeapObject + { + public: + static const uint MaxCmdBuffers = 8; + static const uint MaxCommands = 512; + static const uint StartCmdBufIdx = 1; + static const uint FirstMemoryReference = 0x80000000; + static Queue* Create( + Pal::IDevice* palDev, //!< PAL device object + Pal::QueueType queueType, //!< PAL queue type + uint engineIdx, //!< Select particular engine index + Pal::ICmdAllocator* cmdAlloc//!< PAL CMD buffer allocator + ); + + Queue(Pal::IDevice* palDev) + : iDev_(palDev), iQueue_(NULL), + cmdBufIdSlot_(StartCmdBufIdx), cmdBufIdCurrent_(StartCmdBufIdx), + cmbBufIdRetired_(0), cmdCnt_(0) + { + for (uint i = 0; i < MaxCmdBuffers; ++i) { + iCmdBuffs_[i] = NULL; + iCmdFences_[i] = NULL; + } + } + + ~Queue(); + + void addCmdMemRef(Pal::IGpuMemory* iMem); + void removeCmdMemRef(Pal::IGpuMemory* iMem); + + void addMemRef(Pal::IGpuMemory* iMem) const + { + iDev_->AddGpuMemoryReferences(1, &iMem, NULL); + } + void removeMemRef(Pal::IGpuMemory* iMem) const + { + iDev_->RemoveGpuMemoryReferences(1, &iMem, NULL); + } + + //! Flushes the current command buffer to HW + //! 
Returns ID associated with the submission + uint submit(); + + bool flush(); + + bool waitForEvent(uint id); + + bool isDone(uint id); + + Pal::ICmdBuffer* iCmd() const { return iCmdBuffs_[cmdBufIdSlot_]; } + + Pal::IQueue* iQueue_; //!< PAL queue object + Pal::ICmdBuffer* iCmdBuffs_[MaxCmdBuffers]; //!< PAL command buffers + Pal::IFence* iCmdFences_[MaxCmdBuffers]; //!< PAL fences, associated with CMD + + private: + Pal::IDevice* iDev_; //!< PAL device + uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions + uint cmdBufIdCurrent_; //!< Current global command buffer ID + uint cmbBufIdRetired_; //!< The last retired command buffer ID + uint cmdCnt_; //!< Counter of commands + std::map memReferences_; + }; + + struct CommandBatch : public amd::HeapObject + { + amd::Command* head_; //!< Command batch head + GpuEvent events_[AllEngines]; //!< Last known GPU events + TimeStamp* lastTS_; //!< TS associated with command batch + + //! Constructor + CommandBatch( + amd::Command* head, //!< Command batch head + const GpuEvent* events, //!< HW events on all engines + TimeStamp* lastTS //!< Last TS in command batch + ): head_(head), lastTS_(lastTS) + { + memcpy(&events_, events, AllEngines * sizeof(GpuEvent)); + } + }; + + //! The virtual GPU states + union State + { + struct + { + uint boundGlobal_ : 1; //!< Global buffer was bound + uint profiling_ : 1; //!< Profiling is enabled + uint forceWait_ : 1; //!< Forces wait in flush() + uint boundCb_ : 1; //!< Constant buffer was bound + uint boundPrintf_ : 1; //!< Printf buffer was bound + uint profileEnabled_: 1; //!< Profiling is enabled for WaveLimiter + }; + uint value_; + State(): value_(0) {} + }; + + //! 
CAL descriptor for the GPU virtual device + struct CalVirtualDesc : public amd::EmbeddedObject + { + GpuEvent events_[AllEngines]; //!< Last known GPU events + uint iterations_; //!< Number of iterations for the execution + TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU + }; + + typedef std::vector constbufs_t; + + class MemoryDependency : public amd::EmbeddedObject + { + public: + //! Default constructor + MemoryDependency() + : memObjectsInQueue_(NULL) + , numMemObjectsInQueue_(0) + , maxMemObjectsInQueue_(0) {} + + ~MemoryDependency() { delete [] memObjectsInQueue_; } + + //! Creates memory dependecy structure + bool create(size_t numMemObj); + + //! Notify the tracker about new kernel + void newKernel() { endMemObjectsInQueue_ = numMemObjectsInQueue_; } + + //! Validates memory object on dependency + void validate(VirtualGPU& gpu, const Memory* memory, bool readOnly); + + //! Clear memory dependency + void clear(bool all = true); + + private: + struct MemoryState { + uint64_t start_; //! Busy memory start address + uint64_t end_; //! Busy memory end address + bool readOnly_; //! 
Current GPU state in the queue + }; + + MemoryState* memObjectsInQueue_; //!< Memory object state in the queue + size_t endMemObjectsInQueue_; //!< End of mem objects in the queue + size_t numMemObjectsInQueue_; //!< Number of mem objects in the queue + size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue + }; + + + class DmaFlushMgmt : public amd::EmbeddedObject + { + public: + DmaFlushMgmt(const Device& dev); + + // Resets DMA command buffer workload + void resetCbWorkload(const Device& dev); + + // Finds split size for the current dispatch + void findSplitSize( + const Device& dev, //!< GPU device object + uint64_t threads, //!< Total number of execution threads + uint instructions //!< Number of ALU instructions + ); + + // Returns TRUE if DMA command buffer is ready for a flush + bool isCbReady( + VirtualGPU& gpu, //!< Virtual GPU object + uint64_t threads, //!< Total number of execution threads + uint instructions //!< Number of ALU instructions + ); + + // Returns dispatch split size + uint dispatchSplitSize() const { return dispatchSplitSize_; } + + private: + uint64_t maxDispatchWorkload_; //!< Maximum number of operations for a single dispatch + uint64_t maxCbWorkload_; //!< Maximum number of operations for DMA command buffer + uint64_t cbWorkload_; //!< Current number of operations in DMA command buffer + uint aluCnt_; //!< All ALUs on the chip + uint dispatchSplitSize_; //!< Dispath split size in elements + }; + +public: + VirtualGPU(Device& device); + //! 
Creates virtual gpu object + bool create( + bool profiling, //!< Enables profilng on the queue + uint deviceQueueSize = 0 //!< Device queue size, 0 if host queue + ); + ~VirtualGPU(); + + void submitReadMemory(amd::ReadMemoryCommand& vcmd); + void submitWriteMemory(amd::WriteMemoryCommand& vcmd); + void submitCopyMemory(amd::CopyMemoryCommand& vcmd); + void submitMapMemory(amd::MapMemoryCommand& vcmd); + void submitUnmapMemory(amd::UnmapMemoryCommand& vcmd); + void submitKernel(amd::NDRangeKernelCommand& vcmd); + bool submitKernelInternal( + const amd::NDRangeContainer& sizes, //!< Workload sizes + const amd::Kernel& kernel, //!< Kernel for execution + const_address parameters, //!< Parameters for the kernel + bool nativeMem = true, //!< Native memory objects + amd::Event* enqueueEvent = NULL //!< Event provided in the enqueue kernel command + ); + void submitNativeFn(amd::NativeFnCommand& vcmd); + void submitFillMemory(amd::FillMemoryCommand& vcmd); + void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd); + void submitMarker(amd::Marker& vcmd); + void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd); + void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd); + void submitPerfCounter(amd::PerfCounterCommand& vcmd); + void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd); + void submitThreadTrace(amd::ThreadTraceCommand& vcmd); + void submitSignal(amd::SignalCommand & vcmd); + void submitMakeBuffersResident(amd::MakeBuffersResidentCommand & vcmd); + virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd); + virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd); + virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd); + virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd); + virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd); + + void releaseMemory(Pal::IGpuMemory* iMem, bool wait = true); + + void flush(amd::Command* list = NULL, bool wait = false); + bool 
terminate() { return true; } + + //! Returns GPU device object associated with this kernel + const Device& dev() const { return gpuDevice_; } + + //! Returns CAL descriptor of the virtual device + const CalVirtualDesc* cal() const { return &cal_; } + + //! Returns a GPU event, associated with GPU memory + GpuEvent* getGpuEvent( + Pal::IGpuMemory* iMem //!< PAL mem object + ); + + //! Assigns a GPU event, associated with GPU memory + void assignGpuEvent( + Pal::IGpuMemory* iMem, //!< PAL mem object + GpuEvent gpuEvent + ); + + //! Set the last known GPU event + void setGpuEvent( + GpuEvent gpuEvent, //!< GPU event for tracking + bool flush = false //!< TRUE if flush is required + ); + + //! Flush DMA buffer on the specified engine + void flushDMA( + uint engineID //!< Engine ID for DMA flush + ); + + //! Wait for all engines on this Virtual GPU + //! Returns TRUE if CPU didn't wait for GPU + bool waitAllEngines( + CommandBatch* cb = NULL //!< Command batch + ); + + //! Waits for the latest GPU event with a lock to prevent multiple entries + void waitEventLock( + CommandBatch* cb //!< Command batch + ); + + //! Returns a resource associated with the constant buffer + const ConstBuffer* cb(uint idx) const { return constBufs_[idx]; } + + //! Adds CAL objects into the constant buffer vector + void addConstBuffer(ConstBuffer* cb) { constBufs_.push_back(cb); } + + constbufs_t constBufs_; //!< constant buffers + + //! Start the command profiling + void profilingBegin( + amd::Command& command, //!< Command queue object + bool drmProfiling = false //!< Measure DRM time + ); + + //! End the command profiling + void profilingEnd(amd::Command& command); + + //! Collect the profiling results + bool profilingCollectResults( + CommandBatch* cb, //!< Command batch + const amd::Event* waitingEvent //!< Waiting event + ); + + //! Adds a memory handle into the GSL memory array for Virtual Heap + bool addVmMemory( + const Memory* memory //!< GPU memory object + ); + + //! 
Adds a stage write buffer into a list + void addXferWrite(Memory& memory); + + //! Adds a pinned memory object into a map + void addPinnedMem(amd::Memory* mem); + + //! Release pinned memory objects + void releasePinnedMem(); + + //! Finds if pinned memory is cached + amd::Memory* findPinnedMem(void* addr, size_t size); + + //! Returns the monitor object for execution access by VirtualGPU + amd::Monitor& execution() { return execution_; } + + //! Returns the virtual gpu unique index + uint index() const { return index_; } + + //! Get the PrintfDbg object + PrintfDbg& printfDbg() const { return *printfDbg_; } + + //! Get the PrintfDbgHSA object + PrintfDbgHSA& printfDbgHSA() const { return *printfDbgHSA_; } + + //! Enables synchronized transfers + void enableSyncedBlit() const; + + //! Checks if profiling is enabled + bool profiling() const { return state_.profiling_; } + + //! Returns memory dependency class + MemoryDependency& memoryDependency() { return memoryDependency_; } + + //! Returns hsaQueueMem_ + const Memory* hsaQueueMem() const { return hsaQueueMem_;} + + //! Returns DMA flush management structure + const DmaFlushMgmt& dmaFlushMgmt() const { return dmaFlushMgmt_; } + + //! Releases GSL memory objects allocated on this queue + void releaseMemObjects(bool scratch = true); + + //! Returns the HW ring used on this virtual device + uint hwRing() const { return hwRing_; } + + //! Returns current timestamp object for profiling + TimeStamp* currTs() const { return cal_.lastTS_; } + + //! Returns virtual queue object for device enqueuing + Memory* vQueue() const { return virtualQueue_; } + + //! Update virtual queue header + void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable); + + //! 
Returns TRUE if virtual queue was successfully allocatted + bool createVirtualQueue( + uint deviceQueueSize //!< Device queue size + ); + + EngineType engineID_; //!< Engine ID for this VirtualGPU + State state_; //!< virtual GPU current state + CalVirtualDesc cal_; //!< CAL virtual device descriptor + + void flushCuCaches(HwDbgGpuCacheMask cache_mask); //!< flush/invalidate SQ cache + + //! Returns PAL command buffer interface + Pal::ICmdBuffer* iCmd() const { + Queue* queue = queues_[engineID_]; + return queue->iCmd(); + } + + //! Returns queue, associated with VirtualGPU + Queue& queue(EngineType id) const { return *queues_[id]; } + + void flushCUCaches() const + { + Pal::BarrierInfo barrier = {}; + barrier.pipePointWaitCount = 1; + Pal::HwPipePoint point = Pal::HwPipePostCs; + barrier.pPipePoints = &point; + barrier.transitionCount = 1; + Pal::BarrierTransition trans = {Pal::CoherShader, Pal::CoherShader, + {nullptr, { {Pal::ImageAspect::Color, 0, 0}, 0, 0 }, Pal::LayoutShaderRead, Pal::LayoutShaderRead}}; + barrier.pTransitions = &trans; + barrier.waitPoint = Pal::HwPipePreCs; + iCmd()->CmdBarrier(barrier); + } + + void eventBegin(EngineType engId) const { + const static bool Begin = true; + profileEvent(engId, Begin); + } + + void eventEnd(EngineType engId, GpuEvent& event) const { + const static bool End = false; + profileEvent(engId, End); + event.id = queues_[engId]->submit(); + event.engineId_ = engId; + } + + void waitForEvent(GpuEvent* event) const { + if (event->isValid()) { + assert(event->engineId_ < AllEngines); + queues_[event->engineId_]->waitForEvent(event->id); + event->invalidate(); + } + } + + bool isDone(GpuEvent* event) { + if (event->isValid()) { + assert(event->engineId_ < AllEngines); + if (queues_[event->engineId_]->isDone(event->id)) { + event->invalidate(); + return true; + } + return false; + } + return true; + } + + //! 
Returns TRUE if SDMA requires overlap synchronizaiton + bool validateSdmaOverlap( + const Resource& src, //!< Source resource for SDMA transfer + const Resource& dst //!< Destination resource for SDMA transfer + ); +protected: + void profileEvent(EngineType engine, bool type) const; + + //! Creates buffer object from image + amd::Memory* createBufferFromImage( + amd::Memory& amdImage //! The parent image object(untiled images only) + ) const; + +private: + struct MemoryRange { + uint64_t start_; //!< Memory range start address + uint64_t end_; //!< Memory range end address + MemoryRange(): start_(0), end_(0) {} + }; + + typedef std::map GpuEvents; + + //! Finds total amount of necessary iterations + inline void findIterations( + const amd::NDRangeContainer& sizes, //!< Original workload sizes + const amd::NDRange& local, //!< Local workgroup size + amd::NDRange& groups, //!< Calculated workgroup sizes + amd::NDRange& remainder, //!< Calculated remainder sizes + size_t& extra //!< Amount of extra executions for remainder + ); + + //! Allocates constant buffers + bool allocConstantBuffers(); + + //! Releases stage write buffers + void releaseXferWrite(); + + //! Allocate hsaQueueMem_ + bool allocHsaQueueMem(); + + //! Awaits a command batch with a waiting event + bool awaitCompletion( + CommandBatch* cb, //!< Command batch for to wait + const amd::Event* waitingEvent = NULL //!< A waiting event + ); + + //! Detects memory dependency for HSAIL kernels and flushes caches + bool processMemObjectsHSA( + const amd::Kernel& kernel, //!< AMD kernel object for execution + const_address params, //!< Pointer to the param's store + bool nativeMem, //!< Native memory objects + std::vector* memList //!< Memory list for KMD tracking + ); + + //! 
Common function for fill memory used by both svm Fill and non-svm fill + bool fillMemory( + cl_command_type type, //!< the command type + amd::Memory* amdMemory, //!< memory object to fill + const void* pattern, //!< pattern to fill the memory + size_t patternSize, //!< pattern size + const amd::Coord3D& origin, //!< memory origin + const amd::Coord3D& size //!< memory size for filling + ); + + bool copyMemory( + cl_command_type type, //!< the command type + amd::Memory& srcMem, //!< source memory object + amd::Memory& dstMem, //!< destination memory object + bool entire, //!< flag of entire memory copy + const amd::Coord3D& srcOrigin, //!< source memory origin + const amd::Coord3D& dstOrigin, //!< destination memory object + const amd::Coord3D& size, //!< copy size + const amd::BufferRect& srcRect, //!< region of source for copy + const amd::BufferRect& dstRect //!< region of destination for copy + ); + + void buildKernelInfo( + const HSAILKernel& hsaKernel, //!< hsa kernel + hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch + HwDbgKernelInfo& kernelInfo, //!< kernel info for the dispatch + amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command + ); + + void assignDebugTrapHandler( + const DebugToolInfo& dbgSetting, //!< debug settings + HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch + ); + + GpuEvents gpuEvents_; //!< GPU events + + Device& gpuDevice_; //!< physical GPU device + amd::Monitor execution_; //!< Lock to serialise access to all device objects + uint index_; //!< The virtual device unique index + + PrintfDbg* printfDbg_; //!< GPU printf implemenation + PrintfDbgHSA* printfDbgHSA_; //!< HSAIL printf implemenation + + TimeStampCache* tsCache_; //!< TimeStamp cache + MemoryDependency memoryDependency_; //!< Memory dependency class + + DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management + + std::list xferWriteBuffers_; //!< Stage write buffers + std::list pinnedMems_;//!< Pinned memory list + + typedef 
std::list CommandBatchList; + CommandBatchList cbList_; //!< List of command batches + + uint hwRing_; //!< HW ring used on this virtual device + + uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps + TimeStamp* currTs_; //!< current timestamp for command + + AmdVQueueHeader* vqHeader_; //!< Sysmem copy for virtual queue header + Memory* virtualQueue_; //!< Virtual device queue + Memory* schedParams_; //!< The scheduler parameters + uint schedParamIdx_; //!< Index in the scheduler parameters buffer + uint deviceQueueSize_; //!< Device queue size + uint maskGroups_; //!< The number of mask groups processed in the scheduler by one thread + + Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object + Pal::ICmdAllocator* cmdAllocator_; //!< Command buffer allocator + Queue* queues_[AllEngines]; //!< HW queues for all engines + MemoryRange sdmaRange_; //!< SDMA memory range for write access +}; + +/*@}*/} // namespace pal + +#endif /*PALVIRTUAL_HPP_*/ diff --git a/projects/clr/rocclr/runtime/device/pal/palwavelimiter.cpp b/projects/clr/rocclr/runtime/device/pal/palwavelimiter.cpp new file mode 100644 index 0000000000..fec26ba8a7 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palwavelimiter.cpp @@ -0,0 +1,354 @@ +// +// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. 
+// +#include "device/pal/palkernel.hpp" +#include "device/pal/palwavelimiter.hpp" +#include "os/os.hpp" +#include "utils/flags.hpp" + +#include +using namespace std; + +namespace pal { + +uint WaveLimiter::MaxWave; +uint WaveLimiter::WarmUpCount; +uint WaveLimiter::RunCount; +uint WLAlgorithmSmooth::AdaptCount; +uint WLAlgorithmSmooth::AbandonThresh; +uint WLAlgorithmSmooth::DscThresh; + +WaveLimiter::WaveLimiter( + HSAILKernel* owner, + uint seqNum, + bool enable, + bool enableDump): + owner_(owner), + dumper_(owner_->name() + "_" + std::to_string(seqNum), enableDump) { + auto gpuDev = static_cast(&owner_->dev()); +Unimplemented(); + //auto attrib = gpuDev->getAttribs(); + auto hwInfo = gpuDev->hwInfo(); + setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH, + /*attrib.numberOfCUsperShaderArray*/ 8 * hwInfo->simdPerCU_); + MaxWave = GPU_WAVE_LIMIT_MAX_WAVE; + WarmUpCount = GPU_WAVE_LIMIT_WARMUP; + RunCount = GPU_WAVE_LIMIT_RUN * MaxWave; + + state_ = WARMUP; + if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) { + traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + owner_->name() + + ".txt"); + } + + waves_ = MaxWave; + currWaves_ = MaxWave; + bestWave_ = MaxWave; + enable_ = enable; +} + +WaveLimiter::~WaveLimiter() { + if (traceStream_.is_open()) { + traceStream_.close(); + } +} + +uint WaveLimiter::getWavesPerSH(){ + currWaves_ = waves_; + return waves_ * SIMDPerSH_; +} + +WLAlgorithmSmooth::WLAlgorithmSmooth(HSAILKernel* owner, uint seqNum, bool enable, bool enableDump): + WaveLimiter(owner, seqNum, enable, enableDump) { + AdaptCount = 2 * MaxWave + 1; + AbandonThresh = GPU_WAVE_LIMIT_ABANDON; + DscThresh = GPU_WAVE_LIMIT_DSC_THRESH; + + dynRunCount_ = RunCount; + measure_.resize(MaxWave + 1); + reference_.resize(MaxWave + 1); + trial_.resize(MaxWave + 1); + ratio_.resize(MaxWave + 1); + + clearData(); +} + +WLAlgorithmSmooth::~WLAlgorithmSmooth() { + +} + +void WLAlgorithmSmooth::clearData() { + waves_ = MaxWave; + countAll_ = 0; + clear(measure_); + 
clear(reference_); + clear(trial_); + clear(ratio_); + discontinuous_ = false; + dataCount_ = 0; +} + +void WLAlgorithmSmooth::updateData(ulong time) { + auto count = dataCount_ - 1; + assert(count < 2 * MaxWave + 1); + assert(time > 0); + assert(currWaves_ == waves_); + if (count % 2 == 0) { + assert(waves_ == MaxWave); + auto pos = count / 2; + measure_[pos] = time; + if (pos > 0) { + auto wave = MaxWave + 1 - pos; + if (abs(static_cast(measure_[pos - 1]) - + static_cast(measure_[pos])) * 100 / measure_[pos] > + DscThresh) { + discontinuous_ = true; + } + reference_[wave] = (time + measure_[pos - 1]) / 2; + ratio_[wave] = trial_[wave] * 100 / reference_[wave]; + if (ratio_[bestWave_] > ratio_[wave] && !discontinuous_) { + bestWave_ = wave; + } + } + } else { + assert(waves_ == MaxWave - count / 2); + trial_[waves_] = time; + } + outputTrace(); +} + +void WLAlgorithmSmooth::outputTrace() { + if (!traceStream_.is_open()) { + return; + } + + traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_ + << " currWaves=" << currWaves_ << " waves=" << waves_ + << " bestWave=" << bestWave_ << '\n'; + output(traceStream_, "\n measure = ", measure_); + output(traceStream_, "\n reference = ", reference_); + output(traceStream_, "\n ratio = ", ratio_); + traceStream_ << "\n\n"; +} + + +void WLAlgorithmSmooth::callback(ulong duration) { + dumper_.addData(duration, currWaves_, static_cast(state_)); + + if (!enable_) { + return; + } + + countAll_++; + + switch (state_) { + case WARMUP: + if (countAll_ < WarmUpCount) { + return; + } + state_ = ADAPT; + bestWave_ = MaxWave; + clearData(); + return; + case ADAPT: + assert(duration > 0); + if (waves_ == currWaves_) { + dataCount_++; + updateData(duration); + waves_ = MaxWave + 1 - dataCount_ / 2; + if (dataCount_ == 1 || (dataCount_ < AdaptCount && + !discontinuous_ && (dataCount_ % 2 == 0 || + ratio_[waves_] < AbandonThresh))) { + if (dataCount_ % 2 == 1) { + --waves_; + } else { + waves_ = MaxWave; + } + return; + 
} + waves_ = bestWave_; + if (dataCount_ >= AdaptCount) { + dynRunCount_ = RunCount; + } else { + dynRunCount_ = AdaptCount; + } + countAll_ = rand() % MaxWave; + state_ = RUN; + } + return; + case RUN: + if (countAll_ < dynRunCount_) { + return; + } + state_ = ADAPT; + bestWave_ = MaxWave; + clearData(); + return; + } +} + +WaveLimiter::DataDumper::DataDumper(const std::string &kernelName, bool enable) { + enable_ = enable; + if (enable_) { + fileName_ = std::string(GPU_WAVE_LIMIT_DUMP) + kernelName + ".csv"; + } +} + +WaveLimiter::DataDumper::~DataDumper() { + if (!enable_) { + return; + } + + std::ofstream OFS(fileName_); + for (size_t i = 0, e = time_.size(); i != e; ++i) { + OFS << i << ',' << time_[i] << ',' << wavePerSIMD_[i] << ',' + << static_cast(state_[i]) << '\n'; + } + OFS.close(); +} + +void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) { + if (!enable_) { + return; + } + + time_.push_back(time); + wavePerSIMD_.push_back(wave); + state_.push_back(state); +} + +WLAlgorithmAvrg::WLAlgorithmAvrg(HSAILKernel* owner, uint seqNum, bool enable, bool enableDump): + WaveLimiter(owner, seqNum, enable, enableDump) { + + measure_.resize(MaxWave + 1); + clear(measure_); + countAll_ = 0; +} + +WLAlgorithmAvrg::~WLAlgorithmAvrg() { + +} + +void WLAlgorithmAvrg::outputTrace() { + if (!traceStream_.is_open()) { + return; + } + + traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_ + << " currWaves=" << currWaves_ << " waves=" << waves_ + << " bestWave=" << bestWave_ << '\n'; + output(traceStream_, "\n measure = ", measure_); + traceStream_ << "\n\n"; +} + + +void WLAlgorithmAvrg::callback(ulong duration) { + dumper_.addData(duration, currWaves_, static_cast(state_)); + + if (!enable_) { + return; + } + + countAll_++; + + switch (state_) { + case WARMUP: + state_ = ADAPT; + case ADAPT: + measure_[waves_] += duration; + if (countAll_ <= MaxWave * 5) { + waves_--; + if (waves_ == 0) { + waves_ = MaxWave; + } + } + else { + 
bestWave_ = MaxWave; + for (uint i=1; i(&owner_->dev()); + Unimplemented(); + //auto attrib = gpuDev->getAttribs(); + auto hwInfo = gpuDev->hwInfo(); + unsigned simdPerSH = 0; + setIfNotDefault(simdPerSH, GPU_WAVE_LIMIT_CU_PER_SH, + /*attrib.numberOfCUsperShaderArray*/ 8 * hwInfo->simdPerCU_); + fixed_ = GPU_WAVES_PER_SIMD * simdPerSH; +} + +WaveLimiterManager::~WaveLimiterManager() { + for (auto &I: limiters_) { + delete I.second; + } +} + +uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice *vdev) const { + if (fixed_ > 0) { + return fixed_; + } + if (!enable_) { + return 0; + } + auto loc = limiters_.find(vdev); + if (loc == limiters_.end()) { + return 0; + } + assert(loc->second != nullptr); + return loc->second->getWavesPerSH(); +} + +amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback( + const device::VirtualDevice *vdev) { + assert(vdev != nullptr); + if (!enable_ && !enableDump_) { + return nullptr; + } + + amd::ScopedLock SL(monitor_); + auto loc = limiters_.find(vdev); + if (loc != limiters_.end()) { + return loc->second; + } + + auto limiter = new WLAlgorithmSmooth(owner_, limiters_.size(), enable_, + enableDump_); + if (limiter == nullptr) { + enable_ = false; + return nullptr; + } + limiters_[vdev] = limiter; + return limiter; +} + +void WaveLimiterManager::enable() { + if (fixed_ > 0) { + return; + } + auto gpuDev = static_cast(&owner_->dev()); + auto hwInfo = gpuDev->hwInfo(); + Unimplemented(); + // Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1 + // Disabled for SI due to bug #10817 + setIfNotDefault(enable_, GPU_WAVE_LIMIT_ENABLE, + /*owner_->workGroupInfo()->limitWave_*/ false && gpuDev->settings().ciPlus_); +} + +} // namespace pal + diff --git a/projects/clr/rocclr/runtime/device/pal/palwavelimiter.hpp b/projects/clr/rocclr/runtime/device/pal/palwavelimiter.hpp new file mode 100644 index 0000000000..b75c49fbe5 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/pal/palwavelimiter.hpp @@ -0,0 
+1,154 @@ +// +// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. +// + +#ifndef PALWAVELIMITER_HPP_ +#define PALWAVELIMITER_HPP_ + +#include "platform/command.hpp" +#include "thread/thread.hpp" +#include +#include +#include +#include +#include + +//! \namespace pal PAL Device Implementation +namespace pal { + +class HSAILKernel; + +// Adaptively limit the number of waves per SIMD based on kernel execution time +class WaveLimiter: public amd::ProfilingCallback { +public: + explicit WaveLimiter(HSAILKernel*, uint seqNum, bool enable, bool enableDump); + virtual ~WaveLimiter(); + + //! Get waves per shader array to be used for kernel execution. + uint getWavesPerSH(); + +protected: + enum StateKind { + WARMUP, ADAPT, RUN + }; + + class DataDumper { + public: + explicit DataDumper(const std::string &kernelName, bool enable); + ~DataDumper(); + + //! Record execution time, waves/simd and state of wave limiter. + void addData(ulong time, uint wave, char state); + + //! Whether this data dumper is enabled. + bool enabled() const { return enable_;} + private: + bool enable_; + std::string fileName_; + std::vector time_; + std::vector wavePerSIMD_; + std::vector state_; + }; + + std::vector measure_; + bool enable_; + uint SIMDPerSH_; // Number of SIMDs per SH + uint waves_; // Waves per SIMD to be set + uint bestWave_; // Optimal waves per SIMD + uint countAll_; // Number of kernel executions + StateKind state_; + HSAILKernel *owner_; + DataDumper dumper_; + std::ofstream traceStream_; + uint currWaves_; // Current waves per SIMD + + static uint MaxWave; // Maximum number of waves per SIMD + static uint WarmUpCount; // Number of kernel executions for warm up + static uint RunCount; // Number of kernel executions for normal run + + //! Call back from Event::recordProfilingInfo to get execution time. + virtual void callback(ulong duration)=0; + + //! Output trace of measurement/adaptation. 
+ virtual void outputTrace()=0; + + template void clear(T& A) { + for (auto &I : A) { + I = 0; + } + } + template void output(std::ofstream &ofs, const std::string &prompt, + T& A) { + ofs << prompt; + for (auto &I : A) { + ofs << ' ' << static_cast(I); + } + } +}; + +class WLAlgorithmSmooth: public WaveLimiter { +public: + explicit WLAlgorithmSmooth(HSAILKernel* owner, uint seqNum, bool enable, bool enableDump); + virtual ~WLAlgorithmSmooth(); +private: + std::vector reference_; + std::vector trial_; + std::vector ratio_; + bool discontinuous_; // Measured data is discontinuous + uint dynRunCount_; + uint dataCount_; + + static uint AdaptCount; // Number of kernel executions for adapting + static uint AbandonThresh; // Threshold to abandon adaptation + static uint DscThresh; // Threshold for identifying discontinuities + + //! Update measurement data and optimal waves/simd with execution time. + void updateData(ulong time); + + //! Clear measurement data for the next adaptation. + void clearData(); + + //! Call back from Event::recordProfilingInfo to get execution time. + void callback(ulong duration); + + //! Output trace of measurement/adaptation. + void outputTrace(); +}; + +class WLAlgorithmAvrg: public WaveLimiter { +public: + explicit WLAlgorithmAvrg(HSAILKernel* owner, uint seqNum, bool enable, bool enableDump); + virtual ~WLAlgorithmAvrg(); +private: + //! Call back from Event::recordProfilingInfo to get execution time. + void callback(ulong duration); + + //! Output trace of measurement/adaptation. + void outputTrace(); +}; + +// Create wave limiter for each virtual device for a kernel and manages the wave limiters. +class WaveLimiterManager { +public: + explicit WaveLimiterManager(HSAILKernel* owner); + virtual ~WaveLimiterManager(); + + //! Get waves per shader array for a specific virtual device. + uint getWavesPerSH(const device::VirtualDevice *) const; + + //! Provide call back function for a specific virtual device. 
+ amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice *); + + //! Enable wave limiter manager by kernel metadata and flags. + void enable(); +private: + HSAILKernel* owner_; //!< The kernel which owns this object + std::unordered_map limiters_; //!< Maps virtual device to wave limiter + bool enable_; //!< Whether the adaptation is enabled + bool enableDump_; //!< Whether the data dumper is enabled + uint fixed_; //!< The fixed waves/simd value if not zero + amd::Monitor monitor_; //!< The mutex for updating the wave limiter map +}; +} +#endif diff --git a/projects/clr/rocclr/runtime/top.hpp b/projects/clr/rocclr/runtime/top.hpp index 32ba1616f6..e2131f4403 100644 --- a/projects/clr/rocclr/runtime/top.hpp +++ b/projects/clr/rocclr/runtime/top.hpp @@ -137,6 +137,10 @@ class HeapObject public: void* operator new(size_t size); void operator delete(void* obj); + void* operator new(size_t size, size_t extSize) + { return HeapObject::operator new (size + extSize); }; + void operator delete(void* obj, size_t extSize) + { HeapObject::operator delete (obj); } }; /*! \brief For all reference counted objects. @@ -154,6 +158,10 @@ public: void* operator new(size_t size) { return ::operator new(size); } void operator delete(void* p) { return ::operator delete(p); } + void* operator new(size_t size, size_t extSize) + { return ReferenceCountedObject::operator new (size + extSize); }; + void operator delete(void* obj, size_t extSize) + { ReferenceCountedObject::operator delete (obj); } uint referenceCount() const { return referenceCount_; }