From 7df0da7bacee7dbd5fca6513107b6795831d569a Mon Sep 17 00:00:00 2001 From: foreman Date: Fri, 6 Nov 2015 03:40:14 -0500 Subject: [PATCH] P4 to Git Change 1208254 by nhaustov@nhaustov_hsa on 2015/11/06 03:25:21 SWDEV-77584 - Remove old OpenCL hsa device and loader. Reviewed by: Evgeniy Mankov Testing: pre-checkin Affected files ... ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/build/Makefile.api#128 edit ... //depot/stg/opencl/drivers/opencl/compiler/loader/Makefile#2 delete ... //depot/stg/opencl/drivers/opencl/compiler/loader/libloader/Makefile#2 delete ... //depot/stg/opencl/drivers/opencl/compiler/loader/libloader/build/Makefile#3 delete ... //depot/stg/opencl/drivers/opencl/compiler/loader/libloader/build/Makefile.libloader#11 delete ... //depot/stg/opencl/drivers/opencl/compiler/loader/libloader/hsacore_symbol_loader.cpp#3 delete ... //depot/stg/opencl/drivers/opencl/compiler/loader/libloader/hsacore_symbol_loader.hpp#3 delete ... //depot/stg/opencl/drivers/opencl/compiler/loader/libloader/loader.cpp#14 delete ... //depot/stg/opencl/drivers/opencl/compiler/loader/libloader/loader.hpp#6 delete ... //depot/stg/opencl/drivers/opencl/runtime/Makefile#20 edit ... //depot/stg/opencl/drivers/opencl/runtime/build/Makefile.runtime#61 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#190 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/Makefile#8 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/build/Makefile#5 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/build/Makefile.oclhsa#23 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsaappprofile.cpp#4 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsaappprofile.hpp#4 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsabinary.cpp#8 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsabinary.hpp#5 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsablit.cpp#10 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsablit.hpp#3 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsacompiler.cpp#27 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsacompilerlib.cpp#13 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsacompilerlib.hpp#10 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsacore_symbol_loader.cpp#8 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsacore_symbol_loader.hpp#8 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsacounters.cpp#5 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsacounters.hpp#3 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsadefs.hpp#5 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsadevice.cpp#95 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsadevice.hpp#51 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsakernel.cpp#27 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsakernel.hpp#20 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsamemory.cpp#43 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsamemory.hpp#28 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsaprogram.cpp#39 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsaprogram.hpp#20 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsasettings.cpp#40 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsasettings.hpp#13 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsavirtual.cpp#99 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsavirtual.hpp#29 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/oclhsa.def#2 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/oclhsa_common.hpp#4 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/services_symbol_loader.cpp#10 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/services_symbol_loader.hpp#11 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/system_memory.h#2 delete ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsaappprofile.cpp#2 edit --- rocclr/runtime/device/device.cpp | 2 +- rocclr/runtime/device/hsa/hsaappprofile.cpp | 61 - rocclr/runtime/device/hsa/hsaappprofile.hpp | 27 - rocclr/runtime/device/hsa/hsabinary.cpp | 152 -- rocclr/runtime/device/hsa/hsabinary.hpp | 56 - rocclr/runtime/device/hsa/hsablit.cpp | 1838 ----------------- rocclr/runtime/device/hsa/hsablit.hpp | 401 ---- rocclr/runtime/device/hsa/hsacompiler.cpp | 163 -- rocclr/runtime/device/hsa/hsacompilerlib.cpp | 67 - rocclr/runtime/device/hsa/hsacompilerlib.hpp | 92 - .../device/hsa/hsacore_symbol_loader.cpp | 53 - .../device/hsa/hsacore_symbol_loader.hpp | 75 - rocclr/runtime/device/hsa/hsacounters.cpp | 144 -- rocclr/runtime/device/hsa/hsacounters.hpp | 103 - rocclr/runtime/device/hsa/hsadefs.hpp | 42 - rocclr/runtime/device/hsa/hsadevice.cpp | 896 -------- rocclr/runtime/device/hsa/hsadevice.hpp | 334 --- rocclr/runtime/device/hsa/hsakernel.cpp | 573 ----- rocclr/runtime/device/hsa/hsakernel.hpp | 161 -- rocclr/runtime/device/hsa/hsamemory.cpp | 938 --------- rocclr/runtime/device/hsa/hsamemory.hpp | 202 -- rocclr/runtime/device/hsa/hsaprogram.cpp | 726 ------- rocclr/runtime/device/hsa/hsaprogram.hpp | 160 -- rocclr/runtime/device/hsa/hsasettings.cpp | 81 - rocclr/runtime/device/hsa/hsasettings.hpp | 65 - rocclr/runtime/device/hsa/hsavirtual.cpp | 1544 -------------- rocclr/runtime/device/hsa/hsavirtual.hpp | 181 -- rocclr/runtime/device/hsa/oclhsa.def | 3 - rocclr/runtime/device/hsa/oclhsa_common.hpp | 26 - .../device/hsa/services_symbol_loader.cpp | 52 - .../device/hsa/services_symbol_loader.hpp | 78 - rocclr/runtime/device/hsa/system_memory.h | 97 - 32 files changed, 1 insertion(+), 9392 deletions(-) delete mode 100644 rocclr/runtime/device/hsa/hsaappprofile.cpp delete mode 100644 rocclr/runtime/device/hsa/hsaappprofile.hpp delete mode 100644 rocclr/runtime/device/hsa/hsabinary.cpp delete mode 100644 rocclr/runtime/device/hsa/hsabinary.hpp delete mode 100644 rocclr/runtime/device/hsa/hsablit.cpp delete mode 100644 rocclr/runtime/device/hsa/hsablit.hpp delete mode 100644 rocclr/runtime/device/hsa/hsacompiler.cpp delete mode 100644 rocclr/runtime/device/hsa/hsacompilerlib.cpp delete mode 100644 rocclr/runtime/device/hsa/hsacompilerlib.hpp delete mode 100644 rocclr/runtime/device/hsa/hsacore_symbol_loader.cpp delete mode 100644 rocclr/runtime/device/hsa/hsacore_symbol_loader.hpp delete mode 100644 rocclr/runtime/device/hsa/hsacounters.cpp delete mode 100644 rocclr/runtime/device/hsa/hsacounters.hpp delete mode 100644 rocclr/runtime/device/hsa/hsadefs.hpp delete mode 100644 rocclr/runtime/device/hsa/hsadevice.cpp delete mode 100644 rocclr/runtime/device/hsa/hsadevice.hpp delete mode 100644 rocclr/runtime/device/hsa/hsakernel.cpp delete mode 100644 rocclr/runtime/device/hsa/hsakernel.hpp delete mode 100644 rocclr/runtime/device/hsa/hsamemory.cpp delete mode 100644 rocclr/runtime/device/hsa/hsamemory.hpp delete mode 100644 rocclr/runtime/device/hsa/hsaprogram.cpp delete mode 100644 rocclr/runtime/device/hsa/hsaprogram.hpp delete mode 100644 rocclr/runtime/device/hsa/hsasettings.cpp delete mode 100644 rocclr/runtime/device/hsa/hsasettings.hpp delete mode 100644 rocclr/runtime/device/hsa/hsavirtual.cpp delete mode 100644 rocclr/runtime/device/hsa/hsavirtual.hpp delete mode 100644 rocclr/runtime/device/hsa/oclhsa.def delete mode 100644 rocclr/runtime/device/hsa/oclhsa_common.hpp delete mode 100644 rocclr/runtime/device/hsa/services_symbol_loader.cpp delete mode 100644 rocclr/runtime/device/hsa/services_symbol_loader.hpp delete mode 100644 rocclr/runtime/device/hsa/system_memory.h diff --git a/rocclr/runtime/device/device.cpp b/rocclr/runtime/device/device.cpp index 378fd59a37..132e884720 100644 --- a/rocclr/runtime/device/device.cpp +++ b/rocclr/runtime/device/device.cpp @@ -7,7 +7,7 @@ #include "thread/monitor.hpp" #if defined(WITH_HSA_DEVICE) -#include "device/hsa/hsadevice.hpp" +#include "device/hsa_foundation/hsadevice.hpp" extern amd::AppProfile* oclhsaCreateAppProfile(); #endif diff --git a/rocclr/runtime/device/hsa/hsaappprofile.cpp b/rocclr/runtime/device/hsa/hsaappprofile.cpp deleted file mode 100644 index ae19bd7c0e..0000000000 --- a/rocclr/runtime/device/hsa/hsaappprofile.cpp +++ /dev/null @@ -1,61 +0,0 @@ -// -// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved. -// - - -#ifndef WITHOUT_FSA_BACKEND - -#include "top.hpp" -#include "device/device.hpp" -#include "device/appprofile.hpp" -#include "device/hsa/hsaappprofile.hpp" - -#include - -amd::AppProfile* oclhsaCreateAppProfile() -{ - amd::AppProfile* appProfile = new oclhsa::AppProfile; - - if ((appProfile == NULL) || !appProfile->init()) { - return NULL; - } - - return appProfile; -} - -namespace oclhsa { - -bool AppProfile::ParseApplicationProfile() -{ - std::string appName("Explorer"); - - std::transform(appName.begin(), appName.end(), appName.begin(), ::tolower); - std::transform(appFileName_.begin(), appFileName_.end(), appFileName_.begin(), ::tolower); - - if (appFileName_.compare(appName) == 0 ) { - hsaDeviceHint_ = CL_HSA_DISABLED_AMD; - gpuvmHighAddr_ = false; - noHsaInit_ = true; - profileOverridesAllSettings_ = true; - } - - // Setting both bits is invalid, make it niether. - if (hsaDeviceHint_ & CL_HSA_ENABLED_AMD - && hsaDeviceHint_ & CL_HSA_DISABLED_AMD) { - hsaDeviceHint_ = 0; - } - - if (noHsaInit_) { - // If no HSA initialization, then force hint flag to non-HSA device. - // Even if this is not forced, the device selection logic will endure it. - // After all hint flags are treated as hint only - depending on - // availibility. - hsaDeviceHint_ = CL_HSA_DISABLED_AMD; - } - - return true; -} - -} - -#endif diff --git a/rocclr/runtime/device/hsa/hsaappprofile.hpp b/rocclr/runtime/device/hsa/hsaappprofile.hpp deleted file mode 100644 index e2cac7d71f..0000000000 --- a/rocclr/runtime/device/hsa/hsaappprofile.hpp +++ /dev/null @@ -1,27 +0,0 @@ -// -// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved. -// -#ifndef HSAAPPPROFILE_HPP_ -#define HSAAPPPROFILE_HPP_ - - -#ifndef WITHOUT_FSA_BACKEND - -namespace oclhsa { - -class AppProfile : public amd::AppProfile -{ -public: - AppProfile(): amd::AppProfile() {} - -protected: - //! parse application profile based on application file name - virtual bool ParseApplicationProfile(); -}; - -} - -#endif - -#endif - diff --git a/rocclr/runtime/device/hsa/hsabinary.cpp b/rocclr/runtime/device/hsa/hsabinary.cpp deleted file mode 100644 index 342b8214c7..0000000000 --- a/rocclr/runtime/device/hsa/hsabinary.cpp +++ /dev/null @@ -1,152 +0,0 @@ -// -// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved. -// - - -#ifndef WITHOUT_FSA_BACKEND - - -#include "hsabinary.hpp" -#include "hsaprogram.hpp" -#include "hsakernel.hpp" -#include "utils/options.hpp" -#include "os/os.hpp" -#include -#include - - - -namespace oclhsa { - /* -bool -ClBinary::loadKernels(FSAILProgram& program, NameKernelMap &kernels) -{ - return true; - - - const char _kernel[] = "_kernel"; - const char __FSA_[] = "__FSA_"; - const char _header[] = "_header"; - const char _fsail[] = "_fsail"; - bool hasKernels = false; - - // TODO : jugu - // Target should be 15 bit maximum. Should check this somewhere. - uint32_t target = static_cast(21);//dev().calTarget()); - uint16_t elf_target; - amd::OclElf::oclElfPlatform platform; - if (!elfIn()->getTarget(elf_target, platform) || - (platform != amd::OclElf::CAL_PLATFORM) || - ((uint32_t)target != elf_target)) { - // warning ! - // LogError("The OCL binary image loading failed: different target"); - - // LHOWES TODO: target in kannan's elf is wrong so skip this for now - // We may want a special HSA target or a similar more substantial change. - // return false; - } - - for (amd::Sym_Handle sym = elfIn()->nextSymbol(NULL); - sym != NULL; - sym = elfIn()->nextSymbol(sym)) { - amd::OclElf::SymbolInfo symInfo; - if (!elfIn()->getSymbolInfo(sym, &symInfo)) { - LogError("LoadKernelFromElf: getSymbolInfo() fails"); - return false; - } - - std::string elfSymName(symInfo.sym_name); - - const size_t offset = sizeof(__FSA_) - 1; - if (elfSymName.compare(0, offset, __FSA_) != 0) { - continue; - } - - // Assume this elfSymName is associated with a kernel name. The folloiwng code will adjust - // If it isn't. - const size_t suffixPos = elfSymName.rfind('_'); - bool isKernel = true; // assume it is a kernel - std::string functionName = elfSymName.substr(sizeof(__FSA_)-1, suffixPos-(sizeof(__FSA_)-1)); - //"__OpenCL_"; - //functionName.append(elfSymName.substr(sizeof(__FSA_)-1, suffixPos-(sizeof(__FSA_)-1))); - //functionName.append("_kernel"); // make the kernel's linkage name - - // Find kernel in map and get its kernel representation - NameKernelMap::iterator searchIterator = kernels.find(functionName); - Kernel *currentKernel = 0; - if( searchIterator == kernels.end() ) { - // TODO: note, this will need to be decided on based on the the device type. As we have no CPU yet... - //currentKernel = new Kernel(functionName); - //kernels[functionName] = currentKernel; - } else { - currentKernel = static_cast(searchIterator->second); - } - - - // Add info for this elf symbol into tempobj's functionNameMap[] - if (elfSymName.compare(suffixPos, sizeof(_fsail) - 1, _fsail) == 0) { - - assert (currentKernel->hasFSAIL() && - "More than one fsail symbol for a kernel"); - // LHOWES TODO: Currently this is using the section address and size because - // we only have a single kernel and there is a bug in the current AMP compiler. - // Kannan is working on fixing this and once we have the symbol address and size - // correct in the metadata then we can change this and it'll work properly for - // multiple kernels. - std::string options(""); - std::string fsailString(symInfo.sec_addr, symInfo.sec_addr + symInfo.sec_size); - currentKernel->setFSAIL(fsailString); - //currentKernel->compile(options); - - } - - - // LHOWES - // Hack to assume that this is the AMP path for now - // until we have kernel metadata we need a way to generate the parameter list. - { - device::Kernel::parameters_t parameterList; - // Is AMP code - - amd::KernelParameterDescriptor desc; - desc.name_ = "Functor"; - desc.type_ = T_POINTER; - - desc.size_ = sizeof(void*); - desc.offset_ = 0; - - // BKENDALL HACK - desc.typeName_ = ""; - desc.typeQualifier_ = 0; - desc.accessQualifier_ = 0; - desc.addressQualifier_ = 0; - // !BKENDALL HACK - - parameterList.push_back(desc); - // oclhsa OpenCL integration - } - - hasKernels = true; - } - - - return hasKernels; - -} - */ -/* -bool -ClBinary::clearElfOut() -{ - // Recreate libelf elf object - if (!elfOut()->Clear()) { - return false; - } - - // Need to re-setup target - return setElfTarget(); -} -*/ -} // namespace oclhsa - -#endif // WITHOUT_FSA_BACKEND diff --git a/rocclr/runtime/device/hsa/hsabinary.hpp b/rocclr/runtime/device/hsa/hsabinary.hpp deleted file mode 100644 index 5fa3ab53ba..0000000000 --- a/rocclr/runtime/device/hsa/hsabinary.hpp +++ /dev/null @@ -1,56 +0,0 @@ -// -// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved. -// -#ifndef HSABINARY_HPP_ -#define HSABINARY_HPP_ - -#include "top.hpp" -#include "hsadevice.hpp" - -#ifndef WITHOUT_FSA_BACKEND - -namespace oclhsa { - - -typedef std::map NameKernelMap; - -class FSAILProgram; - -class ClBinary : public device::ClBinary -{ -public: - ClBinary(const Device& dev, BinaryImageFormat bifVer = BIF_VERSION3) - : device::ClBinary(dev, bifVer) - {} - - //! Destructor - ~ClBinary() {} - - -protected: - bool setElfTarget() { - uint32_t target = static_cast(21);//dev().calTarget()); - assert (((0xFFFF8000 & target) == 0) && "ASIC target ID >= 2^15"); - uint16_t elf_target = (uint16_t)(0x7FFF & target); - return elfOut()->setTarget(elf_target, amd::OclElf::CAL_PLATFORM); - return true; - } - -private: - //! Disable default copy constructor - ClBinary(const ClBinary&); - - //! Disable default operator= - ClBinary& operator=(const ClBinary&); - - //! Returns the HSA device for this object - const Device& dev() const { return static_cast(dev_); } - -}; - -} // namespace oclhsa - -#endif // WITHOUT_FSA_BACKEND - -#endif // HSABINARY_HPP_ - diff --git a/rocclr/runtime/device/hsa/hsablit.cpp b/rocclr/runtime/device/hsa/hsablit.cpp deleted file mode 100644 index ff7a735534..0000000000 --- a/rocclr/runtime/device/hsa/hsablit.cpp +++ /dev/null @@ -1,1838 +0,0 @@ -// -// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved. -// - -#include "platform/commandqueue.hpp" -#include "device/hsa/hsadevice.hpp" -#include "device/hsa/hsablit.hpp" -#include "device/hsa/hsamemory.hpp" -#include "device/hsa/hsavirtual.hpp" -#include "device/hsa/oclhsa_common.hpp" -#include "utils/debug.hpp" - -namespace oclhsa { -HsaBlitManager::HsaBlitManager(device::VirtualDevice& vDev, Setup setup) - : HostBlitManager(vDev, setup) -{ } - -bool -HsaBlitManager::readBuffer( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - // Wait on the last outstanding kernel. - gpu().releaseGpuMemoryFence(); - - if (setup_.disableReadBuffer_ || srcMemory.isHostMemDirectAccess()) { - return HostBlitManager::readBuffer( - srcMemory, dstHost, origin, size, entire); - } - - void *src = static_cast(srcMemory).getDeviceMemory(); - - // Copy memory - HsaStatus status = hsacoreapi->HsaCopyMemory( - dstHost, reinterpret_cast(src) + origin[0], size[0]); - if (status != kHsaStatusSuccess) { - LogPrintfError("DMA buffer failed with code %d", status); - return false; - } - return true; -} - -bool -HsaBlitManager::readBufferRect( - device::Memory& srcMemory, - void* dstHost, - const amd::BufferRect& bufRect, - const amd::BufferRect& hostRect, - const amd::Coord3D& size, - bool entire) const -{ - // Wait on the last outstanding kernel. - gpu().releaseGpuMemoryFence(); - - if (setup_.disableReadBufferRect_ || srcMemory.isHostMemDirectAccess()) { - return HostBlitManager::readBufferRect( - srcMemory, dstHost, bufRect, hostRect, size, entire); - } - - void *src = static_cast(srcMemory).getDeviceMemory(); - - size_t srcOffset; - size_t dstOffset; - - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - srcOffset = bufRect.offset(0, y, z); - dstOffset = hostRect.offset(0, y, z); - - // Copy memory line by line - HsaStatus status = - hsacoreapi->HsaCopyMemory( - (reinterpret_cast
(dstHost) + dstOffset), - (reinterpret_cast(src) + srcOffset), - size[0]); - - if (status != kHsaStatusSuccess) { - LogPrintfError("DMA buffer failed with code %d", status); - return false; - } - } - } - - return true; -} - -bool -HsaBlitManager::readImage( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - // Wait on the last outstanding kernel. - gpu().releaseGpuMemoryFence(); - - oclhsa::Image &image = static_cast(srcMemory); - - const uint8_t *src = static_cast(image.getDeviceMemory()); - uint8_t* dst = static_cast(dstHost); - - const amd::Coord3D srcOffset = origin; - const amd::Coord3D dstOffset = amd::Coord3D(0); - - size_t srcRowPitch = image.getDeviceRowPitchSize(); - size_t srcSlicePitch = image.getDeviceSlicePitchSize(); - - size_t elementSize = - srcMemory.owner()->asImage()->getImageFormat().getElementSize(); - size_t dstRowPitch = - (rowPitch == 0) ? (size[0] * elementSize) : rowPitch; - size_t dstSlicePitch = - (slicePitch == 0) ? (size[1] * dstRowPitch) : slicePitch; - - const amd::Coord3D& sizeToCopy = size; - - return importExportImage( - dst, src, dstOffset, dstRowPitch, dstSlicePitch, srcOffset, srcRowPitch, - srcSlicePitch, sizeToCopy, elementSize); -} - -bool -HsaBlitManager::writeBuffer( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - // Wait on the last outstanding kernel. - gpu().releaseGpuMemoryFence(); - - if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess()) { - return HostBlitManager::writeBuffer( - srcHost, dstMemory, origin, size, entire); - } - - void *dst = static_cast(dstMemory).getDeviceMemory(); - - // Copy memory - HsaStatus status = - hsacoreapi->HsaCopyMemory( - reinterpret_cast
(dst) + origin[0], srcHost, size[0]); - - if (status != kHsaStatusSuccess) { - LogPrintfError("DMA buffer failed with code %d", status); - return false; - } - - return true; -} - -bool -HsaBlitManager::writeBufferRect( - const void* srcHost, - device::Memory& dstMemory, - const amd::BufferRect& hostRect, - const amd::BufferRect& bufRect, - const amd::Coord3D& size, - bool entire) const -{ - // Wait on the last outstanding kernel. - gpu().releaseGpuMemoryFence(); - - if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess()) { - return HostBlitManager::writeBufferRect( - srcHost, dstMemory, hostRect, bufRect, size, entire); - } - - void *dst = static_cast(dstMemory).getDeviceMemory(); - - size_t srcOffset; - size_t dstOffset; - - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - srcOffset = hostRect.offset(0, y, z); - dstOffset = bufRect.offset(0, y, z); - - // Copy memory line by line - HsaStatus status = - hsacoreapi->HsaCopyMemory( - (reinterpret_cast
(dst) + dstOffset), - (reinterpret_cast(srcHost) + srcOffset), - size[0]); - - if (status != kHsaStatusSuccess) { - LogPrintfError("DMA buffer failed with code %d", status); - return false; - } - } - } - - return true; -} - -bool -HsaBlitManager::writeImage( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - // Wait on the last outstanding kernel. - gpu().releaseGpuMemoryFence(); - - oclhsa::Image &image = static_cast(dstMemory); - - const uint8_t* src = static_cast(srcHost); - uint8_t *dst = static_cast(image.getDeviceMemory()); - - const amd::Coord3D srcOffset = amd::Coord3D(0); - const amd::Coord3D dstOffset = origin; - - size_t elementSize = - dstMemory.owner()->asImage()->getImageFormat().getElementSize(); - size_t srcRowPitch = - (rowPitch == 0) ? (size[0] * elementSize) : rowPitch; - size_t srcSlicePitch = - (slicePitch == 0) ? (size[1] * srcRowPitch) : slicePitch; - - size_t dstRowPitch = image.getDeviceRowPitchSize(); - size_t dstSlicePitch = image.getDeviceSlicePitchSize(); - - const amd::Coord3D& sizeToCopy = size; - - return importExportImage( - dst, src, dstOffset, dstRowPitch, dstSlicePitch, srcOffset, srcRowPitch, - srcSlicePitch, sizeToCopy, elementSize); -} - -bool -HsaBlitManager::copyBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire) const -{ - // Wait on the last outstanding kernel. - gpu().releaseGpuMemoryFence(); - - if (setup_.disableCopyBuffer_ || - (srcMemory.isHostMemDirectAccess() && - dstMemory.isHostMemDirectAccess())) { - return HostBlitManager::copyBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); - } - - void *src = static_cast(srcMemory).getDeviceMemory(); - void *dst = static_cast(dstMemory).getDeviceMemory(); - - // Straight forward buffer copy - HsaStatus status = - hsacoreapi->HsaCopyMemory( - (reinterpret_cast
(dst) + dstOrigin[0]), - (reinterpret_cast(src) + srcOrigin[0]), - size[0]); - - if (status != kHsaStatusSuccess) { - LogPrintfError("DMA buffer failed with code %d", status); - return false; - } - - return true; -} - -bool -HsaBlitManager::copyBufferRect( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::BufferRect& srcRect, - const amd::BufferRect& dstRect, - const amd::Coord3D& size, - bool entire) const -{ - // Wait on the last outstanding kernel. - gpu().releaseGpuMemoryFence(); - - if (setup_.disableCopyBuffer_ || - (srcMemory.isHostMemDirectAccess() && - dstMemory.isHostMemDirectAccess())) { - return HostBlitManager::copyBufferRect( - srcMemory, dstMemory, srcRect, dstRect, size, entire); - } - - void *src = static_cast(srcMemory).getDeviceMemory(); - void *dst = static_cast(dstMemory).getDeviceMemory(); - - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - size_t srcOffset = srcRect.offset(0, y, z); - size_t dstOffset = dstRect.offset(0, y, z); - - // Copy memory line by line - HsaStatus status = - hsacoreapi->HsaCopyMemory( - (reinterpret_cast
(dst) + dstOffset), - (reinterpret_cast(src) + srcOffset), - size[0]); - - if (status != kHsaStatusSuccess) { - LogPrintfError("DMA buffer failed with code %d", status); - return false; - } - } - } - - return true; -} - -bool -HsaBlitManager::copyImageToBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - // Wait on the last outstanding kernel. - gpu().releaseGpuMemoryFence(); - - oclhsa::Image& srcImage = static_cast(srcMemory); - oclhsa::Buffer& destBuff = static_cast(dstMemory); - - const uint8_t *src = static_cast(srcImage.getDeviceMemory()); - uint8_t* dst = static_cast(destBuff.getDeviceMemory()); - - size_t elementSize = - srcMemory.owner()->asImage()->getImageFormat().getElementSize(); - size_t dstRowPitch = size[0] * elementSize; - size_t dstSlicePitch = size[1] * dstRowPitch; - - size_t srcRowPitch = srcImage.getDeviceRowPitchSize(); - size_t srcSlicePitch = srcImage.getDeviceSlicePitchSize(); - - return importExportImage( - dst, src, dstOrigin, dstRowPitch, dstSlicePitch, srcOrigin, srcRowPitch, - srcSlicePitch, size, elementSize); -} - -bool -HsaBlitManager::copyBufferToImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - // Wait on the last outstanding kernel. - gpu().releaseGpuMemoryFence(); - - oclhsa::Buffer& srcBuff = static_cast(srcMemory); - oclhsa::Image& dstImage = static_cast(dstMemory); - - const uint8_t *src = static_cast(srcBuff.getDeviceMemory()); - uint8_t* dst = static_cast(dstImage.getDeviceMemory()); - - size_t elementSize = - dstMemory.owner()->asImage()->getImageFormat().getElementSize(); - size_t srcRowPitch = size[0] * elementSize; - size_t srcSlicePitch = size[1] * srcRowPitch; - - size_t dstRowPitch = dstImage.getDeviceRowPitchSize(); - size_t dstSlicePitch = dstImage.getDeviceSlicePitchSize(); - - return importExportImage( - dst, src, dstOrigin, dstRowPitch, dstSlicePitch, srcOrigin, srcRowPitch, - srcSlicePitch, size, elementSize); -} - -bool -HsaBlitManager::copyImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire) const -{ - // Wait on the last outstanding kernel. - gpu().releaseGpuMemoryFence(); - - oclhsa::Image& srcImage = static_cast(srcMemory); - oclhsa::Image& destImage = static_cast(dstMemory); - - const uint8_t *src = static_cast(srcImage.getDeviceMemory()); - uint8_t* dst = static_cast(destImage.getDeviceMemory()); - - size_t srcRowPitch = srcImage.getDeviceRowPitchSize(); - size_t srcSlicePitch = srcImage.getDeviceSlicePitchSize(); - - size_t dstRowPitch = destImage.getDeviceRowPitchSize(); - size_t dstSlicePitch = destImage.getDeviceSlicePitchSize(); - - size_t elementSize = - srcMemory.owner()->asImage()->getImageFormat().getElementSize(); - - return importExportImage( - dst, src, dstOrigin, dstRowPitch, dstSlicePitch, srcOrigin, srcRowPitch, - srcSlicePitch, size, elementSize); -} - -bool -HsaBlitManager::fillBuffer( - device::Memory& memory, - const void* pattern, - size_t patternSize, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire - ) const -{ - // Wait on the last outstanding kernel. - gpu().releaseGpuMemoryFence(); - - if (setup_.disableFillBuffer_ || memory.isHostMemDirectAccess()) { - return HostBlitManager::fillBuffer(memory, pattern, patternSize, - origin, size, entire); - } - - void *fillMem = static_cast(memory).getDeviceMemory(); - - size_t offset = origin[0]; - size_t fillSize = size[0]; - - if ((fillSize % patternSize) != 0) { - LogError("Misaligned buffer size and pattern size!"); - } - - // Fill the buffer memory with a pattern - for (size_t i = 0; i < (fillSize / patternSize); i++) { - HsaStatus status = - hsacoreapi->HsaCopyMemory( - (reinterpret_cast
(fillMem) + offset), - (reinterpret_cast(pattern)), - patternSize); - - if (status != kHsaStatusSuccess) { - LogPrintfError("DMA buffer failed with code %d", status); - return false; - } - - offset += patternSize; - } - - return true; -} - -bool -HsaBlitManager::fillImage( - device::Memory& memory, - const void* pattern, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire - ) const -{ - // Wait on the last outstanding kernel. - gpu().releaseGpuMemoryFence(); - - oclhsa::Image& image = static_cast(memory); - - void *fillMem = image.getDeviceMemory(); - - size_t elementSize = - memory.owner()->asImage()->getImageFormat().getElementSize(); - - float fillValue[4]; - memset(fillValue, 0, sizeof(fillValue)); - memory.owner()->asImage()->getImageFormat().formatColor( - pattern, fillValue); - - size_t rowPitchSize = image.getDeviceRowPitchSize(); - size_t slicePitchSize = image.getDeviceSlicePitchSize(); - - size_t offset = origin[0] * elementSize; - - // Adjust offset with Y dimension - offset += rowPitchSize * origin[1]; - - // Adjust offset with Z dimension - offset += slicePitchSize * origin[2]; - - size_t offsetOrg = offset; - - // Fill the image memory with a pattern - for (size_t slice = 0; slice < size[2]; ++slice) { - offset = offsetOrg + slice * slicePitchSize; - - for (size_t rows = 0; rows < size[1]; ++rows) { - size_t pixOffset = offset; - - // Copy memory pixel by pixel - for (size_t column = 0; column < size[0]; ++column) { - HsaStatus status = - hsacoreapi->HsaCopyMemory( - (reinterpret_cast
(fillMem) + pixOffset), - (reinterpret_cast(fillValue)), - elementSize); - - if (status != kHsaStatusSuccess) { - LogPrintfError("DMA buffer failed with code %d", status); - return false; - } - - pixOffset += elementSize; - } - - offset += rowPitchSize; - } - } - - return true; -} - -bool -HsaBlitManager::importExportImage( - uint8_t* dst, - const uint8_t* src, - const amd::Coord3D& dstOffset, - size_t dstRowPitch, - size_t dstSlicePitch, - const amd::Coord3D& srcOffset, - size_t srcRowPitch, - size_t srcSlicePitch, - const amd::Coord3D& sizeToCopy, - size_t elementSize) const -{ - for (size_t zDim = 0; zDim < sizeToCopy[2]; ++zDim) { - for (size_t yDim = 0; yDim < sizeToCopy[1]; ++yDim) { - size_t srcImgOffset = - srcOffset[0] * elementSize + (srcOffset[1] + yDim) * srcRowPitch + - (srcOffset[2] + zDim) * srcSlicePitch; - size_t dstImgOffset = - dstOffset[0] * elementSize + (dstOffset[1] + yDim) * dstRowPitch + - (dstOffset[2] + zDim) * dstSlicePitch; - HsaStatus status = hsacoreapi->HsaCopyMemory( - dst + dstImgOffset, src + srcImgOffset, sizeToCopy[0]*elementSize); - - if (status != kHsaStatusSuccess) { - LogPrintfError("DMA import/export image failed with code %d", status); - return false; - } - } - } - - return true; -} - -static void -CalcRowSlicePitches( - cl_ulong* pitch, const cl_int* copySize, - size_t rowPitch, size_t slicePitch, const Memory& mem) -{ - const oclhsa::Image &hsaImage = static_cast< const oclhsa::Image &>(mem); - bool img1Darray = - (mem.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? true : false; - size_t memFmtSize = mem.owner()->asImage()->getImageFormat().getElementSize(); - - if (rowPitch == 0) { - pitch[0] = copySize[0]; - } - else { - pitch[0] = rowPitch / memFmtSize; - } - if (slicePitch == 0) { - pitch[1] = pitch[0] * (img1Darray ? 1 : copySize[1]); - } - else { - pitch[1] = slicePitch / memFmtSize; - } - assert((pitch[0] <= pitch[1]) && "rowPitch must be <= slicePitch"); - - if (img1Darray) { - // For 1D array rowRitch = slicePitch - pitch[0] = pitch[1]; - } -} - -KernelBlitManager::KernelBlitManager(device::VirtualDevice& vDev, Setup setup) - : HsaBlitManager(vDev, setup), - context_(NULL), - program_(NULL) -{ - for (uint i = 0; i < BlitTotal; ++i) { - kernels_[i] = NULL; - } -} - -KernelBlitManager::~KernelBlitManager() -{ - for (uint i = 0; i < BlitTotal; ++i) { - if (NULL != kernels_[i]) { - kernels_[i]->release(); - } - } - - if (NULL != program_) { - program_->release(); - } - - if (NULL != context_) { - // Release a dummy context - context_->release(); - } -} - -bool -KernelBlitManager::readBuffer( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - if (setup_.disableReadBuffer_ || srcMemory.isHostMemDirectAccess()) { - return HsaBlitManager::readBuffer(srcMemory, dstHost, origin, - size, entire); - } - - amd::Buffer *dstMemory = new (*context_) amd::Buffer( - *context_, CL_MEM_USE_HOST_PTR, size[0]); - - if (!dstMemory->create(const_cast(dstHost))) { - LogError("[OCL] Fail to create mem object for destination"); - return false; - } - - device::Memory *devDstMemory = dstMemory->getDeviceMemory(dev_); - if (devDstMemory== NULL) { - LogError("[OCL] Fail to create device mem object for destination"); - return false; - } - - bool result = copyBuffer( - srcMemory, *devDstMemory, origin, amd::Coord3D(0), size, entire); - - // Wait for the transfer to finish so that we could safely release the - // destination memory object. - // TODO: we could remove this if issue on implicit memory registration is - // fixed by KFD, so that we could pass the pattern as SVM. - gpu().releaseGpuMemoryFence(); - - dstMemory->release(); - - return result; -} - -bool -KernelBlitManager::readBufferRect( - device::Memory& srcMemory, - void* dstHost, - const amd::BufferRect& bufRect, - const amd::BufferRect& hostRect, - const amd::Coord3D& size, - bool entire) const -{ - if (setup_.disableReadBufferRect_ || srcMemory.isHostMemDirectAccess()) { - return HsaBlitManager::readBufferRect( - srcMemory, dstHost, bufRect, hostRect, size, entire); - } - - size_t dstSize = hostRect.start_ + hostRect.end_; - amd::Buffer *dstMemory = - new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, dstSize); - - if (!dstMemory->create(const_cast(dstHost))) { - LogError("[OCL] Fail to create mem object for destination"); - return false; - } - - device::Memory *devDstMemory = dstMemory->getDeviceMemory(dev_); - if (devDstMemory== NULL) { - LogError("[OCL] Fail to create device mem object for destination"); - return false; - } - - bool result = copyBufferRect( - srcMemory, *devDstMemory, bufRect, hostRect, size, entire); - - // Wait for the transfer to finish so that we could safely release the - // destination memory object. - // TODO: we could remove this if issue on implicit memory registration is - // fixed by KFD, so that we could pass the pattern as SVM. - gpu().releaseGpuMemoryFence(); - - dstMemory->release(); - - return result; -} - -void -FindLinearSize( - size_t& linearSize, const amd::Coord3D& size, - size_t& rowPitch, size_t& slicePitch, const device::Memory& mem) -{ - const oclhsa::Image &image = static_cast(mem); - size_t elementSize = mem.owner()->asImage()->getImageFormat().getElementSize(); - - linearSize = size[0] * elementSize; - if ((rowPitch == 0) || (rowPitch == linearSize)) { - rowPitch = 0; - } - else { - linearSize = rowPitch; - } - - // Calculate the pin size, which should be equal to the copy size - for (uint i = 1; i < mem.owner()->asImage()->getDims(); ++i) { - linearSize *= size[i]; - if (i == 1) { - if ((slicePitch == 0) || (slicePitch == linearSize)) { - slicePitch = 0; - } - else { - if (mem.owner()->getType() != CL_MEM_OBJECT_IMAGE1D_ARRAY) { - linearSize = slicePitch; - } - else { - linearSize = slicePitch * size[i]; - } - } - } - } -} - -// The following data structures will be used for the view creations. -// Some formats has to be converted before a kernel blit operation -struct FormatConvertion { - cl_uint clOldType_; - cl_uint clNewType_; -}; - -// The list of rejected data formats and corresponding conversion -static const FormatConvertion RejectedData[] = -{ - { CL_UNORM_INT8, CL_UNSIGNED_INT8 }, - { CL_UNORM_INT16, CL_UNSIGNED_INT16 }, - { CL_SNORM_INT8, CL_UNSIGNED_INT8 }, - { CL_SNORM_INT16, CL_UNSIGNED_INT16 }, - { CL_HALF_FLOAT, CL_UNSIGNED_INT16 }, - { CL_FLOAT, CL_UNSIGNED_INT32 }, - { CL_SIGNED_INT8, CL_UNSIGNED_INT8 }, - { CL_SIGNED_INT16, CL_UNSIGNED_INT16 }, - { CL_SIGNED_INT32, CL_UNSIGNED_INT32 } -}; - -// The list of rejected channel's order and corresponding conversion -static const FormatConvertion RejectedOrder[] = -{ - { CL_A, CL_R }, - { CL_RA, CL_RG }, - { CL_LUMINANCE, CL_R }, - { CL_INTENSITY, CL_R }, - { CL_BGRA, CL_RGBA }, - { CL_ARGB, CL_RGBA } -}; - -const uint RejectedFormatDataTotal = - sizeof(RejectedData) / sizeof(FormatConvertion); -const uint RejectedFormatChannelTotal = - sizeof(RejectedOrder) / sizeof(FormatConvertion); - -amd::Image::Format -KernelBlitManager::filterFormat(amd::Image::Format oldFormat) const -{ - cl_image_format newFormat; - newFormat.image_channel_data_type = oldFormat.image_channel_data_type; - newFormat.image_channel_order = oldFormat.image_channel_order; - - // Find unsupported formats - for (uint i = 0; i < RejectedFormatDataTotal; ++i) { - if (RejectedData[i].clOldType_ == oldFormat.image_channel_data_type) { - newFormat.image_channel_data_type = RejectedData[i].clNewType_; - break; - } - } - - // Find unsupported channel's order - for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { - if (RejectedOrder[i].clOldType_ == oldFormat.image_channel_order) { - newFormat.image_channel_order = RejectedOrder[i].clNewType_; - break; - } - } - - return amd::Image::Format(newFormat); -} - -device::Memory * -KernelBlitManager::createImageView( - device::Memory &parent, - amd::Image::Format newFormat) const -{ - amd::Image *image = - parent.owner()->asImage()->createView(parent.owner()->getContext(), newFormat, &gpu()); - - if (image == NULL) { - LogError("[OCL] Fail to allocate view of image object"); - return NULL; - } - - Image* devImage = new oclhsa::Image(static_cast(dev_), *image); - if (devImage == NULL) { - LogError("[OCL] Fail to allocate device mem object for the view"); - image->release(); - return NULL; - } - - if (!devImage->createView(static_cast(parent))) { - LogError("[OCL] Fail to create device mem object for the view"); - delete devImage; - image->release(); - return NULL; - } - - image->replaceDeviceMemory(&dev_, devImage); - - return devImage; -} - -bool -KernelBlitManager::readImage( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - if (setup_.disableReadImage_ || srcMemory.isHostMemDirectAccess()) { - return HsaBlitManager::readImage( - srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire); - } - - size_t linearSize = 0; - FindLinearSize(linearSize, size, rowPitch, slicePitch, srcMemory); - amd::Buffer *dstMemory = - new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, linearSize); - - if (!dstMemory->create(const_cast(dstHost))) { - LogError("[OCL] Fail to create mem object for destination"); - return false; - } - - device::Memory *devDstMemory = dstMemory->getDeviceMemory(dev_); - if (devDstMemory== NULL) { - LogError("[OCL] Fail to create device mem object for destination"); - return false; - } - - bool result = copyImageToBuffer( - srcMemory, *devDstMemory, origin, amd::Coord3D(0), size, entire, rowPitch, - slicePitch); - - // Wait for the transfer to finish so that we could safely release the - // destination memory object. - // TODO: we could remove this if issue on implicit memory registration is - // fixed by KFD, so that we could pass the pattern as SVM. - gpu().releaseGpuMemoryFence(); - - dstMemory->release(); - - return result; -} - -bool -KernelBlitManager::writeBuffer( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess()) { - return HsaBlitManager::writeBuffer(srcHost, dstMemory, origin, size, - entire); - } - - amd::Buffer *srcMemory = - new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, size[0]); - - if (!srcMemory->create(const_cast(srcHost))) { - LogError("[OCL] Fail to create mem object for destination"); - return false; - } - - device::Memory *devSrcMemory = srcMemory->getDeviceMemory(dev_); - if (devSrcMemory== NULL) { - LogError("[OCL] Fail to create device mem object for destination"); - return false; - } - - bool result = - copyBuffer(*devSrcMemory, dstMemory, amd::Coord3D(0), origin, size, entire); - - // Wait for the transfer to finish so that we could safely release the - // source memory object. - // TODO: we could remove this if issue on implicit memory registration is - // fixed by KFD, so that we could pass the pattern as SVM. - gpu().releaseGpuMemoryFence(); - - srcMemory->release(); - - return result; -} - -bool -KernelBlitManager::writeBufferRect( - const void* srcHost, - device::Memory& dstMemory, - const amd::BufferRect& hostRect, - const amd::BufferRect& bufRect, - const amd::Coord3D& size, - bool entire) const -{ - if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess()) { - return HsaBlitManager::writeBufferRect( - srcHost, dstMemory, hostRect, bufRect, size, entire); - } - - size_t srcSize = hostRect.start_ + hostRect.end_; - amd::Buffer *srcMemory = - new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, srcSize); - - if (!srcMemory->create(const_cast(srcHost))) { - LogError("[OCL] Fail to create mem object for destination"); - return false; - } - - device::Memory *devSrcMemory = srcMemory->getDeviceMemory(dev_); - if (devSrcMemory== NULL) { - LogError("[OCL] Fail to create device mem object for destination"); - return false; - } - - bool result = copyBufferRect( - *devSrcMemory, dstMemory, hostRect, bufRect, size, entire); - - // Wait for the transfer to finish so that we could safely release the - // destination memory object. - // TODO: we could remove this if issue on implicit memory registration is - // fixed by KFD, so that we could pass the pattern as SVM. - gpu().releaseGpuMemoryFence(); - - srcMemory->release(); - - return result; -} - -bool -KernelBlitManager::writeImage( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - if (setup_.disableWriteImage_ || dstMemory.isHostMemDirectAccess()) { - return HsaBlitManager::writeImage( - srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); - } - - size_t linearSize = 0; - FindLinearSize(linearSize, size, rowPitch, slicePitch, dstMemory); - amd::Buffer *srcMemory = - new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, linearSize); - - if (!srcMemory->create(const_cast(srcHost))) { - LogError("[OCL] Fail to create mem object for destination"); - return false; - } - - device::Memory *devSrcMemory = srcMemory->getDeviceMemory(dev_); - if (devSrcMemory== NULL) { - LogError("[OCL] Fail to create device mem object for destination"); - return false; - } - - bool result = copyBufferToImage( - *devSrcMemory, dstMemory, amd::Coord3D(0), origin, size, entire, - rowPitch, slicePitch); - - // Wait for the transfer to finish so that we could safely release the - // destination memory object. - // TODO: we could remove this if issue on implicit memory registration is - // fixed by KFD, so that we could pass the pattern as SVM. - gpu().releaseGpuMemoryFence(); - - srcMemory->release(); - - return result; -} - -bool -KernelBlitManager::copyBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& sizeIn, - bool entire) const -{ - if (setup_.disableCopyBuffer_ || - srcMemory.isHostMemDirectAccess() || - dstMemory.isHostMemDirectAccess()) { - return HsaBlitManager::copyBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire); - } - - uint blitType = BlitCopyBuffer; - size_t dim = 1; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize = 0; - size_t localWorkSize = 0; - - const static uint CopyBuffAlignment[3] = { 16, 4, 1 }; - amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); - - bool aligned; - uint i; - for (i = 0; i < 3; ++i) { - // Check source alignments - aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0); - // Check destination alignments - aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0); - // Check copy size alignment in the first dimension - aligned &= ((sizeIn[0] % CopyBuffAlignment[i]) == 0); - - if (aligned) { - if (CopyBuffAlignment[i] != 1) { - blitType = BlitCopyBufferAligned; - } - break; - } - } - - cl_uint remain; - if (blitType == BlitCopyBufferAligned) { - size.c[0] /= CopyBuffAlignment[i]; - } - else { - remain = size[0] % 4; - size.c[0] /= 4; - size.c[0] += 1; - } - - // Program the dispatch dimensions - localWorkSize = 256; - globalWorkSize = amd::alignUp(size[0] , 256); - - // Program kernels arguments for the blit operation - cl_mem clmem = ((cl_mem) as_cl(srcMemory.owner())); - kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem); - clmem = ((cl_mem) as_cl(dstMemory.owner())); - kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem); - // Program source origin - cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i]; - kernels_[blitType]->parameters().set(2, sizeof(srcOffset), &srcOffset); - - // Program destinaiton origin - cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i]; - kernels_[blitType]->parameters().set(3, sizeof(dstOffset), &dstOffset); - - cl_ulong copySize = size[0]; - kernels_[blitType]->parameters().set(4, sizeof(copySize), ©Size); - - if (blitType == BlitCopyBufferAligned) { - cl_int alignment = CopyBuffAlignment[i]; - kernels_[blitType]->parameters().set(5, sizeof(alignment), &alignment); - } - else { - kernels_[blitType]->parameters().set(5, sizeof(remain), &remain); - } - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange( - 1, globalWorkOffset, &globalWorkSize, &localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().capture(dev_); - bool result = gpu().submitKernelInternal( - ndrange, *kernels_[blitType], parameters, NULL); - kernels_[blitType]->parameters().release(const_cast
(parameters), dev_); - return result; -} - -bool -KernelBlitManager::copyBufferRect( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::BufferRect& srcRectIn, - const amd::BufferRect& dstRectIn, - const amd::Coord3D& sizeIn, - bool entire) const -{ - if (setup_.disableCopyBuffer_ || - (srcMemory.isHostMemDirectAccess() && dstMemory.isHostMemDirectAccess())) { - return HsaBlitManager::copyBufferRect( - srcMemory, dstMemory, srcRectIn, dstRectIn, sizeIn, entire); - } - - uint blitType = BlitCopyBufferRect; - size_t dim = 3; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - const static uint CopyRectAlignment[3] = { 16, 4, 1 }; - - bool aligned; - uint i; - for (i = 0; i < sizeof(CopyRectAlignment) / sizeof(uint); i++) { - // Check source alignments - aligned = ((srcRectIn.rowPitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((srcRectIn.slicePitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((srcRectIn.start_ % CopyRectAlignment[i]) == 0); - - // Check destination alignments - aligned &= ((dstRectIn.rowPitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((dstRectIn.slicePitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((dstRectIn.start_ % CopyRectAlignment[i]) == 0); - - // Check copy size alignment in the first dimension - aligned &= ((sizeIn[0] % CopyRectAlignment[i]) == 0); - - if (aligned) { - if (CopyRectAlignment[i] != 1) { - blitType = BlitCopyBufferRectAligned; - } - break; - } - } - - amd::BufferRect srcRect; - amd::BufferRect dstRect; - amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); - - srcRect.rowPitch_ = srcRectIn.rowPitch_ / CopyRectAlignment[i]; - srcRect.slicePitch_ = srcRectIn.slicePitch_ / CopyRectAlignment[i]; - srcRect.start_ = srcRectIn.start_ / CopyRectAlignment[i]; - srcRect.end_ = srcRectIn.end_ / CopyRectAlignment[i]; - - dstRect.rowPitch_ = dstRectIn.rowPitch_ / CopyRectAlignment[i]; - dstRect.slicePitch_ = dstRectIn.slicePitch_ / CopyRectAlignment[i]; - dstRect.start_ = dstRectIn.start_ / CopyRectAlignment[i]; - dstRect.end_ = dstRectIn.end_ / CopyRectAlignment[i]; - - size.c[0] /= CopyRectAlignment[i]; - - // Program the kernel's workload depending on the transfer dimensions - if ((size[1] == 1) && (size[2] == 1)) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = 1; - globalWorkSize[2] = 1; - localWorkSize[0] = 256; - localWorkSize[1] = 1; - localWorkSize[2] = 1; - } - else if (size[2] == 1) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = 1; - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - - // Program kernels arguments for the blit operation - cl_mem clmem = ((cl_mem) as_cl(srcMemory.owner())); - kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem); - clmem = ((cl_mem) as_cl(dstMemory.owner())); - kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem); - cl_ulong src[4] = { srcRect.rowPitch_, - srcRect.slicePitch_, - srcRect.start_, 0 }; - kernels_[blitType]->parameters().set(2, sizeof(src), src); - cl_ulong dst[4] = { dstRect.rowPitch_, - dstRect.slicePitch_, - dstRect.start_, 0 }; - kernels_[blitType]->parameters().set(3, sizeof(dst), dst); - cl_ulong copySize[4] = { size[0], - size[1], - size[2], - CopyRectAlignment[i] }; - kernels_[blitType]->parameters().set(4, sizeof(copySize), copySize); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().capture(dev_); - bool result = gpu().submitKernelInternal( - ndrange, *kernels_[blitType], parameters, NULL); - kernels_[blitType]->parameters().release(const_cast
(parameters), dev_); - return result; -} - -bool -KernelBlitManager::copyImageToBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - if (srcMemory.isHostMemDirectAccess() && dstMemory.isHostMemDirectAccess()) { - return HsaBlitManager::copyImageToBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, - rowPitch, slicePitch); - } - - amd::Image::Format oldFormat = srcMemory.owner()->asImage()->getImageFormat(); - amd::Image::Format newFormat = filterFormat(oldFormat); - bool useView = false; - - device::Memory *srcView = &srcMemory; - if (oldFormat != newFormat) { - srcView = createImageView(srcMemory, newFormat); - useView = true; - } - - oclhsa::Image &srcImage = static_cast(*srcView); - - amd::Image * image = srcImage.owner()->asImage(); - uint blitType = 0; - blitType = BlitCopyImageToBuffer; - size_t dim = 0; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - // Program the kernels workload depending on the blit dimensions - const size_t imageDims = srcImage.owner()->asImage()->getDims(); - dim = 3; - // Find the current blit type - if (imageDims == 1) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; - } - else if (imageDims == 2) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - // Program kernels arguments for the blit operation - cl_mem clmem = ((cl_mem) as_cl(srcImage.owner())); - kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem); - clmem = ((cl_mem) as_cl(dstMemory.owner())); - kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem); - - // Update extra paramters for USHORT and UBYTE pointers. - // Only then compiler can optimize the kernel to use - // UAV Raw for other writes - kernels_[blitType]->parameters().set(2, sizeof(cl_mem), &clmem); - kernels_[blitType]->parameters().set(3, sizeof(cl_mem), &clmem); - - cl_int srcOrg[4] = { (cl_int)srcOrigin[0], - (cl_int)srcOrigin[1], - (cl_int)srcOrigin[2], 0 }; - cl_int copySize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], 0 }; - - kernels_[blitType]->parameters().set(4, sizeof(srcOrg), srcOrg); - - const size_t elementSize = - srcImage.owner()->asImage()->getImageFormat().getElementSize(); - const size_t numChannels = - srcImage.owner()->asImage()->getImageFormat().getNumChannels(); - - // 1 element granularity for writes by default - cl_int granularity = 1; - if (elementSize == 2) { - granularity = 2; - } - else if (elementSize >= 4) { - granularity = 4; - } - CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); - cl_ulong dstOrg[4] = { dstOrigin[0] / granularity, - dstOrigin[1], - dstOrigin[2], - 0 }; - kernels_[blitType]->parameters().set(5, sizeof(dstOrg), dstOrg); - kernels_[blitType]->parameters().set(6, sizeof(copySize), copySize); - - // Program memory format - uint multiplier = elementSize / sizeof(uint32_t); - multiplier = (multiplier == 0) ? 1 : multiplier; - cl_uint format[4] = { (cl_uint)numChannels, - (cl_uint)(elementSize / numChannels), - multiplier, 0 }; - kernels_[blitType]->parameters().set(7, sizeof(format), format); - - // Program row and slice pitches - cl_ulong pitch[4] = { 0 }; - CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, srcImage); - kernels_[blitType]->parameters().set(8, sizeof(pitch), pitch); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().capture(dev_); - bool result = gpu().submitKernelInternal( - ndrange, *kernels_[blitType], parameters, NULL); - kernels_[blitType]->parameters().release(const_cast
(parameters), dev_); - - if (useView) { - srcView->owner()->release(); - } - - return result; -} - -bool -KernelBlitManager::copyBufferToImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - if (srcMemory.isHostMemDirectAccess() && dstMemory.isHostMemDirectAccess()) { - return HsaBlitManager::copyBufferToImage( - srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire, - rowPitch, slicePitch); - } - - amd::Image::Format oldFormat = dstMemory.owner()->asImage()->getImageFormat(); - amd::Image::Format newFormat = filterFormat(oldFormat); - bool useView = false; - - device::Memory *dstView = &dstMemory; - if (oldFormat != newFormat) { - dstView = createImageView(dstMemory, newFormat); - useView = true; - } - - oclhsa::Image &dstImage = static_cast(*dstView); - - // Use a common blit type with three dimensions by default - uint blitType = BlitCopyBufferToImage; - size_t dim = 0; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - // Program the kernels workload depending on the blit dimensions - const size_t imageDims = dstImage.owner()->asImage()->getDims(); - dim = 3; - if (imageDims == 1) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; - } - else if (imageDims == 2) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - // Program kernels arguments for the blit operation - cl_mem clmem = ((cl_mem) as_cl(srcMemory.owner())); - kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem); - clmem = ((cl_mem) as_cl(dstImage.owner())); - kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem); - - const size_t elementSize = - dstImage.owner()->asImage()->getImageFormat().getElementSize(); - const size_t numChannels = - dstImage.owner()->asImage()->getImageFormat().getNumChannels(); - - // 1 element granularity for writes by default - cl_int granularity = 1; - if (elementSize == 2) { - granularity = 2; - } - else if (elementSize >= 4) { - granularity = 4; - } - CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); - cl_ulong srcOrg[4] = { srcOrigin[0] / granularity, - srcOrigin[1], - srcOrigin[2], 0 }; - kernels_[blitType]->parameters().set(2, sizeof(srcOrg), srcOrg); - - cl_int dstOrg[4] = { (cl_int)dstOrigin[0], - (cl_int)dstOrigin[1], - (cl_int)dstOrigin[2], 0 }; - cl_int copySize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], 0 }; - - kernels_[blitType]->parameters().set(3, sizeof(dstOrg), dstOrg); - kernels_[blitType]->parameters().set(4, sizeof(copySize), copySize); - - // Program memory format - uint multiplier = elementSize / sizeof(uint32_t); - multiplier = (multiplier == 0) ? 1 : multiplier; - cl_uint format[4] = { (cl_uint)numChannels, - (cl_uint)(elementSize / numChannels), - multiplier, 0 }; - kernels_[blitType]->parameters().set(5, sizeof(format), format); - - // Program row and slice pitches - cl_ulong pitch[4] = { 0 }; - CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, dstImage); - kernels_[blitType]->parameters().set(6, sizeof(pitch), pitch); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().capture(dev_); - bool result = gpu().submitKernelInternal( - ndrange, *kernels_[blitType], parameters, NULL); - kernels_[blitType]->parameters().release(const_cast
(parameters), dev_); - - if (useView) { - dstView->owner()->release(); - } - - return result; -} - -bool -KernelBlitManager::copyImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire) const -{ - if (srcMemory.isHostMemDirectAccess() && - dstMemory.isHostMemDirectAccess()) { - return HsaBlitManager::copyImage( - srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); - } - - amd::Image::Format srcOldFormat = srcMemory.owner()->asImage()->getImageFormat(); - amd::Image::Format srcNewFormat = filterFormat(srcOldFormat); - bool useSrcView = false; - - device::Memory *srcView = &srcMemory; - if (srcOldFormat != srcNewFormat) { - srcView = createImageView(srcMemory, srcNewFormat); - useSrcView = true; - } - - oclhsa::Image &srcImage = static_cast(*srcView); - - amd::Image::Format dstOldFormat = srcMemory.owner()->asImage()->getImageFormat(); - amd::Image::Format dstNewFormat = filterFormat(dstOldFormat); - bool useDstView = false; - - device::Memory *dstView = &dstMemory; - if (dstOldFormat != dstNewFormat) { - dstView = createImageView(dstMemory, dstNewFormat); - useDstView = true; - } - - oclhsa::Image &dstImage = static_cast(*dstView); - - uint blitType = BlitCopyImage; - size_t dim = 0; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - // Program the kernels workload depending on the blit dimensions - dim = 3; - // Find the current blit type - const size_t srcDimSize = srcImage.owner()->asImage()->getDims(); - const size_t dstDimSize = dstImage.owner()->asImage()->getDims(); - if ((srcDimSize == 1) || - (dstDimSize == 1)) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; - } - else if ((srcDimSize == 2) || - (dstDimSize == 2)) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - // The current OpenCL spec allows "copy images from a 1D image - // array object to a 1D image array object" only. - if ((srcImage.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) || - (dstImage.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY)) { - blitType = BlitCopyImage1DA; - } - - // Program kernels arguments for the blit operation - cl_mem clmem = ((cl_mem) as_cl(srcImage.owner())); - kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem); - clmem = ((cl_mem) as_cl(dstImage.owner())); - kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem); - - // Program source origin - cl_int srcOrg[4] = { (cl_int)srcOrigin[0], - (cl_int)srcOrigin[1], - (cl_int)srcOrigin[2], 0 }; - - kernels_[blitType]->parameters().set(2, sizeof(srcOrg), srcOrg); - - // Program destination origin - cl_int dstOrg[4] = { (cl_int)dstOrigin[0], - (cl_int)dstOrigin[1], - (cl_int)dstOrigin[2], 0 }; - kernels_[blitType]->parameters().set(3, sizeof(dstOrg), dstOrg); - - cl_int copySize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], 0 }; - kernels_[blitType]->parameters().set(4, sizeof(copySize), copySize); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange( - dim, globalWorkOffset, globalWorkSize, localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().capture(dev_); - bool result = gpu().submitKernelInternal( - ndrange, *kernels_[blitType], parameters, NULL); - kernels_[blitType]->parameters().release(const_cast
(parameters), dev_); - - if (useSrcView) { - srcView->owner()->release(); - } - - if (useDstView) { - dstView->owner()->release(); - } - - return result; -} - -bool -KernelBlitManager::fillBuffer( - device::Memory& memory, - const void* pattern, - size_t patternSize, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire - ) const -{ - if (setup_.disableFillBuffer_ || memory.isHostMemDirectAccess()) { - return HsaBlitManager::fillBuffer( - memory, pattern, patternSize, origin, size, entire); - } - - uint fillType = FillBuffer; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - cl_ulong fillSize = size[0] / patternSize; - size_t globalWorkSize = amd::alignUp(fillSize, 256); - size_t localWorkSize = 256; - bool dwordAligned = - ((patternSize % sizeof(uint32_t)) == 0) ? true : false; - - // Program kernels arguments for the fill operation - if (dwordAligned) { - kernels_[fillType]->parameters().set(0, sizeof(cl_mem), NULL); - cl_mem clmem = ((cl_mem) as_cl(memory.owner())); - kernels_[fillType]->parameters().set(1, sizeof(cl_mem), &clmem); - } - else { - cl_mem clmem = ((cl_mem) as_cl(memory.owner())); - kernels_[fillType]->parameters().set(0, sizeof(cl_mem), &clmem); - kernels_[fillType]->parameters().set(1, sizeof(cl_mem), NULL); - } - - amd::Buffer *fillMemory = - new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, patternSize); - - if (!fillMemory->create(const_cast(pattern))) { - LogError("[OCL] Fail to create mem object for destination"); - return false; - } - - if (fillMemory->getDeviceMemory(dev_) == NULL) { - LogError("[OCL] Fail to create device mem object for destination"); - return false; - } - - cl_mem clmem = ((cl_mem) as_cl(fillMemory)); - kernels_[fillType]->parameters().set(2, sizeof(cl_mem), &clmem); - cl_ulong offset = origin[0]; - if (dwordAligned) { - patternSize /= sizeof(uint32_t); - offset /= sizeof(uint32_t); - } - kernels_[fillType]->parameters().set(3, sizeof(cl_uint), &patternSize); - kernels_[fillType]->parameters().set(4, sizeof(offset), &offset); - kernels_[fillType]->parameters().set(5, sizeof(fillSize), &fillSize); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(1, - globalWorkOffset, &globalWorkSize, &localWorkSize); - - // Execute the blit - address parameters = kernels_[fillType]->parameters().capture(dev_); - bool result = gpu().submitKernelInternal( - ndrange, *kernels_[fillType], parameters, NULL); - kernels_[fillType]->parameters().release(const_cast
(parameters), dev_); - - // Wait for the transfer to finish so that we could safely release the - // fill memory object. - // TODO: we could remove this if issue on implicit memory registration is - // fixed by KFD, so that we could pass the pattern as SVM. - gpu().releaseGpuMemoryFence(); - - fillMemory->release(); - - return result; -} - -bool -KernelBlitManager::fillImage( - device::Memory& memory, - const void* pattern, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire - ) const -{ - if (memory.isHostMemDirectAccess()) { - return HsaBlitManager::fillImage(memory, pattern, origin, size, entire); - } - - amd::Image *image = memory.owner()->asImage(); - - uint fillType; - size_t dim = 0; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - // Program the kernels workload depending on the fill dimensions - fillType = FillImage; - dim = 3; - // Find the current blit type - const size_t dimSize = image->getDims(); - if (dimSize == 1) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; - } - else if (dimSize == 2) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - // Program kernels arguments for the blit operation - cl_mem clmem = ((cl_mem) as_cl(memory.owner())); - kernels_[fillType]->parameters().set(0, sizeof(cl_mem), &clmem); - kernels_[fillType]->parameters().set(1, sizeof(cl_float4), pattern); - kernels_[fillType]->parameters().set(2, sizeof(cl_int4), pattern); - kernels_[fillType]->parameters().set(3, sizeof(cl_uint4), pattern); - - cl_int fillOrigin[4] = { (cl_int)origin[0], - (cl_int)origin[1], - (cl_int)origin[2], 0 }; - cl_int fillSize[4] = { (cl_int)size[0], - (cl_int)size[1], - (cl_int)size[2], 0 }; - kernels_[fillType]->parameters().set(4, sizeof(fillOrigin), fillOrigin); - kernels_[fillType]->parameters().set(5, sizeof(fillSize), fillSize); - - // Find the type of image - uint32_t type = 0; - amd::Image::Format format(image->getImageFormat()); - switch (format.image_channel_data_type) { - case CL_SNORM_INT8: - case CL_SNORM_INT16: - case CL_UNORM_INT8: - case CL_UNORM_INT16: - case CL_UNORM_SHORT_565: - case CL_UNORM_SHORT_555: - case CL_UNORM_INT_101010: - case CL_HALF_FLOAT: - case CL_FLOAT: - type = 0; - break; - case CL_SIGNED_INT8: - case CL_SIGNED_INT16: - case CL_SIGNED_INT32: - type = 1; - break; - case CL_UNSIGNED_INT8: - case CL_UNSIGNED_INT16: - case CL_UNSIGNED_INT32: - type = 2; - break; - } - kernels_[fillType]->parameters().set(6, sizeof(type), &type); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); - - // Execute the blit - address parameters = kernels_[fillType]->parameters().capture(dev_); - bool result = gpu().submitKernelInternal( - ndrange, *kernels_[fillType], parameters, NULL); - kernels_[fillType]->parameters().release(const_cast
(parameters), dev_); - - return result; -} - -bool -KernelBlitManager::create(amd::Device& device) -{ - if (!createProgram(static_cast(device))) { - return false; - } - - return true; -} - -bool -KernelBlitManager::createProgram(Device& device) -{ - // Save context and program for this device - context_ = device.blitProgram()->context_; - context_->retain(); - program_ = device.blitProgram()->program_; - program_->retain(); - - bool result = false; - do { - // Create kernel objects for all blits - for (uint i = 0; i < BlitTotal; ++i) { - const amd::Symbol* symbol = program_->findSymbol(BlitName[i]); - if (symbol == NULL) { - break; - } - kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]); - if (kernels_[i] == NULL) { - break; - } - } - - result = true; - } while(!result); - - return result; -} - -} // namespace oclhsa diff --git a/rocclr/runtime/device/hsa/hsablit.hpp b/rocclr/runtime/device/hsa/hsablit.hpp deleted file mode 100644 index b24a61a8cd..0000000000 --- a/rocclr/runtime/device/hsa/hsablit.hpp +++ /dev/null @@ -1,401 +0,0 @@ -// -// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved. -// - -#ifndef HSABLIT_HPP_ -#define HSABLIT_HPP_ - -#include "top.hpp" -#include "platform/command.hpp" -#include "platform/commandqueue.hpp" -#include "device/device.hpp" -#include "device/blit.hpp" - -/*! \addtogroup HSA Blit Implementation - * @{ - */ - -//! HSA Blit Manager Implementation -namespace oclhsa { - -class Device; -class Kernel; -class Memory; -class VirtualGPU; - -//! DMA Blit Manager -class HsaBlitManager : public device::HostBlitManager -{ -public: - //! Constructor - HsaBlitManager( - device::VirtualDevice& vdev, //!< Virtual GPU to be used for blits - Setup setup = Setup() //!< Specifies HW accelerated blits - ); - - //! Destructor - virtual ~HsaBlitManager() { } - - //! Creates HostBlitManager object - virtual bool create(amd::Device& device) { return true; } - - //! Copies a buffer object to system memory - virtual bool readBuffer( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies a buffer object to system memory - virtual bool readBufferRect( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destinaiton host memory - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies an image object to system memory - virtual bool readImage( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies system memory to a buffer object - virtual bool writeBuffer( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies system memory to a buffer object - virtual bool writeBufferRect( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies system memory to an image object - virtual bool writeImage( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies a buffer object to another buffer object - virtual bool copyBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies a buffer object to another buffer object - virtual bool copyBufferRect( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& srcRect, //!< Source rectangle - const amd::BufferRect& dstRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies an image object to a buffer object - virtual bool copyImageToBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; - - //! Copies a buffer object to an image object - virtual bool copyBufferToImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; - - //! Copies an image object to another image object - virtual bool copyImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Fills a buffer memory with a pattern data - virtual bool fillBuffer( - device::Memory& memory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - size_t patternSize, //!< Pattern size - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Fills an image memory with a pattern data - virtual bool fillImage( - device::Memory& dstMemory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - -protected: - //! Returns the virtual GPU object - VirtualGPU& gpu() const { return static_cast(vDev_); } - -private: - //! Disable copy constructor - HsaBlitManager(const HsaBlitManager&); - - //! Disable operator= - HsaBlitManager& operator=(const HsaBlitManager&); - - bool importExportImage( - uint8_t* dst, - const uint8_t* src, - const amd::Coord3D& dstOffset, - size_t dstRowPitch, - size_t dstSlicePitch, - const amd::Coord3D& srcOffset, - size_t srcRowPitch, - size_t srcSlicePitch, - const amd::Coord3D& sizeToCopy, - size_t elementSize) const; -}; - -//! Kernel Blit Manager -class KernelBlitManager : public HsaBlitManager -{ -public: - enum { - BlitCopyImage = 0, - BlitCopyImage1DA, - BlitCopyImageToBuffer, - BlitCopyBufferToImage, - BlitCopyBufferRect, - BlitCopyBufferRectAligned, - BlitCopyBuffer, - BlitCopyBufferAligned, - FillBuffer, - FillImage, - BlitTotal - }; - - //! Constructor - KernelBlitManager( - device::VirtualDevice& vdev, //!< Virtual GPU to be used for blits - Setup setup = Setup() //!< Specifies HW accelerated blits - ); - - //! Destructor - virtual ~KernelBlitManager(); - - //! Creates HostBlitManager object - virtual bool create(amd::Device& device); - - //! Copies a buffer object to system memory - virtual bool readBuffer( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies a buffer object to system memory - virtual bool readBufferRect( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destinaiton host memory - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies an image object to system memory - virtual bool readImage( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies system memory to a buffer object - virtual bool writeBuffer( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies system memory to a buffer object - virtual bool writeBufferRect( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies system memory to an image object - virtual bool writeImage( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies a buffer object to another buffer object - virtual bool copyBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies a buffer object to another buffer object - virtual bool copyBufferRect( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& srcRect, //!< Source rectangle - const amd::BufferRect& dstRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies an image object to a buffer object - virtual bool copyImageToBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; - - //! Copies a buffer object to an image object - virtual bool copyBufferToImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; - - //! Copies an image object to another image object - virtual bool copyImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Fills a buffer memory with a pattern data - virtual bool fillBuffer( - device::Memory& memory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - size_t patternSize, //!< Pattern size - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Fills an image memory with a pattern data - virtual bool fillImage( - device::Memory& dstMemory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - -private: - //! Disable copy constructor - KernelBlitManager(const KernelBlitManager&); - - //! Disable operator= - KernelBlitManager& operator=(const KernelBlitManager&); - - //! Creates a program for all blit operations - bool createProgram( - Device& device //!< Device object - ); - - amd::Image::Format filterFormat(amd::Image::Format oldFormat) const; - - device::Memory *createImageView( - device::Memory &parent, - amd::Image::Format newFormat) const; - - amd::Context *context_; //!< A dummy context - amd::Program *program_; //!< GPU program obejct - amd::Kernel *kernels_[BlitTotal]; //!< GPU kernels for blit -}; - -static const char* BlitName[KernelBlitManager::BlitTotal] = { - "copyImage", - "copyImage1DA", - "copyImageToBuffer", - "copyBufferToImage", - "copyBufferRect", - "copyBufferRectAligned", - "copyBuffer", - "copyBufferAligned", - "fillBuffer", - "fillImage" - }; - -/*@}*/ -} // namespace oclhsa - -#endif /*HSABLIT_HPP_*/ diff --git a/rocclr/runtime/device/hsa/hsacompiler.cpp b/rocclr/runtime/device/hsa/hsacompiler.cpp deleted file mode 100644 index 1c1fb5987b..0000000000 --- a/rocclr/runtime/device/hsa/hsacompiler.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// -// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. -// -#ifndef WITHOUT_FSA_BACKEND - -#include -#include -#include -#include - -#include "os/os.hpp" -#include "hsadevice.hpp" -#include "hsaprogram.hpp" -#include "hsacompilerlib.hpp" -//#include "gpukernel.hpp" -//#include "compiler/compiler.hpp" -#include "utils/options.hpp" -#include - -//CLC_IN_PROCESS_CHANGE -extern int openclFrontEnd(const char* cmdline, std::string*, std::string* typeInfo = NULL); - - -namespace oclhsa { - - -/* Temporary log function for the compiler library */ -static void logFunction(const char* msg, size_t size) -{ - std::cout<< "Compiler Log: " << msg << std::endl; -} - -static int programsCount = 0; - - -bool -FSAILProgram::compileImpl(const std::string& sourceCode, - const std::vector& headers, - const char** headerIncludeNames, - amd::option::Options* options) -{ - - acl_error errorCode; - aclTargetInfo target; - target = g_complibApi._aclGetTargetInfo(LP64_SWITCH("hsail","hsail-64"), - dev().deviceInfo().targetName_, - &errorCode); - - //end if asic info is ready - // We dump the source code for each program (param: headers) - // into their filenames (headerIncludeNames) into the TEMP - // folder specific to the OS and add the include path while - // compiling - - //Find the temp folder for the OS - std::string tempFolder = amd::Os::getEnvironment("TEMP"); - if (tempFolder.empty()) { - tempFolder = amd::Os::getEnvironment("TMP"); - if (tempFolder.empty()) { - tempFolder = WINDOWS_SWITCH(".","/tmp");; - } - } - //Iterate through each source code and dump it into tmp - std::fstream f; - std::vector headerFileNames(headers.size()); - std::vector newDirs; - for (size_t i = 0; i < headers.size(); ++i) { - std::string headerPath = tempFolder; - std::string headerIncludeName(headerIncludeNames[i]); - // replace / in path with current os's file separator - if ( amd::Os::fileSeparator() != '/') { - for (std::string::iterator it = headerIncludeName.begin(), - end = headerIncludeName.end(); - it != end; - ++it) { - if (*it == '/') *it = amd::Os::fileSeparator(); - } - } - size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator()); - if (pos != std::string::npos) { - headerPath += amd::Os::fileSeparator(); - headerPath += headerIncludeName.substr(0, pos); - headerIncludeName = headerIncludeName.substr(pos+1); - } - if (!amd::Os::pathExists(headerPath)) { - bool ret = amd::Os::createPath(headerPath); - assert(ret && "failed creating path!"); - newDirs.push_back(headerPath); - } - std::string headerFullName - = headerPath + amd::Os::fileSeparator() + headerIncludeName; - headerFileNames[i] = headerFullName; - f.open(headerFullName.c_str(), std::fstream::out); - //Should we allow asserts - assert(!f.fail() && "failed creating header file!"); - f.write(headers[i]->c_str(), headers[i]->length()); - f.close(); - } - - //Create Binary - binaryElf_ = g_complibApi._aclBinaryInit(sizeof(aclBinary), - &target, - &binOpts_, - &errorCode); - - if( errorCode!=ACL_SUCCESS ) { - buildLog_ += "Error while compiling opencl source:\ - aclBinary init failure \n"; - LogWarning("aclBinaryInit failed"); - return false; - } - - //Insert opencl into binary - errorCode = g_complibApi._aclInsertSection(device().compiler(), - binaryElf_, - sourceCode.c_str(), - strlen(sourceCode.c_str()), - aclSOURCE); - - if ( errorCode != ACL_SUCCESS ) { - buildLog_ += "Error while converting to BRIG: \ - Inserting openCl Source \n"; - } - - - //Set the options for the compiler - //Set the include path for the temp folder that contains the includes - if(!headers.empty()) { - this->compileOptions_.append(" -I"); - this->compileOptions_.append(tempFolder); - } - - //Add only for CL2.0 and later - if (options->oVariables->CLStd[2] >= '2') { - std::stringstream opts; - opts << " -D" << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=" - << device().info().maxGlobalVariableSize_; - compileOptions_.append(opts.str()); - } - - //Compile source to IR - this->compileOptions_.append(hsailOptions()); - errorCode = g_complibApi._aclCompile(device().compiler(), - binaryElf_, - //"-Wf,--support_all_extensions", - this->compileOptions_.c_str(), - ACL_TYPE_OPENCL, - ACL_TYPE_LLVMIR_BINARY, - logFunction); - buildLog_ += g_complibApi._aclGetCompilerLog(device().compiler()); - if( errorCode!=ACL_SUCCESS ) { - LogWarning("aclCompile failed"); - buildLog_ += "Error while compiling \ - opencl source: Compiling CL to IR"; - return false; - } - // Save the binary in the interface class - saveBinaryAndSetType(TYPE_COMPILED); - return true; - -} -} -#endif // WITHOUT_GPU_BACKEND diff --git a/rocclr/runtime/device/hsa/hsacompilerlib.cpp b/rocclr/runtime/device/hsa/hsacompilerlib.cpp deleted file mode 100644 index ca568dc683..0000000000 --- a/rocclr/runtime/device/hsa/hsacompilerlib.cpp +++ /dev/null @@ -1,67 +0,0 @@ -#include "hsacompilerlib.hpp" -#include "utils/flags.hpp" - -#include "acl.h" - -namespace oclhsa { - -void* g_complibModule = NULL; -struct CompLibApi g_complibApi; - - -// -// g_complibModule is defined in LoadCompLib(). This macro must be used only in LoadCompLib() function. -// -#define LOADSYMBOL(api) \ - g_complibApi._##api = (pfn_##api) amd::Os::getSymbol(g_complibModule, #api); \ - if( g_complibApi._##api == NULL ) { \ - LogError ("amd::Os::getSymbol() for exported func " #api " failed."); \ - amd::Os::unloadLibrary(g_complibModule); \ - return false; \ - } - - -bool LoadCompLib(bool offline) -{ - g_complibModule = amd::Os::loadLibrary("amdhsacl" LP64_SWITCH(LINUX_SWITCH("32",""), "64")); - if( g_complibModule == NULL ) { - if (!offline) { - LogError( "amd::Os::loadLibrary() for loading of amdhsacl.dll failed."); - } - return false; - } - - LOADSYMBOL(aclCompilerInit) - LOADSYMBOL(aclGetTargetInfo) - LOADSYMBOL(aclBinaryInit) - LOADSYMBOL(aclInsertSection) - LOADSYMBOL(aclCompile) - LOADSYMBOL(aclCompilerFini) - LOADSYMBOL(aclBinaryFini) - LOADSYMBOL(aclExtractSection) - LOADSYMBOL(aclWriteToMem) - LOADSYMBOL(aclQueryInfo) - LOADSYMBOL(aclGetDeviceBinary) - LOADSYMBOL(aclExtractSymbol) - LOADSYMBOL(aclGetCompilerLog) - LOADSYMBOL(aclCreateFromBinary) - LOADSYMBOL(aclReadFromMem) - - LOADSYMBOL(aclRemoveSymbol) - LOADSYMBOL(aclInsertSymbol) - LOADSYMBOL(aclWriteToFile) - LOADSYMBOL(aclBinaryVersion) - LOADSYMBOL(aclLink) - - return true; -} - -void UnloadCompLib() -{ - if( g_complibModule ) - { - amd::Os::unloadLibrary(g_complibModule); - } -} - -} // namespace oclhsa \ No newline at end of file diff --git a/rocclr/runtime/device/hsa/hsacompilerlib.hpp b/rocclr/runtime/device/hsa/hsacompilerlib.hpp deleted file mode 100644 index 1d245ddede..0000000000 --- a/rocclr/runtime/device/hsa/hsacompilerlib.hpp +++ /dev/null @@ -1,92 +0,0 @@ -#ifndef HSACOMPILERLIB_HPP_ -#define HSACOMPILERLIB_HPP_ - -// -// This file hsa the code for explicity loading amdoclcl.dll. -// Exported functions from amdoclcl.dll can be added for usage as need-basis. -// With explicit/dynamic loading oclhsa will not have any linkage to amdoclcl.lib. -// - -#include "thread/thread.hpp" -#include "acl.h" -#include "utils/debug.hpp" - -using namespace amd; - -namespace oclhsa { - -// -// To use any new exported function from amdhsacl.dll please add/make that function specific changes -// in typedef below, struct CompLibApi and in hsacompilerLib.cpp::LoadCompLib() function. -// - -// -// Convention: The typedefed function name must be prefixed with pfn_ -// -typedef aclCompiler* (ACL_API_ENTRY *pfn_aclCompilerInit) (aclCompilerOptions *opts, acl_error *error_code); -typedef aclTargetInfo (ACL_API_ENTRY *pfn_aclGetTargetInfo) (const char*, const char*, acl_error*); -typedef aclBinary* (ACL_API_ENTRY *pfn_aclBinaryInit) (size_t, const aclTargetInfo*, const aclBinaryOptions*, acl_error*); -typedef acl_error (ACL_API_ENTRY *pfn_aclInsertSection) (aclCompiler *cl, aclBinary *binary, const void *data, size_t data_size, aclSections id); -typedef acl_error (ACL_API_ENTRY *pfn_aclCompile) (aclCompiler *cl, aclBinary *bin, const char *options, aclType from, aclType to, aclLogFunction compile_callback); -typedef acl_error (ACL_API_ENTRY *pfn_aclCompilerFini) (aclCompiler *cl); -typedef acl_error (ACL_API_ENTRY *pfn_aclBinaryFini) (aclBinary *bin); -typedef const void* (ACL_API_ENTRY *pfn_aclExtractSection) (aclCompiler *cl, const aclBinary *binary, size_t *size, aclSections id, acl_error *error_code); -typedef acl_error (ACL_API_ENTRY *pfn_aclWriteToMem) (aclBinary *bin,void **mem, size_t *size); -typedef acl_error (ACL_API_ENTRY *pfn_aclQueryInfo) (aclCompiler *cl, const aclBinary *binary, aclQueryType query, const char *kernel, void *data_ptr, size_t *ptr_size); - - -typedef const void* (ACL_API_ENTRY *pfn_aclGetDeviceBinary) (aclCompiler *cl,const aclBinary *bin,const char *kernel,size_t *size,acl_error *error_code); -typedef const void* (ACL_API_ENTRY *pfn_aclExtractSymbol) (aclCompiler *cl,const aclBinary *binary,size_t *size,aclSections id,const char *symbol,acl_error *error_code); -typedef aclBinary* (ACL_API_ENTRY *pfn_aclReadFromMem) (void *mem,size_t size, acl_error *error_code); -typedef acl_error (ACL_API_ENTRY *pfn_aclRemoveSymbol) (aclCompiler *cl, aclBinary *binary, aclSections id, const char *symbol); -typedef acl_error (ACL_API_ENTRY *pfn_aclInsertSymbol) (aclCompiler *cl, aclBinary *binary, const void *data, size_t data_size, aclSections id, const char *symbol); -typedef acl_error (ACL_API_ENTRY *pfn_aclWriteToFile) (aclBinary *bin, const char *str); - - -typedef char* (ACL_API_ENTRY *pfn_aclGetCompilerLog) (aclCompiler* cl); -typedef aclBinary* (ACL_API_ENTRY *pfn_aclCreateFromBinary) (const aclBinary *binary,aclBIFVersion version); -typedef aclBIFVersion (ACL_API_ENTRY *pfn_aclBinaryVersion) (const aclBinary *binary); -typedef acl_error (ACL_API_ENTRY *pfn_aclLink) (aclCompiler* cl, aclBinary *src_bin, unsigned int num_libs, aclBinary **libs, aclType link_mode,const char* options, aclLogFunction link_callback); -// -// Convention: prefix struct member variable with with underscore '_' -// would be nice if there was no underscore prfix, but on Linux the token -// pasting in the macro is srtict and his is the workaround. -// -struct CompLibApi -{ - pfn_aclCompilerInit _aclCompilerInit; - pfn_aclGetTargetInfo _aclGetTargetInfo; - pfn_aclBinaryInit _aclBinaryInit; - pfn_aclInsertSection _aclInsertSection; - pfn_aclCompile _aclCompile; - pfn_aclCompilerFini _aclCompilerFini; - pfn_aclBinaryFini _aclBinaryFini; - pfn_aclExtractSection _aclExtractSection; - pfn_aclWriteToMem _aclWriteToMem; - pfn_aclQueryInfo _aclQueryInfo; - pfn_aclGetDeviceBinary _aclGetDeviceBinary; - pfn_aclExtractSymbol _aclExtractSymbol; - pfn_aclReadFromMem _aclReadFromMem; - pfn_aclRemoveSymbol _aclRemoveSymbol; - pfn_aclInsertSymbol _aclInsertSymbol; - pfn_aclWriteToFile _aclWriteToFile; - pfn_aclGetCompilerLog _aclGetCompilerLog; - pfn_aclCreateFromBinary _aclCreateFromBinary; - pfn_aclBinaryVersion _aclBinaryVersion; - pfn_aclLink _aclLink; -}; - - -// -// Use g_ prefix for all global variables. -// -extern void* g_complibModule; -extern CompLibApi g_complibApi; - -// Note: initializes global variable g_complibApi. -// Not sure what error values we have, for now returning false on failure. -bool LoadCompLib(bool isOfflineDevice=false); -void UnloadCompLib(); - -} // namespace oclhsa -#endif diff --git a/rocclr/runtime/device/hsa/hsacore_symbol_loader.cpp b/rocclr/runtime/device/hsa/hsacore_symbol_loader.cpp deleted file mode 100644 index d71b16d089..0000000000 --- a/rocclr/runtime/device/hsa/hsacore_symbol_loader.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// -// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved. -// - -// Implementation of the the loading of dll and loading of all the exported -// function symbols. - - -#include "runtime/device/hsa/hsacore_symbol_loader.hpp" - -#include "runtime/thread/thread.hpp" -#include "runtime/utils/debug.hpp" -#include "runtime/os/os.hpp" - -#include -#include - -HsacoreApiSymbols* HsacoreApiSymbols::instance_ = NULL; -// hsacore_dll_handle_ is defined in HsacoreApiSymbols class. -// This macro must be used only in member functions of HsacoreApiSymbols -// class. -#define LOADSYMBOL(api) \ - api = (pfn_ ## api) amd::Os::getSymbol(hsacore_dll_handle_, # api); \ - if (api == NULL) { \ - amd::log_printf(amd::LOG_ERROR, __FILE__, __LINE__, \ - "amd::Os::getSymbol() for exported func " # api " failed."); \ - amd::Os::unloadLibrary(hsacore_dll_handle_); \ - abort(); \ - } - -HsacoreApiSymbols::HsacoreApiSymbols() - : hsacore_dll_name_(HSACORE_DLL_NAME) { - hsacore_dll_handle_ = amd::Os::loadLibrary(hsacore_dll_name_.c_str()); - if( hsacore_dll_handle_ == NULL) { - // Do not print, otherwise tests fail when HSA core and services DLLs are - // not installed, in which case only ORCA stack is initialized and it is - // not an error.. - //amd::log_printf(amd::LOG_INFO, __FILE__, __LINE__, - // "Cannot load hsa core dll. HSA DLLs may not be installed on the machine." - // " OpenCL requirement, returning without error."); - return; - } - - LOADSYMBOL(HsaGetCoreApiTable) -} - -HsacoreApiSymbols::~HsacoreApiSymbols() { - if (hsacore_dll_handle_) { - amd::Os::unloadLibrary(hsacore_dll_handle_); - hsacore_dll_handle_ = NULL; - } -} - diff --git a/rocclr/runtime/device/hsa/hsacore_symbol_loader.hpp b/rocclr/runtime/device/hsa/hsacore_symbol_loader.hpp deleted file mode 100644 index 4133ecdfdf..0000000000 --- a/rocclr/runtime/device/hsa/hsacore_symbol_loader.hpp +++ /dev/null @@ -1,75 +0,0 @@ -// -// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved. -// - -#ifndef _OPENCL_RUNTIME_DEVICE_HSA_HSACORE_SYMBOL_LOADER_HPP_ -#define _OPENCL_RUNTIME_DEVICE_HSA_HSACORE_SYMBOL_LOADER_HPP_ - -// File: hsacore_symbol_loader.hpp -// The main purpose of this file (class HsacoreApiSymbols), is to load the HSA -// API function symbol HsaGetCoreApiTable() from hsacore DLL/so module. -// This function outputs HsaCoreApiTable which has pointers to the rest of the -// hsacore API functions, which should be used to invoke the API functions. - -#include "newcore.h" -#include "hsacoreagent.h" - -#include - -// In case of change in the name of hsacore dll name, change the -// #define HSACORE_DLL_NAME value. this is the only place the DLL name should -// be changed or referred to. -#define HSACORE_DLL_NAME "newhsacore" LP64_ONLY("64") - -// Convention: The typedefed function name must be prefixed with pfn_ indicating -// it as pointer-to-function. -typedef HsaStatus (*pfn_HsaGetCoreApiTable)(const HsaCoreApiTable **api_table); - - -// Singleton HsacoreApiSymbols class contains the module handle and loaded -// symbols of one accessor API accessor function. -// To call hsacore API funciton, instance of this class must be used. -// Example: -// // In initialization code -// const HsaCoreApiTable *hsacoreapi = NULL; -// HsacoreApiSymbols::Instance().HsaGetCoreApiTable(&hsacoreapi); -// ... -// ... -// // Calling the core api. -// hsacoreapi->HsaGetDevices(...); -// hsacoreapi->HsaRegisterMemory(...); -class HsacoreApiSymbols { - public: - // Only the access function symbol is loaded, which in turn has pointers to - // rest of the hsacore api. - pfn_HsaGetCoreApiTable HsaGetCoreApiTable; - - static HsacoreApiSymbols &Instance() { - if (instance_ == NULL) { - instance_ = new HsacoreApiSymbols(); - } - return *instance_; - } - static void teardown(){ - if (instance_ != NULL){ - delete instance_; - } - } - static bool IsDllLoaded() { - return Instance().hsacore_dll_handle_ ? true : false; - }; - - private: - - static HsacoreApiSymbols* instance_; - // Force singleton pattern.export LD_LIBRAR - explicit HsacoreApiSymbols(); - ~HsacoreApiSymbols(); - HsacoreApiSymbols(const HsacoreApiSymbols &) {} - const HsacoreApiSymbols &operator=(const HsacoreApiSymbols &) {return *this; } - - // Data. - void *hsacore_dll_handle_; - const std::string hsacore_dll_name_; -}; -#endif // header guard diff --git a/rocclr/runtime/device/hsa/hsacounters.cpp b/rocclr/runtime/device/hsa/hsacounters.cpp deleted file mode 100644 index 448d7f6d03..0000000000 --- a/rocclr/runtime/device/hsa/hsacounters.cpp +++ /dev/null @@ -1,144 +0,0 @@ -// -// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved. -// - -#include "device/hsa/oclhsa_common.hpp" -#include "device/hsa/hsacounters.hpp" -#include "device/hsa/hsavirtual.hpp" - -namespace oclhsa { - -PerfCounter::~PerfCounter() -{ - // Destroy the corresponding HSA counter object - HsaStatus status; - status = servicesapi->HsaPmuDestroyCounter(counter_block_, counter_); - if (status != kHsaStatusSuccess) { - LogError("Destroy counter failed"); - return; - } - - // If no enabled counter corresponding to the PMU, - // Release the PMU - uint32_t counter_num; - if (!getEnabledCounterNum(counter_num)) { - LogError("getEnabledCounterNum failed"); - return; - } - - if (counter_num == 0) { - status = servicesapi->HsaReleasePmu(hsaPmu_); - if (status != kHsaStatusSuccess) { - LogError("Destroy pmu failed"); - return; - } - } -} - -bool -PerfCounter::create(HsaPmu hsaPmu) -{ - HsaStatus status; - hsaPmu_ = hsaPmu; - uint32_t blockIndex = static_cast(info()->blockIndex_); - status = servicesapi->HsaPmuGetCounterBlockById(hsaPmu_, blockIndex, &counter_block_); - if (status != kHsaStatusSuccess) { - LogError("HsaPmuGetCounterBlockById, failed"); - return false; - } - - status = servicesapi->HsaPmuCreateCounter(counter_block_, &counter_); - if (status != kHsaStatusSuccess) { - LogPrintfError("HsaPmuCreateCounter, failed.\ - Block: %d, counter: #d, event: %d", - info()->blockIndex_, - info()->counterIndex_, - info()->eventIndex_); - - return false; - } - - status = servicesapi->HsaPmuCounterSetEnabled(counter_, true); - if (status != kHsaStatusSuccess) { - LogError("HsaPmuCounterSetEnabled, failed"); - return false; - } - - uint32_t eventIndex = static_cast(info()->eventIndex_); - status = servicesapi->HsaPmuCounterSetParameter(counter_, - kHsaCounterParameterEventIndex, - sizeof(uint32_t), (void *)&eventIndex); - if (status != kHsaStatusSuccess) { - LogError("HsaPmuCounterSetParameter, failed"); - return false; - } - - return true; -} - -uint64_t -PerfCounter::getInfo(uint64_t infoType) const -{ - switch (infoType) { - case CL_PERFCOUNTER_GPU_BLOCK_INDEX: { - // Return the GPU block index - return info()->blockIndex_; - } - case CL_PERFCOUNTER_GPU_COUNTER_INDEX: { - // Return the GPU counter index - return info()->counterIndex_; - } - case CL_PERFCOUNTER_GPU_EVENT_INDEX: { - // Return the GPU event index - return info()->eventIndex_; - } - case CL_PERFCOUNTER_DATA: { - HsaStatus status; - uint64_t counterValue; - status = servicesapi->HsaPmuCounterGetResult(counter_, &counterValue); - if (status != kHsaStatusSuccess) { - LogError("HsaPmuCounterGetResult, failed"); - } - return counterValue; - } - default: - LogError("Wrong PerfCounter::getInfo parameter"); - } - - return 0; -} - -bool -PerfCounter::getEnabledCounterNum(uint32_t &counter_num) -{ - // Collect all the program counter blocks - uint32_t counterblock_num, num; - uint32_t i; - HsaStatus status; - HsaCounter *pp_counters; - HsaCounterBlock *pp_counterblocks; - status = servicesapi->HsaPmuGetAllCounterBlocks(hsaPmu_, - &pp_counterblocks, - &counterblock_num); - if (status != kHsaStatusSuccess) { - LogError("HsaPmuGetAllCounterBlocks, failed"); - return false; - } - - counter_num = 0; - for (i = 0; i < counterblock_num; i++) { - // Retrieve all enabled pp_counters in each counter block - status = servicesapi->HsaPmuGetEnabledCounters(pp_counterblocks[i], - &pp_counters, &num); - if (status != kHsaStatusSuccess) { - LogError("HsaPmuGetEnabledCounters, failed"); - return false; - } - counter_num += num; - } - - return true; -} - - -} // namespace oclhsa diff --git a/rocclr/runtime/device/hsa/hsacounters.hpp b/rocclr/runtime/device/hsa/hsacounters.hpp deleted file mode 100644 index 3f6669e98a..0000000000 --- a/rocclr/runtime/device/hsa/hsacounters.hpp +++ /dev/null @@ -1,103 +0,0 @@ -// -// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved. -// -#ifndef HSACOUNTERS_HPP_ -#define HSACOUNTERS_HPP_ - -#include "top.hpp" -#include "device/device.hpp" -#include "device/hsa/hsadevice.hpp" - -namespace oclhsa { - -class VirtualGPU; - -//! Performance counter implementation on GPU -class PerfCounter : public device::PerfCounter -{ -public: - //! The performance counter info - struct Info : public amd::EmbeddedObject - { - uint blockIndex_; //!< Index of the block to configure - uint counterIndex_; //!< Index of the hardware counter - uint eventIndex_; //!< Event you wish to count with the counter - }; - - //! The PerfCounter flags - enum Flags - { - BeginIssued = 0x00000001, - EndIssued = 0x00000002, - ResultReady = 0x00000004 - }; - - //! Constructor for the GPU PerfCounter object - PerfCounter( - const HsaDevice *device, //!< A GPU device object - const VirtualGPU& gpu, //!< Virtual GPU device object - cl_uint blockIndex, //!< HW block index - cl_uint counterIndex, //!< Counter index within the block - cl_uint eventIndex) //!< Event index for profiling - : gpuDevice_(device) - , gpu_(gpu) - , hsaPmu_(NULL) - , flags_(0) - , counter_(0) - , index_(0) - { - info_.blockIndex_ = blockIndex; - info_.counterIndex_ = counterIndex; - info_.eventIndex_ = eventIndex; - } - - //! Destructor for the GPU PerfCounter object - virtual ~PerfCounter(); - - //! Creates the counter object - bool create( - HsaPmu hsaPmu //!< Reference counter - ); - - //! Returns the specific information about the counter - uint64_t getInfo( - uint64_t infoType //!< The type of returned information - ) const; - - //! Returns the GPU device, associated with the current object - const HsaDevice * dev() const { return gpuDevice_; } - - //! Returns the virtual GPU device - const VirtualGPU& gpu() const { return gpu_; } - - //! Returns the CAL performance counter descriptor - const Info* info() const { return &info_; } - - //! Returns the Info structure for performance counter - HsaPmu getCounterPmu() const { return hsaPmu_; } - -private: - //! Disable default copy constructor - PerfCounter(const PerfCounter&); - - //! Disable default operator= - PerfCounter& operator=(const PerfCounter&); - - //! Get enabled counter number - bool getEnabledCounterNum(uint32_t &counter_num); - - const HsaDevice *gpuDevice_; //!< The backend device - const VirtualGPU& gpu_; //!< The virtual GPU device object - - HsaPmu hsaPmu_; //!< Hsa pmu - uint flags_; //!< The perfcounter object state - Info info_; //!< The info structure for perfcounter - HsaCounter counter_; //!< HSA counter object - HsaCounterBlock counter_block_; //!< counter block that the counter belongs to - uint index_; //!< Counter index in the CAL container -}; - -} // namespace oclhsa - -#endif // HSACOUNTERS_HPP_ - diff --git a/rocclr/runtime/device/hsa/hsadefs.hpp b/rocclr/runtime/device/hsa/hsadefs.hpp deleted file mode 100644 index 35a9964a7a..0000000000 --- a/rocclr/runtime/device/hsa/hsadefs.hpp +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _OPENCL_RUNTIME_DEVICE_HSA_HSADEFS_HPP_ -#define _OPENCL_RUNTIME_DEVICE_HSA_HSADEFS_HPP_ - -#ifndef WITHOUT_FSA_BACKEND - -namespace oclhsa { - -typedef uint HsaDeviceId; - -struct AMDDeviceInfo { - HsaDeviceId hsaDeviceId_; //!< Machine id - const char* targetName_; //!< Target name for compilation - const char* machineTarget_; //!< Machine target - uint simdPerCU_; //!< Number of SIMDs per CU - uint simdWidth_; //!< Number of workitems processed per SIMD - uint simdInstructionWidth_; //!< Number of instructions processed per SIMD - uint memChannelBankWidth_; //!< Memory channel bank width - uint localMemSizePerCU_; //!< Local memory size per CU - uint localMemBanks_; //!< Number of banks of local memory -}; - -//The device ID must match with the device's index into DeviceInfo -const HsaDeviceId HSA_SPECTRE_ID = 0; -const HsaDeviceId HSA_SPOOKY_ID = 1; -const HsaDeviceId HSA_TONGA_ID = 2; -const HsaDeviceId HSA_CARRIZO_ID = 3; -const HsaDeviceId HSA_ICELAND_ID = 4; -const HsaDeviceId HSA_INVALID_DEVICE_ID = -1; - -static const AMDDeviceInfo DeviceInfoTable[] = { - // targetName machineTarget -/* TARGET_KAVERI_SPECTRE */ {HSA_SPECTRE_ID, "Spectre", "Spectre", 4, 16, 1, 256, 64 * Ki, 32 }, -/* TARGET_KAVERI_SPOOKY */ {HSA_SPOOKY_ID, "Spooky", "Spooky", 4, 16, 1, 256, 64 * Ki, 32 }, -/* TARGET_TONGA */ {HSA_TONGA_ID, "Tonga", "Tonga", 4, 16, 1, 256, 64 * Ki, 32}, -/* TARGET_CARRIZO */ {HSA_CARRIZO_ID, "Carrizo", "Carrizo", 4, 16, 1, 256, 64 * Ki, 32}, -/* TARGET_ICELAND */ {HSA_ICELAND_ID, "Topaz", "Topaz", 4, 16, 1, 256, 64 * Ki, 32} -}; - - -} -#endif -#endif \ No newline at end of file diff --git a/rocclr/runtime/device/hsa/hsadevice.cpp b/rocclr/runtime/device/hsa/hsadevice.cpp deleted file mode 100644 index 20356d3227..0000000000 --- a/rocclr/runtime/device/hsa/hsadevice.cpp +++ /dev/null @@ -1,896 +0,0 @@ -// -// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. -// - - -#ifndef WITHOUT_FSA_BACKEND - - -#include "platform/program.hpp" -#include "platform/kernel.hpp" -#include "os/os.hpp" -#include "utils/debug.hpp" -#include "utils/flags.hpp" -#include "utils/versions.hpp" -#include "thread/monitor.hpp" -#include "CL/cl_ext.h" - -#include "newcore.h" - -#include "amdocl/cl_common.hpp" -#include "device/hsa/hsadevice.hpp" -#include "device/hsa/hsavirtual.hpp" -#include "device/hsa/hsaprogram.hpp" -#include "device/hsa/hsablit.hpp" -#include "device/hsa/hsacompilerlib.hpp" -#include "device/hsa/hsamemory.hpp" -#include "hsacore_symbol_loader.hpp" -#include "device/hsa/oclhsa_common.hpp" -#include "kv_id.h" -#include "vi_id.h" -#include "cz_id.h" -#include "hsainterop.h" - -#include -#include -#include "CL/cl_gl.h" - -#ifdef _WIN32 -#include "CL/cl_d3d10.h" -#endif // _WIN32 - -#include -#include -#include -#include -#include -#include -#endif // WITHOUT_FSA_BACKEND - -const HsaCoreApiTable *hsacoreapi = NULL; -const HsaServicesApiTable *servicesapi = NULL; -#define OPENCL_VERSION_STR XSTR(OPENCL_MAJOR) "." XSTR(OPENCL_MINOR) - -#ifndef WITHOUT_FSA_BACKEND -namespace device { -extern const char* BlitSourceCode; -} - -namespace oclhsa { - -aclCompiler* NullDevice::compilerHandle_; -bool oclhsa::Device::isHsaInitialized_ = false; -const bool oclhsa::Device::offlineDevice_ = false; -const bool oclhsa::NullDevice::offlineDevice_= true; - -static HsaDeviceId getHsaDeviceId(const HsaDevice *device) { - /* - * Use the device id to determine the ASIC family - */ - switch (device->device_id) { - case DEVICE_ID_SPECTRE_MOBILE: - case DEVICE_ID_SPECTRE_DESKTOP: - case DEVICE_ID_SPECTRE_LITE_MOBILE_1309: - case DEVICE_ID_SPECTRE_LITE_MOBILE_130A: - case DEVICE_ID_SPECTRE_SL_MOBILE_130B: - case DEVICE_ID_SPECTRE_MOBILE_130C: - case DEVICE_ID_SPECTRE_LITE_MOBILE_130D: - case DEVICE_ID_SPECTRE_SL_MOBILE_130E: - case DEVICE_ID_SPECTRE_DESKTOP_130F: - case DEVICE_ID_SPECTRE_WORKSTATION_1310: - case DEVICE_ID_SPECTRE_WORKSTATION_1311: - case DEVICE_ID_SPECTRE_LITE_DESKTOP_1313: - case DEVICE_ID_SPECTRE_SL_DESKTOP_1315: - case DEVICE_ID_SPECTRE_SL_MOBILE_1318: - case DEVICE_ID_SPECTRE_SL_EMBEDDED_131B: - case DEVICE_ID_SPECTRE_EMBEDDED_131C: - case DEVICE_ID_SPECTRE_LITE_EMBEDDED_131D: - return HSA_SPECTRE_ID; - case DEVICE_ID_SPOOKY_MOBILE: - case DEVICE_ID_SPOOKY_DESKTOP: - case DEVICE_ID_SPOOKY_DESKTOP_1312: - case DEVICE_ID_SPOOKY_DESKTOP_1316: - case DEVICE_ID_SPOOKY_MOBILE_1317: - return HSA_SPOOKY_ID; - case DEVICE_ID_VI_TONGA_P_6920: - case DEVICE_ID_VI_TONGA_P_6921: - case DEVICE_ID_VI_TONGA_P_6928: - case DEVICE_ID_VI_TONGA_P_692B: - case DEVICE_ID_VI_TONGA_P_692F: - case DEVICE_ID_VI_TONGA_P_6938: - case DEVICE_ID_VI_TONGA_P_6939: - return HSA_TONGA_ID; - case DEVICE_ID_CZ_9870: - case DEVICE_ID_CZ_9874: - case DEVICE_ID_CZ_9875: - case DEVICE_ID_CZ_9876: - case DEVICE_ID_CZ_9877: - return HSA_CARRIZO_ID; - case DEVICE_ID_VI_ICELAND_M_6900: - case DEVICE_ID_VI_ICELAND_M_6901: - case DEVICE_ID_VI_ICELAND_M_6902: - case DEVICE_ID_VI_ICELAND_M_6903: - case DEVICE_ID_VI_ICELAND_M_6907: - return HSA_ICELAND_ID; - default: - return HSA_INVALID_DEVICE_ID; - } -} -bool NullDevice::create(const AMDDeviceInfo& deviceInfo) { - online_ = false; - deviceInfo_ = deviceInfo; - // Mark the device as GPU type - info_.type_ = CL_DEVICE_TYPE_GPU | CL_HSA_ENABLED_AMD; - info_.vendorId_ = 0x1002; - - settings_ = new Settings(); - oclhsa::Settings* hsaSettings = static_cast(settings_); - if ((hsaSettings == NULL) || - // @Todo sramalin Use double precision from constsant - !hsaSettings->create((true) & 0x1)) { - LogError("Error creating settings for NULL HSA device"); - return false; - } - // Report the device name - ::strcpy(info_.name_, deviceInfo_.machineTarget_); - info_.extensions_ = getExtensionString(); - info_.maxWorkGroupSize_ = hsaSettings->maxWorkGroupSize_; - ::strcpy(info_.vendor_, "Advanced Micro Devices, Inc."); - info_.oclcVersion_ = "OpenCL C " OPENCL_VERSION_STR " "; - std::string driverVersion = AMD_BUILD_STRING; - driverVersion.append(" (HSA)"); - strcpy(info_.driverVersion_, driverVersion.c_str()); - info_.version_ = "OpenCL " OPENCL_VERSION_STR " "; - return true; -} - -Device::Device(const HsaDevice *bkendDevice) - : _bkendDevice(bkendDevice), context_(NULL), xferQueue_(NULL) -{ -} - -Device::~Device() -{ - // Destroy transfer queue - if (xferQueue_ && xferQueue_->terminate()) { - delete xferQueue_; - xferQueue_ = NULL; - } - - if (blitProgram_) { - delete blitProgram_; - blitProgram_ = NULL; - } - - if (context_ != NULL) { - context_->release(); - } - - if (info_.extensions_) { - delete[]info_.extensions_; - info_.extensions_ = NULL; - } - - if (settings_) { - delete settings_; - settings_ = NULL; - } -} -bool NullDevice::initCompiler(bool isOffline) { - // Initializes g_complibModule and g_complibApi if they were not initialized - if( g_complibModule == NULL ){ - if (!LoadCompLib(isOffline)) { - if (!isOffline) { - LogError("Error - could not find the compiler library"); - } - return false; - } - } - //Initialize the compiler handle if has already not been initialized - //This is destroyed in Device::teardown - acl_error error; - if (!compilerHandle_) { - compilerHandle_ = g_complibApi._aclCompilerInit(NULL, &error); - if (error != ACL_SUCCESS) { - LogError("Error initializing the compiler handle"); - return false; - } - } - return true; -} - -bool NullDevice::destroyCompiler() { - if (compilerHandle_ != NULL) { - acl_error error = g_complibApi._aclCompilerFini(compilerHandle_); - if (error != ACL_SUCCESS) { - LogError("Error closing the compiler"); - return false; - } - } - if( g_complibModule != NULL ){ - UnloadCompLib(); - } - return true; -} - -void NullDevice::tearDown() { - destroyCompiler(); -} -bool NullDevice::init() { - //Initialize the compiler - if (!initCompiler(offlineDevice_)){ - return false; - } - //If there is an HSA enabled device online then skip any offline device - std::vector devices; - devices = getDevices(CL_DEVICE_TYPE_GPU | CL_HSA_ENABLED_AMD, false); - - //Load the offline devices - //Iterate through the set of available offline devices - for (uint id = 0; id < sizeof(DeviceInfoTable)/sizeof(AMDDeviceInfo); id++) { - bool isOnline = false; - //Check if the particular device is online - for (unsigned int i=0; i< devices.size(); i++) { - if (static_cast(devices[i])->deviceInfo_.hsaDeviceId_ == - DeviceInfoTable[id].hsaDeviceId_){ - isOnline = true; - } - } - if (isOnline) { - continue; - } - NullDevice* nullDevice = new NullDevice(); - if (!nullDevice->create(DeviceInfoTable[id])) { - LogError("Error creating new instance of Device."); - delete nullDevice; - return false; - } - nullDevice->registerDevice(); - } - return true; -} -NullDevice::~NullDevice() { - if (info_.extensions_) { - delete[]info_.extensions_; - info_.extensions_ = NULL; - } - - if (settings_) { - delete settings_; - settings_ = NULL; - } -} -bool Device::init() { - // Assumption: init() will be called by ocl only once at the start of program - // with a matching tearDown() when program exits. - // TODO(papte) Check if init(), - // tearDown(), init(), tearDown() repeat sequence is possible in one session - // (process lifetime). If so we will be calling LoadLibrary() and - // FreeLibrary() ifcn the similar repeat sequence. Investigate the effect of - // this on the HSA Device and Core runtime's initialzers, where the device list - // is generated in the runtime. -#ifdef BUILD_STATIC_HSA - HsaGetCoreApiTable(&hsacoreapi); - HsaGetServicesApiTable(&servicesapi); -#else - bool core_dll_loaded = HsacoreApiSymbols::Instance().IsDllLoaded(); - bool service_dll_loaded = ServicesApiSymbols::Instance().IsDllLoaded(); - - if (!core_dll_loaded && !service_dll_loaded ) { - // Both DLLs are not loaded, assume HSA not installed on a non-HSA - // machine, returning true. - LogInfo("HSA stack not available."); - return true; // Return true, indicating nothing is wrong and - // assuming HSA not installed. - } else if (core_dll_loaded ^ service_dll_loaded) { - // If Only one of the two HSA DLLs failed, then its an ERROR. - LogError("One of the HSA libraies, core or services failed to load.\n"); - return false; - } else { - // Both DLLs loaded, continue initializing HSA stack. - LogInfo("Initializing HSA stack."); - } - - // First thing first, initialize hsacoreapi and servicesapi to call core and - // services API respectively. - HsacoreApiSymbols::Instance().HsaGetCoreApiTable(&hsacoreapi); - ServicesApiSymbols::Instance().HsaGetServicesApiTable(&servicesapi); -#endif - isHsaInitialized_ = false; - if (hsacoreapi->HsaAmdInitialize() != kHsaStatusSuccess) { - // Either an error in HSA core initialization or - // KFD not installed on the machine. - // Return without error, so OpenCL can continue without HSA stack. - return true; - } - isHsaInitialized_ = true; - - // Initialize the structure used to configure the - // behavior of Hsa Runtime - // TODO (PA) : verify if this ito be called or not. - // Latest code does not call. - // SetHsaEnvConfig(); - - //Initialize the compiler - if (!initCompiler(offlineDevice_)){ - return false; - } - - const HsaDevice *devices = NULL; - unsigned num_devices = 0; - - // Initialize the Hsa Service layer - servicesapi->HsaInitServices(128); - - HsaStatus status = hsacoreapi->HsaGetDevices(&num_devices, &devices); - if (status != kHsaStatusSuccess) { - LogPrintfError( - "in %s(), Call to newcore HsaGetDevices() failed, HsaStatus: %d", - __FUNCTION__, status); - return false; - } - - for (unsigned int i = 0; i < num_devices; i++) { - Device *oclhsa_device = new Device(&devices[i]); - if (!oclhsa_device) { - LogError("Error creating new instance of Device on then heap."); - return false; - } - HsaDeviceId deviceId = getHsaDeviceId(&devices[i]); - if (deviceId == HSA_INVALID_DEVICE_ID) { - LogError(" Invalid HSA device"); - return false; - } - //Find device id in the table - unsigned sizeOfTable = sizeof(DeviceInfoTable)/sizeof(AMDDeviceInfo); - uint id; - for (id = 0; id < sizeOfTable; id++) { - if (DeviceInfoTable[id].hsaDeviceId_ == deviceId){ - break; - } - } - //If the AmdDeviceInfo for the HsaDevice Id could not be found return false - if (id == sizeOfTable) { - return false; - } - oclhsa_device->deviceInfo_ = DeviceInfoTable[id]; - - if (!oclhsa_device->mapHSADeviceToOpenCLDevice(&devices[i])) { - LogError("Failed mapping of HsaDevice to Device."); - return false; - } - - if (!oclhsa_device->create()) { - LogError("Error creating new instance of Device."); - return false; - } - oclhsa_device->registerDevice(); // no return code for this function - } - return true; -} - -void -Device::tearDown() -{ - if (isHsaInitialized_) { - if (servicesapi != NULL && servicesapi->HsaDestroyServices != NULL) { - servicesapi->HsaDestroyServices(); - } - hsacoreapi->HsaAmdShutdown(); - } - NullDevice::tearDown(); - HsacoreApiSymbols::teardown(); - ServicesApiSymbols::teardown(); -} - -bool -Device::create() -{ - amd::Context::Info info = {0}; - std::vector devices; - devices.push_back(this); - - // Create a dummy context - context_ = new amd::Context(devices, info); - if (context_ == NULL) { - return false; - } - - blitProgram_ = new BlitProgram(context_); - // Create blit programs - if (blitProgram_ == NULL || !blitProgram_->create(this)) { - delete blitProgram_; - blitProgram_ = NULL; - LogError("Couldn't create blit kernels!"); - return false; - } - - return true; -} - -oclhsa::Memory* -Device::getOclHsaMemory(amd::Memory* mem) const -{ - return static_cast(mem->getDeviceMemory(*this)); -} - -device::Program* -NullDevice::createProgram(bool hsail) { - return new oclhsa::FSAILProgram(*this); -} - -device::Program* -Device::createProgram(bool hsail) { - return new oclhsa::FSAILProgram(*this); -} - -cl_device_svm_capabilities -Device::getSvmCapabilities(const HsaDevice* device) -{ - // KV supports all types of SVM - if (device->device_id >= DEVICE_ID_SPECTRE_MOBILE && - device->device_id <= DEVICE_ID_SPECTRE_EMBEDDED_131C) { - - cl_bitfield atomics = CL_DEVICE_SVM_ATOMICS; - // Atomics are allowed in 32 bits if a environment variable is set - if (Is32Bits() && !settings().enableSvm32BitsAtomics_) { - atomics = 0; - } - return CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | - CL_DEVICE_SVM_FINE_GRAIN_BUFFER | - CL_DEVICE_SVM_FINE_GRAIN_SYSTEM | - atomics; - } - // Devices such as Bonaire enable some HSA features but they do not include - // CL_DEVICE_SVM_FINE_GRAIN_SYSTEM (because of addresses above 2^40) or - // CL_DEVICE_SVM_ATOMICS capabilities. - return CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | - CL_DEVICE_SVM_FINE_GRAIN_BUFFER; -} - -bool -Device::mapHSADeviceToOpenCLDevice(const HsaDevice *dev) -{ - // Create HSA settings - settings_ = new Settings(); - oclhsa::Settings* hsaSettings = static_cast(settings_); - if ((hsaSettings == NULL) || - !hsaSettings->create((dev->is_double_precision) & 0x1)) { - return false; - } - // Report the device name - ::strcpy(info_.name_, deviceInfo_.machineTarget_); - strcpy(info_.boardName_, dev->device_name); - - if (dev->number_cache_descriptors != 0) { - HsaCacheDescriptor* cacheDesc = dev->cache_descriptors; - info_.globalMemCacheLineSize_ = cacheDesc->cache_line_size; - info_.globalMemCacheSize_ = cacheDesc->cache_size * Ki; - - info_.globalMemCacheType_ = (cacheDesc->cache_type.value == 0) ? - CL_NONE : CL_READ_WRITE_CACHE; - } - else { - info_.globalMemCacheType_ = CL_NONE; - info_.globalMemCacheLineSize_ = 0; - info_.globalMemCacheSize_ = 0; - } - - // Map HSA device types to OCL device types. - // if (dev->device_type == kHsaDeviceTypeThroughput) - info_.type_ = CL_DEVICE_TYPE_GPU | CL_HSA_ENABLED_AMD; - - info_.maxComputeUnits_ = dev->number_compute_units; - info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD; - info_.deviceTopology_.pcie.bus = (dev->location_id&(0xFF<<8))>>8; - info_.deviceTopology_.pcie.device = (dev->location_id&(0x1F<<3))>>3; - info_.deviceTopology_.pcie.function = (dev->location_id&0x07); - info_.extensions_ = getExtensionString(); - info_.nativeVectorWidthDouble_ = - info_.preferredVectorWidthDouble_ = (settings().doublePrecision_) ? 1 : 0; - - info_.maxWorkGroupSize_ = dev->wave_front_size * dev->max_waves_per_simd; - info_.maxClockFrequency_ = dev->max_clock_rate_of_f_compute; - //info_.imageSupport_ = dev->is_image_support; - info_.imageSupport_ = false; - - info_.localMemSizePerCU_ = dev->group_memory_size; - - if (populateOCLDeviceConstants() == false) { - return false; - } - - // Populate the single config setting. - info_.singleFPConfig_ = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | - CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_FMA; - - if (hsaSettings->doublePrecision_) { - info_.doubleFPConfig_ = info_.singleFPConfig_ | CL_FP_DENORM; - info_.singleFPConfig_ |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; - } - - info_.svmCapabilities_ = getSvmCapabilities(dev); - info_.preferredPlatformAtomicAlignment_ = 0; - info_.preferredGlobalAtomicAlignment_ = 0; - info_.preferredLocalAtomicAlignment_ = 0; - - return true; -} - -static bool -isFrameBufferDescriptor(HsaMemoryDescriptor &desc) -{ - return (desc.heap_type == kHsaHeapTypeFrameBufferPrivate); -} - -bool -Device::populateOCLDeviceConstants() -{ - info_.available_ = true; - /*info_.maxWorkGroupSize_ = 256;*/ - info_.maxWorkItemDimensions_ = 3; - - // Get frame buffer memory descriptor. - HsaMemoryDescriptor *memDescBegin = _bkendDevice->memory_descriptors; - HsaMemoryDescriptor *memDescEnd = - memDescBegin + _bkendDevice->number_memory_descriptors; - HsaMemoryDescriptor *hsaFbDesc = - std::find_if(memDescBegin, memDescEnd, isFrameBufferDescriptor); - - if ((hsaFbDesc != memDescEnd) && (hsaFbDesc->size_in_bytes > 0)) { - // Device local memory exists. Populate OpenCL info field with - // attributes of HSA GPU local memory descriptor. - info_.globalMemSize_ = hsaFbDesc->size_in_bytes; - - info_.maxMemAllocSize_ = - std::max(std::min(cl_ulong(1 * Gi), info_.globalMemSize_ / 4), - cl_ulong(128 * Mi)); - - // Make sure the max allocation size is not larger than the available - // memory size. - info_.maxMemAllocSize_ = - std::min(info_.maxMemAllocSize_, info_.globalMemSize_); - } - else { - // The HSA device backend does not have local memory, so we use system - // memory as default. - info_.globalMemSize_ = Os::getPhysicalMemSize(); - if (info_.globalMemSize_ == 0) { - return false; - } - - // Cap global memory -#if defined (_LP64) - // Cap at 8TiB for 64-bit - const cl_ulong maxGlobalMemSize = 8ULL * Ki * Gi; -#elif defined (_WIN32) - // Cap at 2GiB (see http://msdn.microsoft.com/en-us/library/aa366778.aspx) - const cl_ulong maxGlobalMemSize = 2ULL * Gi; -#else // linux - // Cap at 3.5GiB - const cl_ulong maxGlobalMemSize = 3584ULL * Mi; -#endif - info_.globalMemSize_ = std::min(info_.globalMemSize_, maxGlobalMemSize); - - info_.maxMemAllocSize_ = - info_.globalMemSize_ * CPU_MAX_ALLOC_PERCENT / 100; - if (flagIsDefault(CPU_MAX_ALLOC_PERCENT)) { - const cl_ulong minAllocSize = LP64_SWITCH(1ULL * Gi, 2ULL * Gi); - info_.maxMemAllocSize_ = std::max(info_.maxMemAllocSize_, - std::min(info_.globalMemSize_, minAllocSize)); - } - } - - /*make sure we don't run anything over 8 params for now*/ - info_.maxParameterSize_ = 1024; // [TODO]: CAL stack values: 1024* - // constant - info_.maxWorkItemSizes_[0] = 256; - info_.maxWorkItemSizes_[1] = 256; - info_.maxWorkItemSizes_[2] = 256; - - info_.nativeVectorWidthChar_ = info_.preferredVectorWidthChar_ = 4; - info_.nativeVectorWidthShort_ = info_.preferredVectorWidthShort_ = 2; - info_.nativeVectorWidthInt_ = info_.preferredVectorWidthInt_ = 1; - info_.nativeVectorWidthLong_ = info_.preferredVectorWidthLong_ = 1; - info_.nativeVectorWidthFloat_ = info_.preferredVectorWidthFloat_ = 1; - - info_.localMemSize_ = 32 * 1024; - info_.hostUnifiedMemory_ = CL_TRUE; - info_.memBaseAddrAlign_ = 8 * (flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ? - sizeof(cl_long16) : MEMOBJ_BASE_ADDR_ALIGN); - info_.minDataTypeAlignSize_ = sizeof(cl_long16); - - info_.maxConstantArgs_ = 8; - info_.maxConstantBufferSize_ = 64 * 1024; - info_.localMemType_ = CL_LOCAL; - info_.errorCorrectionSupport_ = false; - info_.profilingTimerResolution_ = 1; - info_.littleEndian_ = true; - info_.compilerAvailable_ = true; - info_.executionCapabilities_ = CL_EXEC_KERNEL; - info_.queueProperties_ = CL_QUEUE_PROFILING_ENABLE; - info_.platform_ = AMD_PLATFORM; - info_.profile_ = "FULL_PROFILE"; - strcpy(info_.vendor_, "Advanced Micro Devices, Inc."); - - info_.addressBits_ = LP64_SWITCH(32, 64); - info_.maxSamplers_ = 16; - info_.maxReadImageArgs_ = 128; - info_.maxWriteImageArgs_ = 8; - info_.maxReadWriteImageArgs_ = 64; - info_.image2DMaxWidth_ = 16 * 1024; - info_.image2DMaxHeight_ = 16 * 1024; - info_.image3DMaxWidth_ = 2 * 1024; - info_.image3DMaxHeight_ = 2 * 1024; - info_.image3DMaxDepth_ = 2 * 1024; - info_.imageMaxArraySize_ = 2 * 1024; - info_.imageMaxBufferSize_ = 64 * 1024; - info_.imagePitchAlignment_ = 256; - info_.imageBaseAddressAlignment_ = 256; - info_.imageMaxArraySize_ = 2048; - info_.imageMaxBufferSize_ = 65536; - info_.bufferFromImageSupport_ = CL_TRUE; - info_.oclcVersion_ = "OpenCL C " OPENCL_VERSION_STR " "; - std::string driverVersion = AMD_BUILD_STRING; - driverVersion.append(" (HSA)"); - strcpy(info_.driverVersion_, driverVersion.c_str()); - info_.version_ = "OpenCL " OPENCL_VERSION_STR " "; - - info_.builtInKernels_ = ""; - info_.linkerAvailable_ = true; - info_.preferredInteropUserSync_ = true; - info_.printfBufferSize_ = 1000 * 1024; - info_.vendorId_ = 0x1002; // from gpudevice - - info_.maxGlobalVariableSize_ = static_cast(info_.maxMemAllocSize_); - info_.globalVariablePreferredTotalSize_ = - static_cast(info_.globalMemSize_); - return true; -} - -device::VirtualDevice* -Device::createVirtualDevice(amd::CommandQueue* queue) -{ - bool interopQueue = (queue != NULL) && - (0 != (queue->context().info().flags_ & - (amd::Context::GLDeviceKhr | - amd::Context::D3D10DeviceKhr | - amd::Context::D3D11DeviceKhr))); - - // Initialization of heap and other resources occur during the command - // queue creation time. - HsaQueueType type = kHsaQueueTypeCompute; - if (interopQueue) { - type = kHsaQueueTypeInterop; - } - - VirtualGPU *virtualDevice = new VirtualGPU(*this); - - if (!virtualDevice->create(type)) { - delete virtualDevice; - virtualDevice = NULL; - } - - return virtualDevice; -} - -bool -Device::globalFreeMemory(size_t *freeMemory) const -{ - return false; -} - -bool -Device::bindExternalDevice( - intptr_t type, - void* gfxDevice, - void* gfxContext, - bool validateOnly) -{ - switch (type) { -#ifdef _WIN32 - case CL_CONTEXT_D3D10_DEVICE_KHR: - if (kHsaStatusSuccess != hsacoreapi->HsaBeginD3D10Interop( - _bkendDevice, reinterpret_cast(gfxDevice))) { - LogError("Failed HsaBeginD3D10Interop()"); - return false; - } - break; - case CL_CONTEXT_D3D11_DEVICE_KHR: - if (kHsaStatusSuccess != hsacoreapi->HsaBeginD3D11Interop( - _bkendDevice, reinterpret_cast(gfxDevice))) { - LogError("Failed HsaBeginD3D11Interop()"); - return false; - } - break; -#endif // _WIN32 - case CL_GL_CONTEXT_KHR: - if (kHsaStatusSuccess != hsacoreapi->HsaBeginGLInterop( - _bkendDevice, reinterpret_cast(gfxContext))) { - LogError("Failed HsaBeginGLInterop()"); - return false; - } - break; - default: - LogError("Unknown external device!"); - return false; - } - - if (validateOnly) { - return unbindExternalDevice(type, gfxDevice, gfxContext, validateOnly); - } - return true; -} - -bool -Device::unbindExternalDevice( - intptr_t type, - void* gfxDevice, - void* gfxContext, - bool validateOnly) -{ - switch (type) { -#ifdef _WIN32 - case CL_CONTEXT_D3D10_DEVICE_KHR: - if (kHsaStatusSuccess != hsacoreapi->HsaEndD3D10Interop( - _bkendDevice, reinterpret_cast(gfxDevice))) { - LogError("Failed HsaEndD3D10Interop()"); - return false; - } - break; - case CL_CONTEXT_D3D11_DEVICE_KHR: - if (kHsaStatusSuccess != hsacoreapi->HsaEndD3D11Interop( - _bkendDevice, reinterpret_cast(gfxDevice))) { - LogError("Failed HsaEndD3D11Interop()"); - return false; - } - break; -#endif // _WIN32 - case CL_GL_CONTEXT_KHR: - if (kHsaStatusSuccess != hsacoreapi->HsaEndGLInterop( - _bkendDevice, reinterpret_cast(gfxContext))) { - LogError("Failed HsaEndGLInterop()"); - return false; - } - break; - default: - LogError("Unknown external device!"); - return false; - } - - return true; -} - -device::Memory* -Device::createMemory(amd::Memory &owner) const -{ - oclhsa::Memory* memory = NULL; - - if (owner.asBuffer()) { - memory = new oclhsa::Buffer(*this, owner); - } - else if (owner.asImage()) { - memory = new oclhsa::Image(*this, owner); - } - else { - LogError("Unknown memory type"); - } - - if (memory == NULL) { - return NULL; - } - - bool result = false; - if (owner.isInterop() && (owner.parent() == NULL)) { - result = memory->createInterop(); - } - else { - result = memory->create(); - } - - if (!result) { - delete memory; - return NULL; - } - - if (!memory->isHostMemDirectAccess() && owner.asImage() && - owner.parent() == NULL && - (owner.getMemFlags() & - (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR))) { - // To avoid recurssive call to Device::createMemory, we perform - // data transfer to the view of the image. - amd::Image *imageView = - owner.asImage()->createView( - owner.getContext(), owner.asImage()->getImageFormat(), xferQueue()); - - if (imageView == NULL) { - LogError("[OCL] Fail to allocate view of image object"); - return NULL; - } - - Image* devImageView = - new oclhsa::Image(static_cast(*this), *imageView); - if (devImageView == NULL) { - LogError("[OCL] Fail to allocate device mem object for the view"); - imageView->release(); - return NULL; - } - - if (devImageView != NULL && - !devImageView->createView(static_cast(*memory))) { - LogError("[OCL] Fail to create device mem object for the view"); - delete devImageView; - imageView->release(); - return NULL; - } - - imageView->replaceDeviceMemory(this, devImageView); - - result = xferMgr().writeImage( - owner.getHostMem(), - *devImageView, - amd::Coord3D(0), - imageView->getRegion(), - imageView->getRowPitch(), - imageView->getSlicePitch(), - true); - - imageView->release(); - } - - if (!result) { - delete memory; - return NULL; - } - - return memory; -} - -void* -Device::hostAlloc(size_t size, size_t alignment, bool atomics) const -{ - void* ret; - alignment = std::max(alignment, static_cast(info_.memBaseAddrAlign_)); - assert(amd::isMultipleOf(alignment, info_.memBaseAddrAlign_)); - HsaAmdSystemMemoryType type = amd::Is64Bits() && atomics - ? kHsaAmdSystemMemoryTypeCoherent : kHsaAmdSystemMemoryTypeDefault; - hsacoreapi->HsaAmdAllocateSystemMemory(size, alignment, type, &ret); - return ret; -} - -void -Device::hostFree(void* ptr, size_t size) const -{ - hsacoreapi->HsaAmdFreeSystemMemory(ptr); -} - -void* -Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const -{ - bool atomics = (flags & CL_MEM_SVM_ATOMICS) != 0; - return hostAlloc(size, alignment, atomics); -} - -void -Device::svmFree(void* ptr) const -{ - hostFree(ptr); -} - -VirtualGPU* -Device::xferQueue() const -{ - if (!xferQueue_) { - // Create virtual device for internal memory transfer - Device* thisDevice = const_cast(this); - thisDevice->xferQueue_ = reinterpret_cast( - thisDevice->createVirtualDevice()); - if (!xferQueue_) { - LogError("Couldn't create the device transfer manager!"); - } - } - return xferQueue_; -} - -} -#endif // WITHOUT_FSA_BACKEND diff --git a/rocclr/runtime/device/hsa/hsadevice.hpp b/rocclr/runtime/device/hsa/hsadevice.hpp deleted file mode 100644 index 30cfc9fcf5..0000000000 --- a/rocclr/runtime/device/hsa/hsadevice.hpp +++ /dev/null @@ -1,334 +0,0 @@ -// -// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved. -// - -#ifndef _OPENCL_RUNTIME_DEVICE_HSA_HSADEVICE_HPP_ -#define _OPENCL_RUNTIME_DEVICE_HSA_HSADEVICE_HPP_ - -#ifndef WITHOUT_FSA_BACKEND - -#include "top.hpp" -#include "device/device.hpp" -#include "platform/command.hpp" -#include "platform/program.hpp" -#include "platform/perfctr.hpp" -#include "platform/memory.hpp" -#include "utils/concurrent.hpp" -#include "thread/thread.hpp" -#include "thread/monitor.hpp" -#include "utils/versions.hpp" -#include "aclTypes.h" - -#include "device/hsa/hsasettings.hpp" -#include "device/hsa/hsavirtual.hpp" -#include "device/hsa/hsadefs.hpp" - -#include "newcore.h" - -#include - -// extern hsa::Runtime* g_hsaruntime; - -/*! \addtogroup HSA - * @{ - */ - -//! HSA Device Implementation -namespace oclhsa { - -/** - * @brief List of environment variables that could be used to - * configure the behavior of Hsa Runtime - */ -#define ENVVAR_HSA_POLL_KERNEL_COMPLETION "HSA_POLL_COMPLETION" - -//! Forward declarations -class Command; -class Device; -class GpuCommand; -class Heap; -class HeapBlock; -class Program; -class Kernel; -class Memory; -class Resource; -class VirtualDevice; -class PrintfDbg; - -//A NULL Device type used only for offline compilation -// Only functions that are used for compilation will be in this device -class NullDevice : public amd::Device { -public: - //! constructor - NullDevice(){}; - - //!create the device - bool create(const AMDDeviceInfo& deviceInfo); - - //! Initialise all the offline devices that can be used for compilation - static bool init(); - //! Teardown for offline devices - static void tearDown(); - - //! Destructor for the Null device - virtual ~NullDevice(); - - aclCompiler *compiler() const { return compilerHandle_; } - - //! Construct an HSAIL program object from the ELF assuming it is valid - virtual device::Program *createProgram(bool hsail = false); - - const AMDDeviceInfo& deviceInfo() const { - return deviceInfo_; - } - //! Gets the backend device for the NULL device type - virtual const HsaDevice* getBackendDevice() const { - ShouldNotReachHere(); - return NULL; - } - - //List of dummy functions which are disabled for NullDevice - - //! Create sub-devices according to the given partition scheme. - virtual cl_int createSubDevices( - device::CreateSubDevicesInfo& create_info, - cl_uint num_entries, - cl_device_id* devices, - cl_uint* num_devices) { - ShouldNotReachHere(); - return CL_INVALID_VALUE; }; - - //! Create a new virtual device environment. - virtual device::VirtualDevice* createVirtualDevice( - amd::CommandQueue* queue = NULL) { return NULL; } - - virtual bool registerSvmMemory(void* ptr, size_t size) const { - ShouldNotReachHere(); - return false; - } - - virtual void deregisterSvmMemory(void* ptr) const { - ShouldNotReachHere(); - } - - //! Just returns NULL for the dummy device - virtual device::Memory* createMemory(amd::Memory& owner) const { - ShouldNotReachHere(); - return NULL; } - - //! Sampler object allocation - virtual bool createSampler( - const amd::Sampler& owner, //!< abstraction layer sampler object - device::Sampler** sampler //!< device sampler object - ) const - { - ShouldNotReachHere(); - return true; - } - - //! Just returns NULL for the dummy device - virtual device::Memory* createView( - amd::Memory& owner, //!< Owner memory object - const device::Memory& parent //!< Parent device memory object for the view - ) const { - ShouldNotReachHere(); - return NULL; - } - - //! Just returns NULL for the dummy device - virtual void* svmAlloc( - amd::Context& context, //!< The context used to create a buffer - size_t size, //!< size of svm spaces - size_t alignment, //!< alignment requirement of svm spaces - cl_svm_mem_flags flags, //!< flags of creation svm spaces - void* svmPtr //!< existing svm pointer for mGPU case - ) const { - ShouldNotReachHere(); - return NULL; - } - - //! Just returns NULL for the dummy device - virtual void svmFree( - void* ptr //!< svm pointer needed to be freed - ) const { - ShouldNotReachHere(); - return; - } - - //! Reallocates the provided buffer object - virtual bool reallocMemory(amd::Memory& owner) const { - ShouldNotReachHere(); - return false; - } - - //! Acquire external graphics API object in the host thread - //! Needed for OpenGL objects on CPU device - - virtual bool bindExternalDevice( - intptr_t type, void* pDevice, void* pContext, bool validateOnly) { - ShouldNotReachHere(); - return false; - } - - virtual bool unbindExternalDevice( - intptr_t type, void* pDevice, void* pContext, bool validateOnly) { - ShouldNotReachHere(); - return false; - } - - //! Releases non-blocking map target memory - virtual void freeMapTarget(amd::Memory& mem, void* target) { ShouldNotReachHere();} - - //! Empty implementation on Null device - virtual bool globalFreeMemory(size_t* freeMemory) const { - ShouldNotReachHere(); - return false; - } - -protected: - //! Initialize compiler instance and handle - static bool initCompiler(bool isOffline); - //! destroy compiler instance and handle - static bool destroyCompiler(); - //! Handle to the the compiler - static aclCompiler* compilerHandle_; - //! Device Id for an HsaDevice - AMDDeviceInfo deviceInfo_; -private: - static const bool offlineDevice_; -}; - -//! A HSA device ordinal (physical HSA device) -class Device : public NullDevice { -public: - //! Initialise the whole HSA device subsystem (CAL init, device enumeration, etc). - static bool init(); - static void tearDown(); - - static bool loadHsaModules(); - - bool create(); - - //! Construct a new physical HSA device - Device(const HsaDevice *bkendDevice); - virtual const HsaDevice *getBackendDevice() const - { - return (_bkendDevice); - } - - //! Destructor for the physical HSA device - virtual ~Device(); - - bool mapHSADeviceToOpenCLDevice(const HsaDevice *hsadevice); - - // Temporary, delete it later when HSA Runtime and KFD is fully fucntional. - void fake_device(); - - /////////////////////////////////////////////////////////////////////////////// - // TODO: Below are all mocked up virtual functions from amd::Device, they may - // need real implementation. - /////////////////////////////////////////////////////////////////////////////// - -// #ifdef cl_ext_device_fission - //! Create sub-devices according to the given partition scheme. - virtual cl_int createSubDevices( - device::CreateSubDevicesInfo &create_inf, - cl_uint num_entries, - cl_device_id *devices, - cl_uint *num_devices) - { return CL_INVALID_VALUE; } -// #endif // cl_ext_device_fission - - // bool Device::create(CALuint ordinal); - - //! Instantiate a new virtual device - virtual device::VirtualDevice *createVirtualDevice( - amd::CommandQueue* queue = NULL); - - //! Construct an HSAIL program object from the ELF assuming it is valid - virtual device::Program *createProgram(bool hsail = false); - - virtual device::Memory *createMemory(amd::Memory &owner) const; - - //! Sampler object allocation - virtual bool createSampler( - const amd::Sampler& owner, //!< abstraction layer sampler object - device::Sampler** sampler //!< device sampler object - ) const - { - //! \todo HSA team has to implement sampler allocation - *sampler = NULL; - return true; - } - - - //! Just returns NULL for the dummy device - virtual device::Memory *createView( - amd::Memory &owner, //!< Owner memory object - const device::Memory &parent //!< Parent device memory object for the view - ) const { return NULL; } - - //! Reallocates the provided buffer object - virtual bool reallocMemory(amd::Memory &owner) const {return true; } - - //! Acquire external graphics API object in the host thread - //! Needed for OpenGL objects on CPU device - virtual bool bindExternalDevice( - intptr_t type, void *pDevice, void *pContext, bool validateOnly); - - /** - * @brief Removes the external device as an available device. - * - * @note: The current implementation is to avoid build break - * and does not represent actual / correct implementation. This - * needs to be done. - */ - bool unbindExternalDevice( - intptr_t type, //!< Enum val. for ext.API type: GL, D3D10, etc. - void *gfxDevice, //!< D3D device do D3D, HDC/Display handle of X Window for GL - void *gfxContext, //!< HGLRC/GLXContext handle - bool validateOnly //!< Only validate if the device can inter-operate with - //!< pDevice/pContext, do not bind. - ); - - //! Gets free memory on a GPU device - virtual bool globalFreeMemory(size_t *freeMemory) const; - - virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const; - - virtual void hostFree(void* ptr, size_t size = 0) const; - - virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags = CL_MEM_READ_WRITE, void* svmPtr = NULL) const; - - virtual void svmFree(void* ptr) const; - - //! Returns a OCLHSA memory object from AMD memory object - oclhsa::Memory* getOclHsaMemory( - amd::Memory* mem //!< Pointer to AMD memory object - ) const; - - const Settings &settings() const { return reinterpret_cast(*settings_); } - - //! Returns transfer engine object - const device::BlitManager& xferMgr() const { return xferQueue()->blitMgr();} - -private: - bool populateOCLDeviceConstants(); - - cl_device_svm_capabilities getSvmCapabilities(const HsaDevice* device); - - VirtualGPU* xferQueue() const; - - static bool isHsaInitialized_; - const HsaDevice *_bkendDevice; - static const bool offlineDevice_; - amd::Context *context_; //!< A dummy context for internal data transfer - VirtualGPU *xferQueue_; //!< Transfer queue, created on demand -}; // class oclhsa::Device -} // namespace oclhsa - -/** - * @} - */ -#endif /*WITHOUT_FSA_BACKEND*/ -#endif /*HSA_HPP_*/ diff --git a/rocclr/runtime/device/hsa/hsakernel.cpp b/rocclr/runtime/device/hsa/hsakernel.cpp deleted file mode 100644 index 844d28f646..0000000000 --- a/rocclr/runtime/device/hsa/hsakernel.cpp +++ /dev/null @@ -1,573 +0,0 @@ -// -// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved. -// - -#include "device/hsa/hsakernel.hpp" - -#include "device/hsa/oclhsa_common.hpp" - -#include - -#ifndef WITHOUT_FSA_BACKEND - -namespace oclhsa { - -inline static HSAIL_ARG_TYPE -GetHSAILArgType(const aclArgData* argInfo) -{ - switch (argInfo->type) { - case ARG_TYPE_POINTER: - return HSAIL_ARGTYPE_POINTER; - case ARG_TYPE_VALUE: - return HSAIL_ARGTYPE_VALUE; - case ARG_TYPE_IMAGE: - return HSAIL_ARGTYPE_IMAGE; - case ARG_TYPE_SAMPLER: - return HSAIL_ARGTYPE_SAMPLER; - case ARG_TYPE_ERROR: - default: - return HSAIL_ARGTYPE_ERROR; - } -} - -inline static size_t -GetHSAILArgAlignment(const aclArgData* argInfo) -{ - switch (argInfo->type) { - case ARG_TYPE_POINTER: - return argInfo->arg.pointer.align; - default: - return 1; - } -} - -inline static HSAIL_ADDRESS_QUALIFIER -GetHSAILAddrQual(const aclArgData* argInfo) -{ - if (argInfo->type == ARG_TYPE_POINTER) { - switch (argInfo->arg.pointer.memory) { - case PTR_MT_CONSTANT_EMU: - case PTR_MT_CONSTANT: - case PTR_MT_UAV: - case PTR_MT_GLOBAL: - return HSAIL_ADDRESS_GLOBAL; - case PTR_MT_LDS_EMU: - case PTR_MT_LDS: - return HSAIL_ADDRESS_LOCAL; - case PTR_MT_ERROR: - default: - LogError("Unsupported address type"); - return HSAIL_ADDRESS_ERROR; - } - } - else if ((argInfo->type == ARG_TYPE_IMAGE) || - (argInfo->type == ARG_TYPE_SAMPLER)) { - return HSAIL_ADDRESS_GLOBAL; - } - return HSAIL_ADDRESS_ERROR; -} - -/* f16 returns f32 - workaround due to comp lib */ -inline static HSAIL_DATA_TYPE -GetHSAILDataType(const aclArgData* argInfo) -{ - aclArgDataType dataType; - - if (argInfo->type == ARG_TYPE_POINTER) { - dataType = argInfo->arg.pointer.data; - } - else if (argInfo->type == ARG_TYPE_VALUE) { - dataType = argInfo->arg.value.data; - } - else { - return HSAIL_DATATYPE_ERROR; - } - switch (dataType) { - case DATATYPE_i1: - return HSAIL_DATATYPE_B1; - case DATATYPE_i8: - return HSAIL_DATATYPE_S8; - case DATATYPE_i16: - return HSAIL_DATATYPE_S16; - case DATATYPE_i32: - return HSAIL_DATATYPE_S32; - case DATATYPE_i64: - return HSAIL_DATATYPE_S64; - case DATATYPE_u8: - return HSAIL_DATATYPE_U8; - case DATATYPE_u16: - return HSAIL_DATATYPE_U16; - case DATATYPE_u32: - return HSAIL_DATATYPE_U32; - case DATATYPE_u64: - return HSAIL_DATATYPE_U64; - case DATATYPE_f16: - return HSAIL_DATATYPE_F32; - case DATATYPE_f32: - return HSAIL_DATATYPE_F32; - case DATATYPE_f64: - return HSAIL_DATATYPE_F64; - case DATATYPE_struct: - return HSAIL_DATATYPE_STRUCT; - case DATATYPE_opaque: - return HSAIL_DATATYPE_OPAQUE; - case DATATYPE_ERROR: - default: - return HSAIL_DATATYPE_ERROR; - } -} - -// returns size in number of bytes -inline static int -GetHSAILArgSize(const aclArgData *argInfo) -{ - switch (argInfo->type) { - case ARG_TYPE_VALUE: - switch (GetHSAILDataType(argInfo)) { - case HSAIL_DATATYPE_B1: - return 1; - case HSAIL_DATATYPE_B8: - case HSAIL_DATATYPE_S8: - case HSAIL_DATATYPE_U8: - return 1; - case HSAIL_DATATYPE_B16: - case HSAIL_DATATYPE_U16: - case HSAIL_DATATYPE_S16: - case HSAIL_DATATYPE_F16: - return 2; - case HSAIL_DATATYPE_B32: - case HSAIL_DATATYPE_U32: - case HSAIL_DATATYPE_S32: - case HSAIL_DATATYPE_F32: - return 4; - case HSAIL_DATATYPE_B64: - case HSAIL_DATATYPE_U64: - case HSAIL_DATATYPE_S64: - case HSAIL_DATATYPE_F64: - return 8; - case HSAIL_DATATYPE_STRUCT: - return argInfo->arg.value.numElements; - default: - return -1; - } - case ARG_TYPE_POINTER: - case ARG_TYPE_IMAGE: - case ARG_TYPE_SAMPLER: - return sizeof(void*); - default: - return -1; - } -} - -inline static clk_value_type_t -GetOclType(const aclArgData* argInfo) -{ - static const clk_value_type_t ClkValueMapType[6][6] = { - { T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 }, - { T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 }, - { T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 }, - { T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 }, - { T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 }, - { T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 }, - }; - - uint sizeType; - if ((argInfo->type == ARG_TYPE_POINTER) || (argInfo->type == ARG_TYPE_IMAGE)) { - return T_POINTER; - } - else if (argInfo->type == ARG_TYPE_VALUE) { - switch (argInfo->arg.value.data) { - case DATATYPE_i8: - case DATATYPE_u8: - sizeType = 0; - break; - case DATATYPE_i16: - case DATATYPE_u16: - sizeType = 1; - break; - case DATATYPE_i32: - case DATATYPE_u32: - sizeType = 2; - break; - case DATATYPE_i64: - case DATATYPE_u64: - sizeType = 3; - break; - case DATATYPE_f16: - case DATATYPE_f32: - sizeType = 4; - break; - case DATATYPE_f64: - sizeType = 5; - break; - default: - return T_VOID; - } - switch (argInfo->arg.value.numElements) { - case 1: return ClkValueMapType[sizeType][0]; - case 2: return ClkValueMapType[sizeType][1]; - case 3: return ClkValueMapType[sizeType][2]; - case 4: return ClkValueMapType[sizeType][3]; - case 8: return ClkValueMapType[sizeType][4]; - case 16: return ClkValueMapType[sizeType][5]; - default: return T_VOID; - } - } - else if (argInfo->type == ARG_TYPE_SAMPLER) { - return T_SAMPLER; - } - else { - return T_VOID; - } -} - -inline static cl_kernel_arg_address_qualifier -GetOclAddrQual(const aclArgData* argInfo) -{ - if (argInfo->type == ARG_TYPE_POINTER) { - switch (argInfo->arg.pointer.memory) { - case PTR_MT_UAV: - case PTR_MT_GLOBAL: - return CL_KERNEL_ARG_ADDRESS_GLOBAL; - case PTR_MT_CONSTANT: - case PTR_MT_UAV_CONSTANT: - case PTR_MT_CONSTANT_EMU: - return CL_KERNEL_ARG_ADDRESS_CONSTANT; - case PTR_MT_LDS_EMU: - case PTR_MT_LDS: - return CL_KERNEL_ARG_ADDRESS_LOCAL; - default: - return CL_KERNEL_ARG_ADDRESS_PRIVATE; - } - } - else if (argInfo->type == ARG_TYPE_IMAGE) { - return CL_KERNEL_ARG_ADDRESS_GLOBAL; - } - //default for all other cases - return CL_KERNEL_ARG_ADDRESS_PRIVATE; -} - -inline static cl_kernel_arg_access_qualifier -GetOclAccessQual(const aclArgData* argInfo) -{ - if (argInfo->type == ARG_TYPE_IMAGE) { - switch (argInfo->arg.image.type) { - case ACCESS_TYPE_RO: - return CL_KERNEL_ARG_ACCESS_READ_ONLY; - case ACCESS_TYPE_WO: - return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; - case ACCESS_TYPE_RW: - return CL_KERNEL_ARG_ACCESS_READ_WRITE; - default: - return CL_KERNEL_ARG_ACCESS_NONE; - } - } - return CL_KERNEL_ARG_ACCESS_NONE; -} - -inline static cl_kernel_arg_type_qualifier -GetOclTypeQual(const aclArgData* argInfo) -{ - cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; - if (argInfo->type == ARG_TYPE_POINTER) { - if (argInfo->arg.pointer.isVolatile) { - rv |= CL_KERNEL_ARG_TYPE_VOLATILE; - } - if (argInfo->arg.pointer.isRestrict) { - rv |= CL_KERNEL_ARG_TYPE_RESTRICT; - } - if (argInfo->isConst) { - rv |= CL_KERNEL_ARG_TYPE_CONST; - } - switch (argInfo->arg.pointer.memory) { - case PTR_MT_CONSTANT: - case PTR_MT_UAV_CONSTANT: - case PTR_MT_CONSTANT_EMU: - rv |= CL_KERNEL_ARG_TYPE_CONST; - break; - default: - break; - } - } - return rv; -} - -static int -GetOclSize(const aclArgData* argInfo) -{ - switch (argInfo->type) { - case ARG_TYPE_POINTER: return sizeof(void *); - case ARG_TYPE_VALUE: - switch (argInfo->arg.value.data) { - case DATATYPE_i8: - case DATATYPE_u8: - case DATATYPE_struct: - return 1 * argInfo->arg.value.numElements; - case DATATYPE_u16: - case DATATYPE_i16: - case DATATYPE_f16: - return 2 * argInfo->arg.value.numElements; - case DATATYPE_u32: - case DATATYPE_i32: - case DATATYPE_f32: - return 4 * argInfo->arg.value.numElements; - case DATATYPE_i64: - case DATATYPE_u64: - case DATATYPE_f64: - return 8 * argInfo->arg.value.numElements; - case DATATYPE_ERROR: - default: return -1; - } - case ARG_TYPE_IMAGE: return sizeof(cl_mem); - case ARG_TYPE_SAMPLER: return sizeof(cl_sampler); - default: return -1; - } -} - -KernelArg::KernelArg(aclArgData *argInfo) { - argInfo_ = argInfo; - name_ = argInfo_->argStr; - typeName_ = argInfo->typeStr; -} - -int KernelArg::size() { - switch (argInfo_->type) { - case ARG_TYPE_POINTER: { - return sizeof(void *); - } - case ARG_TYPE_VALUE: { - switch (argInfo_->arg.value.data) { - case DATATYPE_ERROR: { - return -1; - } - case DATATYPE_i8: - case DATATYPE_u8: - case DATATYPE_struct: { - return 1 * argInfo_->arg.value.numElements; - } - case DATATYPE_u16: - case DATATYPE_i16: - case DATATYPE_f16: { - return 2 * argInfo_->arg.value.numElements; - } - case DATATYPE_u32: - case DATATYPE_i32: - case DATATYPE_f32: { - return 4 * argInfo_->arg.value.numElements; - } - case DATATYPE_i64: - case DATATYPE_u64: - case DATATYPE_f64: { - return 8 * argInfo_->arg.value.numElements; - } - default: - return -1; - } - } - case ARG_TYPE_IMAGE: { - return sizeof(cl_mem); - } - case ARG_TYPE_SAMPLER: { - return sizeof(cl_sampler); - } - default: - return -1; - } -} - -std::string& KernelArg::name() { - return name_; -} - -std::string& KernelArg::typeName() -{ - return typeName_; -} - -void -Kernel::initArgList(const aclArgData* aclArg) -{ - // Initialize the hsail argument list too - initHsailArgs(aclArg); - - // Iterate through the arguments and insert into parameterList - device::Kernel::parameters_t params; - amd::KernelParameterDescriptor desc; - size_t offset = 0; - - // Reserved arguments for HSAIL launch - aclArg += ExtraArguments; - for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) { - desc.name_ = hsailArgList_[i]->name_.c_str(); - desc.type_ = GetOclType(aclArg); - desc.addressQualifier_ = GetOclAddrQual(aclArg); - desc.accessQualifier_ = GetOclAccessQual(aclArg); - desc.typeQualifier_ = GetOclTypeQual(aclArg); - desc.typeName_ = hsailArgList_[i]->typeName_.c_str(); - - // Make a check if it is local or global - if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { - desc.size_ = 0; - } - else { - desc.size_ = GetOclSize(aclArg); - } - - // Make offset alignment to match CPU metadata, since - // in multidevice config abstraction layer has a single signature - // and CPU sends the paramaters as they are allocated in memory - size_t size = desc.size_; - if (size == 0) { - // Local memory for CPU - size = sizeof(cl_mem); - } - offset = amd::alignUp(offset, std::min(size, size_t(16))); - desc.offset_ = offset; - offset += amd::alignUp(size, sizeof(uint32_t)); - params.push_back(desc); - } - createSignature(params); -} - -void -Kernel::initHsailArgs(const aclArgData* aclArg) -{ - int offset = 0; - - // Reserved arguments for HSAIL launch - aclArg += ExtraArguments; - - // Iterate through the each kernel argument - for (; aclArg->struct_size != 0; aclArg++) { - HsailKernelArg* arg = new HsailKernelArg; - // Initialize HSAIL kernel argument - arg->name_ = aclArg->argStr; - arg->typeName_ = aclArg->typeStr; - arg->size_ = GetHSAILArgSize(aclArg); - arg->offset_ = offset; - arg->type_ = GetHSAILArgType(aclArg); - arg->addrQual_ = GetHSAILAddrQual(aclArg); - arg->dataType_ = GetHSAILDataType(aclArg); - // If vector of args we add additional arguments to flatten it out - arg->numElem_ = ((aclArg->type == ARG_TYPE_VALUE) && - (aclArg->arg.value.data != DATATYPE_struct)) ? - aclArg->arg.value.numElements : 1; - arg->alignment_ = GetHSAILArgAlignment(aclArg); - offset += GetHSAILArgSize(aclArg); - hsailArgList_.push_back(arg); - } -} - -Kernel::Kernel(std::string name, - FSAILProgram* prog, - HsaBrig* brig, - std::string compileOptions): - device::Kernel(name), - program_(prog), - compileOptions_(compileOptions), - brig_(brig), - kernelCode_(NULL), - debugInfo_(NULL){ -} - -bool Kernel::init(){ - acl_error errorCode; - //compile kernel down to ISA - const HsaDevice *hsaDevice = program_->hsaDevice(); - std::string openClKernelName("&__OpenCL_" + name() + "_kernel"); - HsaStatus status = hsacoreapi->HsaFinalizeBrig( - hsaDevice, brig_, - openClKernelName.c_str(), - compileOptions_.c_str(), - &kernelCode_, - &debugInfo_); - if (status != kHsaStatusSuccess) { - return false; - } - // Pull out metadata from the ELF - size_t sizeOfArgList; - aclCompiler* compileHandle = program_->dev().compiler(); - errorCode = g_complibApi._aclQueryInfo(compileHandle, - program_->binaryElf(), - RT_ARGUMENT_ARRAY, - openClKernelName.c_str(), - NULL, - &sizeOfArgList); - if (errorCode != ACL_SUCCESS) { - return false; - } - char *argList = (char *)malloc(sizeOfArgList); - errorCode = g_complibApi._aclQueryInfo(compileHandle, - program_->binaryElf(), - RT_ARGUMENT_ARRAY, - openClKernelName.c_str(), - argList, - &sizeOfArgList); - if (errorCode != ACL_SUCCESS) { - return false; - } - //Set the argList - initArgList((const aclArgData *) argList); - - //Pull out amdKernelInfo - HsaKernelAmdInfo kernelAmdInfo; - status = servicesapi->HsaGetKernelAmdInfo(kernelCode_, &kernelAmdInfo); - if (status != kHsaStatusSuccess) { - return false; - } - HsaDeviceAmdInfo devInfo; - status = servicesapi->HsaGetDeviceAmdInfo(hsaDevice, &devInfo); - if (status != kHsaStatusSuccess) { - return false; - } - //Set the workgroup information for the kernel - memset(&workGroupInfo_, 0, sizeof(workGroupInfo_)); - workGroupInfo_.availableLDSSize_ = hsaDevice->group_memory_size; - workGroupInfo_.availableSGPRs_ = devInfo.max_number_of_sgprs; - workGroupInfo_.availableVGPRs_ = devInfo.max_number_of_vgprs; - size_t sizeOfWorkGroupSize; - errorCode = g_complibApi._aclQueryInfo(compileHandle, - program_->binaryElf(), - RT_WORK_GROUP_SIZE, - openClKernelName.c_str(), - NULL, - &sizeOfWorkGroupSize); - if (errorCode != ACL_SUCCESS) { - return false; - } - errorCode = g_complibApi._aclQueryInfo(compileHandle, - program_->binaryElf(), - RT_WORK_GROUP_SIZE, - openClKernelName.c_str(), - workGroupInfo_.compileSize_, - &sizeOfWorkGroupSize); - if (errorCode != ACL_SUCCESS) { - return false; - } - //Setting it the same as used LDS - workGroupInfo_.localMemSize_ = kernelCode_->workgroup_group_segment_byte_size; - workGroupInfo_.privateMemSize_ = kernelCode_->workitem_private_segment_byte_size; - workGroupInfo_.usedLDSSize_ = kernelCode_->workgroup_group_segment_byte_size; - workGroupInfo_.preferredSizeMultiple_ = hsaDevice->wave_front_size; - workGroupInfo_.usedSGPRs_ = kernelAmdInfo.wave_front_sgpr_count; - workGroupInfo_.usedStackSize_ = 0; - workGroupInfo_.usedVGPRs_ = kernelAmdInfo.work_item_vgpr_count; - workGroupInfo_.wavefrontPerSIMD_ = hsaDevice->max_waves_per_simd; - workGroupInfo_.wavefrontSize_ = hsaDevice->wave_front_size; - //TODO: Need to populate it from the shader object - workGroupInfo_.size_ = 256; - return true; -} - -Kernel::~Kernel() { - while (!hsailArgList_.empty()) { - HsailKernelArg* kernelArgPointer = hsailArgList_.back(); - delete kernelArgPointer; - hsailArgList_.pop_back(); - } - hsacoreapi->HsaFreeKernelCode(kernelCode_); - hsacoreapi->HsaFreeKernelDebug(debugInfo_); -} - -} // namespace oclhsa -#endif // WITHOUT_FSA_BACKEND diff --git a/rocclr/runtime/device/hsa/hsakernel.hpp b/rocclr/runtime/device/hsa/hsakernel.hpp deleted file mode 100644 index e5d3af2477..0000000000 --- a/rocclr/runtime/device/hsa/hsakernel.hpp +++ /dev/null @@ -1,161 +0,0 @@ -// -// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved. -// -#ifndef HSAKERNEL_HPP_ -#define HSAKERNEL_HPP_ - -#include "acl.h" -#include "device/hsa/hsaprogram.hpp" -#include "newcore.h" -#include "top.hpp" - -#ifndef WITHOUT_FSA_BACKEND - -namespace oclhsa { - -#define MAX_INFO_STRING_LEN 0x40 -enum HSAIL_ADDRESS_QUALIFIER{ -HSAIL_ADDRESS_ERROR=0, -HSAIL_ADDRESS_GLOBAL, -HSAIL_ADDRESS_LOCAL, -HSAIL_MAX_ADDRESS_QUALIFIERS -} ; - -enum HSAIL_ARG_TYPE{ -HSAIL_ARGTYPE_ERROR=0, -HSAIL_ARGTYPE_POINTER, -HSAIL_ARGTYPE_VALUE, -HSAIL_ARGTYPE_IMAGE, -HSAIL_ARGTYPE_SAMPLER, -HSAIL_ARGMAX_ARG_TYPES -}; - -enum HSAIL_DATA_TYPE{ -HSAIL_DATATYPE_ERROR=0, -HSAIL_DATATYPE_B1, -HSAIL_DATATYPE_B8, -HSAIL_DATATYPE_B16, -HSAIL_DATATYPE_B32, -HSAIL_DATATYPE_B64, -HSAIL_DATATYPE_S8, -HSAIL_DATATYPE_S16, -HSAIL_DATATYPE_S32, -HSAIL_DATATYPE_S64, -HSAIL_DATATYPE_U8, -HSAIL_DATATYPE_U16, -HSAIL_DATATYPE_U32, -HSAIL_DATATYPE_U64, -HSAIL_DATATYPE_F16, -HSAIL_DATATYPE_F32, -HSAIL_DATATYPE_F64, -HSAIL_DATATYPE_STRUCT, -HSAIL_DATATYPE_OPAQUE, -HSAIL_DATATYPE_MAX_TYPES -}; - -struct HsailKernelArg -{ - std::string name_; //!< Argument's name - std::string typeName_; //!< Argument's type name - uint size_; //!< Size in bytes - uint offset_; //!< Argument's offset - uint alignment_; //!< Argument's alignment - HSAIL_ARG_TYPE type_; //!< Type of the argument - HSAIL_ADDRESS_QUALIFIER addrQual_; //!< Address qualifier of the argument - HSAIL_DATA_TYPE dataType_; //!< The type of data - uint numElem_; //!< Number of elements -}; - -class KernelArg -{ -public: - KernelArg(aclArgData* argInfo); - //! Return type of the argument - clk_value_type_t amdoclType(); - //! Global, local etc - returns amdocl types - clk_address_space_t amdoclAddrQual(); - //! Global,localetc - returns opencl type - cl_kernel_arg_address_qualifier oclAddrQual(); - //! read , write etc - returns amdocl type - clk_arg_qualifier_t amdoclAccessQual(); - //! read , write etc - returns opencl type type - cl_kernel_arg_access_qualifier oclAccessQual(); - //! const,volatile,restrict etc - returns opencl type type - cl_kernel_arg_type_qualifier oclTypeQual(); - - //! Name of the argument - std::string& name(); - //! Name of the argument - std::string& typeName(); - //! reflection - std::string reflection(){ return name(); }; - //! Returns the size of the argument - int size(); - //! returns the offset - int offset(); - - void setOffset(); - -private: - aclArgData* argInfo_; - int offset_; - std::string name_; - std::string typeName_; -}; - -class Kernel : public device::Kernel -{ -public: - // Global offsets located in the first 3 elements - static const uint ExtraArguments = 3; - - Kernel(std::string name, - FSAILProgram* prog, - HsaBrig* brig, - std::string compileOptions); - - ~Kernel(); - - //! Initializes the metadata required for this kernel - bool init(); - - const FSAILProgram* program() { - return static_cast(program_); - } - - //! Returns the AqlKernel associated with this Kernel - const HsaKernelCode* kernelCode() { return - static_cast(kernelCode_); - } - - //! Returns the BRIG that was used to compile this kernel - const HsaBrig* brig() { - return static_cast(brig_); - } - - //!returns a pointer to the hsail argument at the specified index - HsailKernelArg* hsailArgAt(size_t index) { - return hsailArgList_[index]; - } - -private: - //! Populates hsailArgList_ - void initArgList(const aclArgData* aclArg); - - //! Initializes Hsail Argument metadata and info ; - void initHsailArgs(const aclArgData* aclArg); - - FSAILProgram *program_; //!< The oclhsa::FSAILProgram context - std::vector hsailArgList_; //!< Vector list of HSAIL Arguments - std::string compileOptions_; //!< compile used for finalizing this kernel - HsaBrig* brig_; //!< The brig used to generate ISA for this kernel - HsaKernelCode* kernelCode_; //!< AQL kernel code for this kernel - HsaKernelDebug* debugInfo_; //!< Dwarf info for this kernel -}; - -} // namespace oclhsa - -#endif // WITHOUT_FSA_BACKEND - -#endif // HSAKERNEL_HPP_ - diff --git a/rocclr/runtime/device/hsa/hsamemory.cpp b/rocclr/runtime/device/hsa/hsamemory.cpp deleted file mode 100644 index 08b0b6d5b0..0000000000 --- a/rocclr/runtime/device/hsa/hsamemory.cpp +++ /dev/null @@ -1,938 +0,0 @@ -// -// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. -// - -#ifndef WITHOUT_FSA_BACKEND - -#include "CL/cl_ext.h" - -#include "device/device.hpp" -#include "device/hsa/hsamemory.hpp" -#include "device/hsa/hsadevice.hpp" -#include "device/hsa/hsablit.hpp" -#include "device/hsa/oclhsa_common.hpp" -#include "thread/monitor.hpp" -#include "platform/memory.hpp" -#include "platform/sampler.hpp" - -namespace oclhsa { - -/////////////////////////////////oclhsa::Memory////////////////////////////// -Memory::Memory(const oclhsa::Device &dev, amd::Memory &owner) - : device::Memory(owner), - dev_(dev), - deviceMemory_(NULL), - interopType_(InteropNone) -{ -} - -Memory::~Memory() -{} - -bool -Memory::allocateMapMemory(size_t allocationSize) -{ - assert(mapMemory_ == NULL); - - void *mapData = NULL; - - // Use/reuse system memory from HSA system memory pool as backing - // storage of the map target. - if (kHsaStatusSuccess != - servicesapi->HsaAllocateSystemMemory( - owner()->getSize(), 0, kHsaSystemMemoryTypeDefault, &mapData)) { - LogError("[OCL] Fail to allocate the backing storage for map target"); - return false; - } - - // Create buffer object to contain the map target. - amd::Memory *mapMemory = - new(owner()->getContext()) amd::Buffer( - owner()->getContext(), CL_MEM_USE_HOST_PTR, owner()->getSize()); - - if ((mapMemory == NULL) || (!mapMemory->create(mapData))) { - LogError("[OCL] Fail to allocate map target object"); - servicesapi->HsaFreeSystemMemory(mapData); - if (mapMemory) { - mapMemory->release(); - } - return false; - } - - mapMemory_ = mapMemory; - - return true; -} - -void -Memory::freeMapMemory() -{ - // Return the memory to HSA system memory pool. - assert(mapMemory_ != NULL); - servicesapi->HsaFreeSystemMemory(mapMemory_->getHostMem()); - - // Release the buffer object containing the map data. - mapMemory_->release(); - mapMemory_ = NULL; -} - -void * -Memory::allocMapTarget(const amd::Coord3D &origin, - const amd::Coord3D ®ion, - uint mapFlags, - size_t *rowPitch, - size_t *slicePitch) -{ - // Map/Unmap must be serialized. - amd::ScopedLock lock(owner()->lockMemoryOps()); - - incIndMapCount(); - - // If the device backing storage is direct accessible, use it. - if (isHostMemDirectAccess()) { - return (static_cast(deviceMemory_) + origin[0]); - } - - // Otherwise, check for host memory. - void *hostMem = owner()->getHostMem(); - if (hostMem != NULL) { - return (static_cast(hostMem) + origin[0]); - } - - // Allocate one if needed. - if (indirectMapCount_ == 1) { - if (!allocateMapMemory(owner()->getSize())) { - decIndMapCount(); - return NULL; - } - } - else { - // Did the map resource allocation fail? - if (mapMemory_ == NULL) { - LogError("Could not map target resource"); - return NULL; - } - } - - return (static_cast(mapMemory_->getHostMem()) + origin[0]); -} - -void -Memory::decIndMapCount() -{ - // Map/Unmap must be serialized. - amd::ScopedLock lock(owner()->lockMemoryOps()); - - if (indirectMapCount_ == 0) { - LogError("decIndMapCount() called when indirectMapCount_ already zero"); - return; - } - - // Decrement the counter and release indirect map if it's the last op - if (--indirectMapCount_ == 0 && - mapMemory_ != NULL) { - freeMapMemory(); - } -} - -void * -Memory::cpuMap( - device::VirtualDevice& vDev, - uint flags, - uint startLayer, - uint numLayers, - size_t* rowPitch, - size_t* slicePitch - ) -{ - // Create the map target. - void * mapTarget = - allocMapTarget(amd::Coord3D(0), amd::Coord3D(0), 0, rowPitch, slicePitch); - - // Sync to map target if no direct access. - if (!isHostMemDirectAccess()) { - if (!vDev.blitMgr().readBuffer( - *this, mapTarget, amd::Coord3D(0), amd::Coord3D(size()), true)) { - decIndMapCount(); - return NULL; - } - } - - return mapTarget; -} - -void -Memory::cpuUnmap(device::VirtualDevice& vDev) -{ - // Sync to device backing storage if no direct access. - if (!isHostMemDirectAccess()) { - if (!vDev.blitMgr().writeBuffer( - mapMemory_->getHostMem(), *this, amd::Coord3D(0), - amd::Coord3D(size()), true)) { - LogError("[OCL] Fail sync the device memory on cpuUnmap"); - } - } - - decIndMapCount(); -} - -void Memory::destroyInterop() -{ - HsaStatus status; -#ifdef _WIN32 - if (interopType_ == InteropD3D10) { - HsaStatus status = hsacoreapi->HsaUnmapD3D10Resource( - dev_.getBackendDevice(), d3d10Resource_); - if (status != kHsaStatusSuccess) { - LogError("[OCL] Fail on HsaUnmapD3D10Resource"); - return; - } - } - - else if (interopType_ == InteropD3D11) { - HsaStatus status = hsacoreapi->HsaUnmapD3D11Resource( - dev_.getBackendDevice(), d3d11Resource_); - if (status != kHsaStatusSuccess) { - LogError("[OCL] Fail on HsaUnmapD3D11Resource"); - return; - } - } -#endif - - if (interopType_ == InteropGL) { - void * glContext =owner()->getContext().info().hCtx_; - status = hsacoreapi->HsaReleaseGLResources( dev_.getBackendDevice(), - glContext, - &glResource_, - 1); - if (kHsaStatusSuccess != status) { - LogError("[OCL] Fail on HsaReleaseGLResources"); - } - - status = hsacoreapi->HsaUnmapGLResource( - dev_.getBackendDevice(), glContext, &glResource_); - - if (status != kHsaStatusSuccess) { - LogError("[OCL] Fail on HsaUnmapGLResource"); - return; - } - } -} - -bool -Memory::isHsaLocalMemory() const { - if (owner()->isInterop()) { - return true; - } - else { - if (amd::Is64Bits()) { - uint64_t addr = reinterpret_cast(deviceMemory_); - - // Fast check: in 64 bits, CPU can only access the high area - // (VA[63:47] == 0x1FFFF) and low area (VA[63:47 == 0). - // Reference: GFXIP7_ShaderIO_Delt.doc - addr >>= 47; // discard least significant 47 bits - return (addr != 0x1FFFF && addr != 0); - } - else { - const HsaMemoryDescriptor &memDesc = - dev_.getBackendDevice()->memory_descriptors[0]; - - if (memDesc.heap_type == kHsaHeapTypeFrameBufferPrivate) { - const uintptr_t addr = - reinterpret_cast(deviceMemory_); - const uintptr_t gpuvmBase = memDesc.virtual_base_address; - const size_t size = memDesc.size_in_bytes; - return (addr >= gpuvmBase && addr < (gpuvmBase + size)); - } - } - } - return false; -} - -/////////////////////////////////oclhsa::Buffer////////////////////////////// - -Buffer::Buffer(const oclhsa::Device &dev, amd::Memory &owner) - : oclhsa::Memory(dev, owner) -{} - -Buffer::~Buffer() -{ - destroy(); -} - -void -Buffer::destroy() -{ - if (owner()->parent() != NULL) { - return; - } - - if (owner()->isInterop()) { - destroyInterop(); - return; - } - - if (isHostMemoryRegistered()) { - hsacoreapi->HsaDeregisterSystemMemory(deviceMemory_); - } - else { - if (!isHostMemDirectAccess()) { - hsacoreapi->HsaFreeDeviceMemory(deviceMemory_); - } - else if (deviceMemory_ != owner()->getHostMem()) { - // if they are identical, the host pointer will be - // deallocated later on => avoid double deallocation - hsacoreapi->HsaAmdFreeSystemMemory(deviceMemory_); - } - } -} - -bool Buffer::createInterop() -{ - amd::InteropObject *interopObject = owner()->getInteropObj(); - -#ifdef _WIN32 - if (interopObject->asD3D10Object() != NULL) { - amd::D3D10Object *d3d10Object = interopObject->asD3D10Object(); - // 1. Get the D3D11 resource - ID3D10Resource *resource = d3d10Object->getD3D10Resource(); - ID3D10Buffer *d3d10Buffer = static_cast(resource); - - HsaStatus status = hsacoreapi->HsaMapD3D10Buffer( - dev_.getBackendDevice(), d3d10Buffer, &deviceMemory_); - if (status != kHsaStatusSuccess) { - LogError("[OCL] Fail on HsaMapD3D10Buffer"); - return false; - } - interopType_ = InteropD3D10; - d3d10Resource_ = d3d10Buffer; - } - - if (interopObject->asD3D11Object() != NULL) { - amd::D3D11Object *d3d11Object = interopObject->asD3D11Object(); - // 1. Get the D3D11 resource - ID3D11Resource *resource = d3d11Object->getD3D11Resource(); - ID3D11Buffer *d3d11Buffer = static_cast(resource); - - HsaStatus status = hsacoreapi->HsaMapD3D11Buffer( - dev_.getBackendDevice(), d3d11Buffer, &deviceMemory_); - if (status != kHsaStatusSuccess) { - LogError("[OCL] Fail on HsaMapD3D10Buffer"); - return false; - } - interopType_ = InteropD3D11; - d3d11Resource_ = d3d11Buffer; - } -#endif - - if (interopObject->asBufferGL()) { - amd::BufferGL *buffer_gl = interopObject->asBufferGL(); - HsaGLResource gl_resource = {0}; - gl_resource.name = buffer_gl->getGLName(); - gl_resource.type = buffer_gl->getGLInternalFormat(); - - void * glContext =owner()->getContext().info().hCtx_; - HsaStatus status = hsacoreapi->HsaMapGLBuffer( - dev_.getBackendDevice(), glContext, &gl_resource, &deviceMemory_); - if (status != kHsaStatusSuccess) { - LogError("[OCL] Fail on HsaMapGLBuffer"); - return false; - } - - status = hsacoreapi->HsaAcquireGLResources( dev_.getBackendDevice(), - glContext, - &gl_resource, - 1); - - if (status != kHsaStatusSuccess) { - LogError("[OCL] Fail on HsaAcquireGLResources"); - return false; - } - interopType_ = InteropGL; - glResource_ = gl_resource; - } - return true; -} - -bool -Buffer::create() -{ - if (owner()->parent()) { - // Sub-Buffer creation. - oclhsa::Memory *parentBuffer = - static_cast(owner()->parent()->getDeviceMemory(dev_)); - - if (parentBuffer == NULL) { - LogError("[OCL] Fail to allocate parent buffer"); - return false; - } - - const size_t offset = owner()->getOrigin(); - deviceMemory_ = - static_cast(parentBuffer->getDeviceMemory()) + offset; - - void* parentHostPtr = parentBuffer->owner()->getHostMem(); - if (parentHostPtr) { - owner()->setHostMem(static_cast(parentHostPtr) + offset); - } - - flags_ |= owner()->parent()->getMemFlags(); - return true; - } - - // Allocate backing storage in device local memory unless UHP or AHP are set - const cl_mem_flags memFlags = owner()->getMemFlags(); - if (!(memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR))) { - bool useDeviceMemory = dev_.settings().enableLocalMemory_; - size_t alignment = static_cast(dev_.info().memBaseAddrAlign_); - if (useDeviceMemory) { - hsacoreapi->HsaAllocateDeviceMemory( - size(), alignment, dev_.getBackendDevice(), &deviceMemory_); - if (deviceMemory_ && (memFlags & CL_MEM_COPY_HOST_PTR)) { - bool ret = dev_.xferMgr().writeBuffer(owner()->getHostMem(), *this, - amd::Coord3D(0), amd::Coord3D(size()), true); - if (!ret) { - hsacoreapi->HsaFreeDeviceMemory(deviceMemory_); - deviceMemory_ = NULL; - } - return ret; - } - // if device memory is depleted, do not fall back to system memory - return deviceMemory_ != NULL; - } - else if (!(owner()->getHostMem())) { - flags_ |= HostMemoryDirectAccess; - deviceMemory_ = dev_.hostAlloc(size(), alignment); - // no need to copy - otherwise, the host pointer will not be NULL - return deviceMemory_ != NULL; - } - } - - flags_ |= HostMemoryDirectAccess; - void* hostMem = owner()->getHostMem(); - assert(hostMem); - // If there is a host ptr, then register it only if it was not allocated, - // (=> allocated by us) - if (!(owner()->getHostMemRef()->alloced())) { - // Reuse existing host memory for the backing storage and register it. - // - // SVM precludes a possible 64-bits optimization in which host buffers - // allocated by the user (UHP) in the default, coherent space could be - // mapped into the non-coherent space by means of CreateFileMapping/mmap - // without copying any data (the "device memory" would be the - // non-coherent buffer). - // The optimization cannot be applied because regular buffers allocated - // using UHP are expected to have same characteristics as the original - // buffer, i.e., if the original buffer supports atomics then the - // corresponding OpenCL buffer will support atomics too. - flags_ |= HostMemoryRegistered; - if (hsacoreapi->HsaRegisterSystemMemory(hostMem, size()) != kHsaStatusSuccess) { - LogError("[OCL] Failed to register system memory"); - return false; - } - } - deviceMemory_ = hostMem; - return true; -} - -bool -Buffer::recreate(size_t newSize, size_t newAlignment, bool forceSystem) { - const size_t memFlag = static_cast(owner()->getMemFlags()); - if ((memFlag & CL_MEM_ALLOC_HOST_PTR) || - (memFlag & CL_MEM_USE_HOST_PTR) || - !dev_.settings().enableLocalMemory_) { - forceSystem = true; - } - - void *newDeviceMemory = NULL; - uint hostDirectAccess = 0; - - if (forceSystem) { - newDeviceMemory = dev_.hostAlloc(newSize, newAlignment); - if (newDeviceMemory == NULL) { - LogError("[OCL] Fail to reallocate system memory"); - return false; - } - - // Copy the old data to the new memory location. - if (!dev_.xferMgr().readBuffer(*this, newDeviceMemory, - amd::Coord3D(0), - amd::Coord3D(size()), - true)) { - LogError("[OCL] Fail to copy the current value"); - dev_.hostFree(newDeviceMemory); - newDeviceMemory = NULL; - return false; - } - - hostDirectAccess = HostMemoryDirectAccess; - } - else { - hsacoreapi->HsaAllocateDeviceMemory( - newSize, newAlignment, dev_.getBackendDevice(), &newDeviceMemory); - - if (newDeviceMemory == NULL) { - LogError("[OCL] Fail to reallocate device local memory"); - return false; - } - - assert( - amd::isMultipleOf(static_cast(newDeviceMemory), - newAlignment)); - - // Copy the old data to the new memory location. - if (!dev_.xferMgr().readBuffer( - *this, newDeviceMemory, amd::Coord3D(0), amd::Coord3D(size()), - true)) { - LogError("[OCL] Fail to copy the current value"); - hsacoreapi->HsaFreeDeviceMemory(newDeviceMemory); - newDeviceMemory = NULL; - return false; - } - } - - destroy(); - - deviceMemory_ = newDeviceMemory; - - if ((memFlag & CL_MEM_ALLOC_HOST_PTR) && - (owner()->getContext().devices().size() == 1)) { - owner()->setHostMem(deviceMemory_); - } - - flags_ &= (~HostMemoryDirectAccess & ~HostMemoryRegistered); - flags_ |= hostDirectAccess; - - return true; -} - -/////////////////////////////////oclhsa::Image////////////////////////////// - -Image::Image(const oclhsa::Device& dev, amd::Memory& owner) : - oclhsa::Memory(dev, owner) -{ - flags_ &= (~HostMemoryDirectAccess & ~HostMemoryRegistered); - populateImageDescriptor(); -} - -struct ImageFormatLayout { - cl_image_format clFormat; - HsaImageFormat hsaFormat; -}; - -static const ImageFormatLayout - ImageFormatLayoutMap[] = { - { { CL_R, CL_UNORM_INT8 }, HSA_IMAGE_FMT_R8_UNORM }, - { { CL_R, CL_UNORM_INT16}, HSA_IMAGE_FMT_R16_UNORM }, - { { CL_R, CL_SNORM_INT8 }, HSA_IMAGE_FMT_R8_SNORM }, - { { CL_R, CL_SNORM_INT16}, HSA_IMAGE_FMT_R16_SNORM }, - { { CL_R, CL_SIGNED_INT8}, HSA_IMAGE_FMT_R8_SINT }, - { { CL_R, CL_SIGNED_INT16}, HSA_IMAGE_FMT_R16_SINT}, - { { CL_R, CL_SIGNED_INT32}, HSA_IMAGE_FMT_R32_SINT}, - { { CL_R, CL_UNSIGNED_INT8},HSA_IMAGE_FMT_R8_UINT }, - { { CL_R, CL_UNSIGNED_INT16}, HSA_IMAGE_FMT_R16_UINT}, - { { CL_R, CL_UNSIGNED_INT32}, HSA_IMAGE_FMT_R32_UINT}, - { { CL_R, CL_HALF_FLOAT}, HSA_IMAGE_FMT_R_HALFFLOAT}, - { { CL_R, CL_FLOAT }, HSA_IMAGE_FMT_R_FLOAT}, - { { CL_A, CL_UNORM_INT8 }, HSA_IMAGE_FMT_A8_UNORM}, - { { CL_A, CL_UNORM_INT16 }, HSA_IMAGE_FMT_A16_UNORM}, - { { CL_A, CL_SNORM_INT8 }, HSA_IMAGE_FMT_A8_SNORM}, - { { CL_A, CL_SNORM_INT16 }, HSA_IMAGE_FMT_A16_SNORM}, - { { CL_A, CL_SIGNED_INT8 }, HSA_IMAGE_FMT_A8_SINT}, - { { CL_A, CL_SIGNED_INT16 },HSA_IMAGE_FMT_A16_SINT}, - { { CL_A, CL_SIGNED_INT32}, HSA_IMAGE_FMT_A32_SINT}, - { { CL_A, CL_UNSIGNED_INT8 },HSA_IMAGE_FMT_A8_UINT}, - { { CL_A, CL_UNSIGNED_INT16}, HSA_IMAGE_FMT_A16_UINT}, - { { CL_A, CL_UNSIGNED_INT32}, HSA_IMAGE_FMT_A32_UINT}, - { { CL_A, CL_HALF_FLOAT}, HSA_IMAGE_FMT_A_HALFFLOAT}, - { { CL_A, CL_FLOAT}, HSA_IMAGE_FMT_A_FLOAT}, - { { CL_RG,CL_UNORM_INT8}, HSA_IMAGE_FMT_R8G8_UNORM}, - { { CL_RG,CL_UNORM_INT16},HSA_IMAGE_FMT_R16G16_UNORM}, - { { CL_RG,CL_SNORM_INT8}, HSA_IMAGE_FMT_R8G8_SNORM}, - { { CL_RG,CL_SNORM_INT16},HSA_IMAGE_FMT_R16G16_SNORM}, - { { CL_RG,CL_SIGNED_INT8},HSA_IMAGE_FMT_R8G8_SINT}, - { { CL_RG,CL_SIGNED_INT16},HSA_IMAGE_FMT_R16G16_SINT}, - { { CL_RG,CL_SIGNED_INT32},HSA_IMAGE_FMT_R32G32_SINT}, - { { CL_RG,CL_UNSIGNED_INT8},HSA_IMAGE_FMT_R8G8_UINT}, - { { CL_RG,CL_UNSIGNED_INT16},HSA_IMAGE_FMT_R16G16_UINT}, - { { CL_RG,CL_UNSIGNED_INT32},HSA_IMAGE_FMT_R32G32_UINT}, - { { CL_RG,CL_HALF_FLOAT},HSA_IMAGE_FMT_RG_HALFFLOAT}, - { { CL_RG,CL_FLOAT},HSA_IMAGE_FMT_RG_FLOAT}, - { { CL_RA,CL_UNORM_INT8}, HSA_IMAGE_FMT_R8A8_UNORM}, - { { CL_RA,CL_UNORM_INT16},HSA_IMAGE_FMT_R16A16_UNORM}, - { { CL_RA,CL_SNORM_INT8}, HSA_IMAGE_FMT_R8A8_SNORM}, - { { CL_RA,CL_SNORM_INT16},HSA_IMAGE_FMT_R16A16_SNORM}, - { { CL_RA,CL_SIGNED_INT8},HSA_IMAGE_FMT_R8A8_SINT}, - { { CL_RA,CL_SIGNED_INT16},HSA_IMAGE_FMT_R16A16_SINT}, - { { CL_RA,CL_SIGNED_INT32},HSA_IMAGE_FMT_R32A32_SINT}, - { { CL_RA,CL_UNSIGNED_INT8},HSA_IMAGE_FMT_R8A8_UINT}, - { { CL_RA,CL_UNSIGNED_INT16},HSA_IMAGE_FMT_R16A16_UINT}, - { { CL_RA,CL_UNSIGNED_INT32},HSA_IMAGE_FMT_R32A32_UINT}, - { { CL_RA,CL_HALF_FLOAT},HSA_IMAGE_FMT_RA_HALFFLOAT}, - { { CL_RA,CL_FLOAT},HSA_IMAGE_FMT_RA_FLOAT}, - { { CL_RGBA,CL_UNORM_INT8}, HSA_IMAGE_FMT_R8G8B8A8_UNORM}, - { { CL_RGBA,CL_UNORM_INT16},HSA_IMAGE_FMT_R16G16B16A16_UNORM}, - { { CL_RGBA,CL_SNORM_INT8}, HSA_IMAGE_FMT_R8G8B8A8_SNORM}, - { { CL_RGBA,CL_SNORM_INT16},HSA_IMAGE_FMT_R16G16B16A16_SNORM}, - { { CL_RGBA,CL_SIGNED_INT8},HSA_IMAGE_FMT_R8G8B8A8_SINT}, - { { CL_RGBA,CL_SIGNED_INT16},HSA_IMAGE_FMT_R16G16B16A16_SINT}, - { { CL_RGBA,CL_SIGNED_INT32},HSA_IMAGE_FMT_R32G32B32A32_SINT}, - { { CL_RGBA,CL_UNSIGNED_INT8},HSA_IMAGE_FMT_R8G8B8A8_UINT}, - { { CL_RGBA,CL_UNSIGNED_INT16},HSA_IMAGE_FMT_R16G16B16A16_UINT}, - { { CL_RGBA,CL_UNSIGNED_INT32},HSA_IMAGE_FMT_R32G32B32A32_UINT}, - { { CL_RGBA,CL_HALF_FLOAT},HSA_IMAGE_FMT_RGBA_HALFFLOAT}, - { { CL_RGBA,CL_FLOAT},HSA_IMAGE_FMT_RGBA_FLOAT}, - { { CL_ARGB,CL_UNORM_INT8},HSA_IMAGE_FMT_A8R8G8B8_UNORM}, - { { CL_ARGB,CL_SNORM_INT8},HSA_IMAGE_FMT_A8R8G8B8_SNORM}, - { { CL_ARGB,CL_SIGNED_INT8},HSA_IMAGE_FMT_A8R8G8B8_SINT}, - { { CL_ARGB,CL_UNSIGNED_INT8},HSA_IMAGE_FMT_A8R8G8B8_UINT}, - { { CL_BGRA,CL_UNORM_INT8},HSA_IMAGE_FMT_B8G8R8A8_UNORM}, - { { CL_BGRA,CL_SNORM_INT8},HSA_IMAGE_FMT_B8G8R8A8_SNORM}, - { { CL_BGRA,CL_SIGNED_INT8},HSA_IMAGE_FMT_B8G8R8A8_SINT}, - { { CL_BGRA,CL_UNSIGNED_INT8},HSA_IMAGE_FMT_B8G8R8A8_UINT}, - { {CL_LUMINANCE,CL_SNORM_INT8}, HSA_IMAGE_FMT_L8_SNORM}, - { {CL_LUMINANCE,CL_SNORM_INT16},HSA_IMAGE_FMT_L16_SNORM}, - { {CL_LUMINANCE,CL_UNORM_INT8},HSA_IMAGE_FMT_L8_UNORM}, - { {CL_LUMINANCE,CL_UNORM_INT16},HSA_IMAGE_FMT_L16_UNORM}, - { {CL_LUMINANCE,CL_HALF_FLOAT},HSA_IMAGE_FMT_L_HALFFLOAT}, - { {CL_LUMINANCE,CL_FLOAT},HSA_IMAGE_FMT_L_FLOAT}, - { {CL_INTENSITY,CL_SNORM_INT8}, HSA_IMAGE_FMT_I8_SNORM}, - { {CL_INTENSITY,CL_SNORM_INT16},HSA_IMAGE_FMT_I16_SNORM}, - { {CL_INTENSITY,CL_UNORM_INT8},HSA_IMAGE_FMT_I8_UNORM}, - { {CL_INTENSITY,CL_UNORM_INT16},HSA_IMAGE_FMT_I16_UNORM}, - { {CL_INTENSITY,CL_HALF_FLOAT},HSA_IMAGE_FMT_I_HALFFLOAT}, - { {CL_INTENSITY,CL_FLOAT},HSA_IMAGE_FMT_I_FLOAT}, - { {CL_RGB, CL_UNORM_SHORT_565},HSA_IMAGE_FMT_R5G6B5_UNORM}, - { {CL_RGB, CL_UNORM_SHORT_555},HSA_IMAGE_FMT_R5G5B5_UNORM}, - { {CL_RGB, CL_UNORM_INT_101010},HSA_IMAGE_FMT_R10G10B10_UNORM} -}; - -void -Image::populateImageDescriptor() -{ - amd::Image* image = owner()->asImage(); - - // build HSA runtime image descriptor - imageDescriptor_.width = image->getWidth(); - imageDescriptor_.height = image->getHeight(); - imageDescriptor_.depth = image->getDepth(); - imageDescriptor_.arraySize = 0; - - // Device specific image does not require rowpitch/slicepitch information. - // Only image buffer is required to specify rowpitch size. - imageDescriptor_.rowPitchInBytes = 0; - imageDescriptor_.slicePitchInBytes = 0; - - switch (image->getType()) - { - case CL_MEM_OBJECT_IMAGE1D: - imageDescriptor_.geometry = HSA_GEOMETRY_1D; - imageDescriptor_.height = 1; - imageDescriptor_.depth = 1; - break; - case CL_MEM_OBJECT_IMAGE1D_BUFFER: - imageDescriptor_.geometry = HSA_GEOMETRY_1DBuffer; - imageDescriptor_.height = 1; - imageDescriptor_.depth = 1; - break; - case CL_MEM_OBJECT_IMAGE1D_ARRAY: - //@todo - arraySize = height ?! - imageDescriptor_.geometry = HSA_GEOMETRY_1DArray; - imageDescriptor_. height = 1; - imageDescriptor_.arraySize = image->getHeight(); - break; - case CL_MEM_OBJECT_IMAGE2D: - imageDescriptor_.geometry = HSA_GEOMETRY_2D; - imageDescriptor_.depth = 1; - break; - case CL_MEM_OBJECT_IMAGE2D_ARRAY: - //@todo - arraySize = depth ?! - imageDescriptor_.geometry = HSA_GEOMETRY_2DArray; - imageDescriptor_.depth = 1; - imageDescriptor_.arraySize = image->getDepth(); - break; - case CL_MEM_OBJECT_IMAGE3D: - imageDescriptor_.geometry = HSA_GEOMETRY_3D; - break; - } - - for (uint i = 0; i < sizeof(ImageFormatLayoutMap) / sizeof(ImageFormatLayout); ++i) { - if ((image->getImageFormat().image_channel_data_type == - ImageFormatLayoutMap[i].clFormat.image_channel_data_type) && - (image->getImageFormat().image_channel_order == - ImageFormatLayoutMap[i].clFormat.image_channel_order)) { - imageDescriptor_.format = ImageFormatLayoutMap[i].hsaFormat; - } - } -} - -bool Image::createInterop() { - amd::ScopedLock lock(owner()->lockMemoryOps()); - amd::InteropObject *interopObject = owner()->getInteropObj(); - void *hsaImageObjectInterop = NULL; - size_t hsaImageObjectInteropSize = 0; -#ifdef _WIN32 - if (interopObject->asD3D10Object()) { - amd::D3D10Object *d3d10Object = interopObject->asD3D10Object(); - // 1. Get the D3D11 resource - ID3D10Resource *resource = d3d10Object->getD3D10Resource(); - HsaStatus status = hsacoreapi->HsaMapD3D10Texture( - dev_.getBackendDevice(), resource, &hsaImageObjectInterop, - &hsaImageObjectInteropSize, kHsaMapFlagsReadWrite); - if (status != kHsaStatusSuccess || hsaImageObjectInteropSize == 0 ) { - LogError("[OCL] Fail on HsaMapD3D10Texture"); - return false; - } - interopType_ = InteropD3D10; - d3d10Resource_ = resource; - } - - if (interopObject->asD3D11Object()) { - amd::D3D11Object *d3d11Object = interopObject->asD3D11Object(); - - // 1. Get the D3D11 resource - ID3D11Resource *resource = d3d11Object->getD3D11Resource(); - HsaStatus status = hsacoreapi->HsaMapD3D11Texture( - dev_.getBackendDevice(), resource, &hsaImageObjectInterop, - &hsaImageObjectInteropSize, kHsaMapFlagsReadWrite, - d3d11Object->getPlane()); - if (status != kHsaStatusSuccess || hsaImageObjectInteropSize == 0 ) { - LogError("[OCL] Fail on HsaMapD3D11Texture"); - return false; - } - interopType_ = InteropD3D11; - d3d11Resource_ = resource; - } -#endif - - if (interopObject->asGLObject()) { - amd::GLObject* gl_object = interopObject->asGLObject(); - HsaGLResource gl_resource = {0}; - gl_resource.name = gl_object->getGLName(); - if (gl_object->getGLTarget() != GL_TEXTURE_CUBE_MAP) { - gl_resource.type = gl_object->getGLTarget(); - } - else { - gl_resource.type = gl_object->getCubemapFace(); - } - gl_resource.mipmap_level = gl_object->getGLMipLevel(); - - void * glContext =owner()->getContext().info().hCtx_; - - // Get the texture SRD. - HsaStatus status = hsacoreapi->HsaMapGLTexture( - dev_.getBackendDevice(), glContext, &gl_resource, - &hsaImageObjectInterop, &hsaImageObjectInteropSize); - if (status != kHsaStatusSuccess || hsaImageObjectInteropSize == 0) { - LogError("[OCL] Fail on HsaMapGLTexture"); - return false; - } - - status = hsacoreapi->HsaAcquireGLResources( dev_.getBackendDevice(), - glContext, - &gl_resource, - 1); - - if (status != kHsaStatusSuccess) { - LogError("[OCL] Fail on HsaAcquireGLResources"); - return false; - } - - // Get the flat address for texture buffer. - if (owner()->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) { - // Map the texture buffer resource as buffer. - HsaStatus status = hsacoreapi->HsaMapGLBuffer( - dev_.getBackendDevice(), glContext, &gl_resource, - &deviceMemory_); - if (status != kHsaStatusSuccess) { - LogError("[OCL] Fail on HsaMapGLBuffer"); - return false; - } - // Sanity check. - assert((deviceMemory_ != NULL) && - "deviceMemory_ should not be \ - NULL upon successful return from HsaMapGLBuffer"); - } - - interopType_ = InteropGL; - glResource_ = gl_resource; - } - - // Populate HSA specific information to the interop image object. - HsaStatus status = hsacoreapi->HsaAmdCreateDeviceImageView( - &imageDescriptor_, hsaImageObjectInterop, hsaImageObject_); - if (status != kHsaStatusSuccess) { - LogError("[OCL] Fail to tranform interop image SRD"); - return false; - } - return true; -} - -bool Image::create() -{ - if (owner()->parent()) { - // Image view creation - oclhsa::Image *parentImage = - static_cast(owner()->parent()->getDeviceMemory(dev_)); - - if (parentImage == NULL) { - LogError("[OCL] Fail to allocate parent image"); - return false; - } - - return createView(*parentImage); - } - - amd::ScopedLock lock(owner()->lockMemoryOps()); - - // Get memory size requirement for device specific image. - HsaStatus status = hsacoreapi->HsaGetDeviceImageInfo( - dev_.getBackendDevice(), &imageDescriptor_, - &deviceImageInfo_); - - if (status != kHsaStatusSuccess) { - LogError("[OCL] Fail to allocate image memory"); - return false; - } - - if (dev_.settings().enableLocalMemory_) { - status = hsacoreapi->HsaAllocateDeviceMemory( - deviceImageInfo_.imageSizeInBytes, - deviceImageInfo_.imageAlignmentInBytes, - dev_.getBackendDevice(), - &deviceMemory_); - } else { - status = servicesapi->HsaAllocateSystemMemory( - deviceImageInfo_.imageSizeInBytes, - deviceImageInfo_.imageAlignmentInBytes, - kHsaSystemMemoryTypeDefault, - &deviceMemory_); - } - - if (status != kHsaStatusSuccess) { - LogError("[OCL] Fail to allocate image memory"); - return false; - } - - assert(amd::isMultipleOf( - deviceMemory_, deviceImageInfo_.imageAlignmentInBytes)); - - status = hsacoreapi->HsaCreateDeviceImage( - dev_.getBackendDevice(), &imageDescriptor_, - deviceMemory_, &hsaImageObject_[0]); - - return true; -} - -bool -Image::createView(Image &parent) -{ - amd::ScopedLock lock(owner()->lockMemoryOps()); - - if (parent.owner()->asBuffer()) { - // Get new texture SRD since parent is a buffer. - deviceMemory_ = parent.getDeviceMemory(); - - // Force device specific image implementation to use rowpitch size. - amd::Image* image = owner()->asImage(); - imageDescriptor_.rowPitchInBytes = image->getRowPitch(); - - HsaStatus status = hsacoreapi->HsaCreateDeviceImage( - dev_.getBackendDevice(), &imageDescriptor_, - deviceMemory_, &hsaImageObject_[0]); - - if (status != kHsaStatusSuccess) { - LogError("[OCL] Fail to create HSA image object"); - return false; - } - } else { - // Get the view of the existing parent's SRD based on the child's image - // descriptor. - HsaStatus status = hsacoreapi->HsaAmdCreateDeviceImageView( - &imageDescriptor_, parent.getHsaImageObjectAddress(), - &hsaImageObject_[0]); - if (status != kHsaStatusSuccess) { - LogError("[OCL] Fail to get view of parent image"); - return false; - } - } - - return true; -} - -void* Image::allocMapTarget(const amd::Coord3D& origin, - const amd::Coord3D& region, - uint mapFlags, - size_t* rowPitch, - size_t* slicePitch) -{ - amd::ScopedLock lock(owner()->lockMemoryOps()); - - incIndMapCount(); - - void* pHostMem = owner()->getHostMem(); - - if (pHostMem == NULL) { - if (indirectMapCount_ == 1) { - if (!allocateMapMemory(owner()->getSize())) { - decIndMapCount(); - return NULL; - } - } - else { - // Did the map resource allocation fail? - if (mapMemory_ == NULL) { - LogError("Could not map target resource"); - return NULL; - } - } - - pHostMem = mapMemory_->getHostMem(); - } - - amd::Image* image = owner()->asImage(); - - size_t elementSize = image->getImageFormat().getElementSize(); - - size_t offset = origin[0] * elementSize; - - // Adjust offset with Y dimension - offset += image->getRowPitch() * origin[1]; - - // Adjust offset with Z dimension - offset += image->getSlicePitch() * origin[2]; - - *rowPitch = image->getRowPitch(); - if (slicePitch != NULL) - *slicePitch = image->getSlicePitch(); - - return (static_cast(pHostMem) + offset); -} - -Image::~Image() -{ - destroy(); -} - -void -Image::destroy() -{ - if (owner()->parent() != NULL) { - return; - } - - if (owner()->isInterop()) { - destroyInterop(); - return; - } - - if (dev_.settings().enableLocalMemory_) { - hsacoreapi->HsaFreeDeviceMemory(deviceMemory_); - } - else { - servicesapi->HsaFreeSystemMemory(deviceMemory_); - } -} -} -#endif // WITHOUT_FSA_BACKEND diff --git a/rocclr/runtime/device/hsa/hsamemory.hpp b/rocclr/runtime/device/hsa/hsamemory.hpp deleted file mode 100644 index 3ebdb3e7cc..0000000000 --- a/rocclr/runtime/device/hsa/hsamemory.hpp +++ /dev/null @@ -1,202 +0,0 @@ -#ifndef HSAMEMORY_HPP_ -#define HSAMEMORY_HPP_ - -#include "top.hpp" -#include "platform/memory.hpp" -#include "utils/debug.hpp" -#include "hsadevice.hpp" -#include "services.h" -#ifdef _WIN32 -#include "amdocl/cl_d3d11_amd.hpp" -#endif -#include "amdocl/cl_gl_amd.hpp" -#include "hsainterop.h" - -namespace oclhsa { - -enum InteropType { - InteropNone = 0, - InteropD3D9 = 1, - InteropD3D10 = 2, - InteropD3D11 = 3, - InteropGL = 4 -}; - -class Memory : public device::Memory { - public: - Memory(const oclhsa::Device &dev, amd::Memory &owner); - - virtual ~Memory(); - - // Getter for deviceMemory_. - void *getDeviceMemory() const { return deviceMemory_; } - - // Gets a pointer to a region of host-visible memory for use as the target - // of an indirect map for a given memory object - virtual void *allocMapTarget(const amd::Coord3D &origin, - const amd::Coord3D ®ion, - uint mapFlags, - size_t *rowPitch, - size_t *slicePitch); - - // Create device memory according to OpenCL memory flag. - virtual bool create() = 0; - virtual bool createInterop() = 0; - - // Pins system memory associated with this memory object. - virtual bool pinSystemMemory(void *hostPtr, // System memory address - size_t size // Size of allocated system memory - ) { - Unimplemented(); - return true; - } - - // Immediate blocking write from device cache to owners's backing store. - // Marks owner as "current" by resetting the last writer to NULL. - virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags()) - { - // Need to revisit this when multi-devices is supported. - } - - bool processGLResource (GLResourceOP operation) { return true;} - - // Releases indirect map surface - void releaseIndirectMap() { decIndMapCount(); } - - //! Map the device memory to CPU visible - virtual void* cpuMap( - device::VirtualDevice& vDev, //!< Virtual device for map operaiton - uint flags = 0, //!< flags for the map operation - // Optimization for multilayer map/unmap - uint startLayer = 0, //!< Start layer for multilayer map - uint numLayers = 0, //!< End layer for multilayer map - size_t* rowPitch = NULL,//!< Row pitch for the device memory - size_t* slicePitch = NULL //!< Slice pitch for the device memory - ); - - //! Unmap the device memory - virtual void cpuUnmap( - device::VirtualDevice& vDev //!< Virtual device for unmap operaiton - ); - - bool isHsaLocalMemory() const; - - // Accessors for indirect map memory object - amd::Memory *mapMemory() const { return mapMemory_; } - - protected: - bool allocateMapMemory(size_t allocationSize); - - void freeMapMemory(); - - // Decrement map count - virtual void decIndMapCount(); - - // Free / deregister device memory. - virtual void destroy() = 0; - - //This function is called in the destructor ~Buffer() and ~Image(), - //since InteropObject belonging to owner() is destroyed before - //the destructor is called, we use the cached values of - //interopType and Resource in this function. - virtual void destroyInterop(); - - // Pointer to the device associated with this memory object. - const oclhsa::Device &dev_; - - // Pointer to the device memory. This could be in system or device local mem. - void* deviceMemory_; - - InteropType interopType_; -#ifdef _WIN32 - ID3D10Resource* d3d10Resource_; - ID3D11Resource* d3d11Resource_; -#endif - HsaGLResource glResource_; - - private: - // Disable copy constructor - Memory(const Memory &); - - // Disable operator= - Memory &operator=(const Memory &); -}; - - - -class Buffer : public oclhsa::Memory { - public: - Buffer(const oclhsa::Device &dev, amd::Memory &owner); - - virtual ~Buffer(); - - // Create device memory according to OpenCL memory flag. - virtual bool create(); - - // Recreate the device memory using new size and alignment. - bool recreate(size_t newSize, size_t newAlignment, bool forceSystem); - - //! Create a interop memory - bool createInterop(); - - private: - // Disable copy constructor - Buffer(const Buffer &); - - // Disable operator= - Buffer &operator=(const Buffer &); - - // Free / deregister device memory. - void destroy(); -}; - -class Image : public oclhsa::Memory -{ -public: - Image(const oclhsa::Device& dev, amd::Memory& owner); - - virtual ~Image(); - - //! Create device memory according to OpenCL memory flag. - virtual bool create(); - - //! Create an image view - bool createView(Image &image); - - virtual bool createInterop(); - - //! Gets a pointer to a region of host-visible memory for use as the target - //! of an indirect map for a given memory object - virtual void* allocMapTarget(const amd::Coord3D& origin, - const amd::Coord3D& region, - uint mapFlags, - size_t* rowPitch, - size_t* slicePitch); - - size_t getDeviceRowPitchSize() { return deviceImageInfo_.rowPitchInBytes; } - size_t getDeviceSlicePitchSize() { return deviceImageInfo_.slicePitchInBytes; } - size_t getDeviceDataSize() { return deviceImageInfo_.imageSizeInBytes; } - size_t getDeviceDataAlignment() { return deviceImageInfo_.imageAlignmentInBytes; } - - void* getHsaImageObjectAddress() { return &hsaImageObject_[0];} - size_t getHsaImageObjectSizeInBytes() {return sizeof(hsaImageObject_); } - -private: - //! Disable copy constructor - Image(const Buffer&); - - //! Disable operator= - Image& operator=(const Buffer&); - - // Free / deregister device memory. - void destroy(); - - void populateImageDescriptor(); - - HsaImageDescriptor imageDescriptor_; - HsaDeviceImageInfo deviceImageInfo_; - uint8_t hsaImageObject_[HSA_IMAGE_OBJECT_SIZE]; -}; - -} -#endif diff --git a/rocclr/runtime/device/hsa/hsaprogram.cpp b/rocclr/runtime/device/hsa/hsaprogram.cpp deleted file mode 100644 index b2e93aaaa3..0000000000 --- a/rocclr/runtime/device/hsa/hsaprogram.cpp +++ /dev/null @@ -1,726 +0,0 @@ -// -// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. -// - - -#ifndef WITHOUT_FSA_BACKEND - -#include "device/hsa/hsaprogram.hpp" - -#include "compiler/lib/loaders/elf/elf.hpp" -#include "compiler/lib/utils/options.hpp" -#include "runtime/device/hsa/hsakernel.hpp" -#include "runtime/device/hsa/hsacompilerlib.hpp" -#include "runtime/device/hsa/oclhsa_common.hpp" -#include "utils/bif_section_labels.hpp" -#include "utils/libUtils.h" - -#include -#include -#include -#include -#include -#include -#include - - -#endif // WITHOUT_FSA_BACKEND - -namespace oclhsa { -#ifndef WITHOUT_FSA_BACKEND - /* Temporary log function for the compiler library */ - static void logFunction(const char *msg, size_t size) { - std::cout << "Compiler Library log :" << msg << std::endl; - } - - FSAILProgram::~FSAILProgram() { - unloadBrig(); - acl_error error; - // Free the elf binary - if (binaryElf_ != NULL) { - error = g_complibApi._aclBinaryFini(binaryElf_); - if (error != ACL_SUCCESS) { - LogWarning( "Error while destroying the acl binary \n" ); - } - } - } - - FSAILProgram::FSAILProgram(oclhsa::NullDevice& device): device::Program(device), - llvmBinary_(), - binaryElf_(NULL), - device_(device), - isBrigLoaded_(false) - { - memset(&binOpts_, 0, sizeof(binOpts_)); - binOpts_.struct_size = sizeof(binOpts_); - //binOpts_.elfclass = LP64_SWITCH( ELFCLASS32, ELFCLASS64 ); - //Setting as 32 bit because hsail64 returns an invalid aclTargetInfo - //when aclGetTargetInfo is called - EPR# 377910 - binOpts_.elfclass = ELFCLASS32; - binOpts_.bitness = ELFDATA2LSB; - binOpts_.alloc = &::malloc; - binOpts_.dealloc = &::free; - } - - bool FSAILProgram::initClBinary(char *binaryIn, size_t size) { // Save the - // original - // binary that - // isn't owned - // by ClBinary - clBinary()->saveOrigBinary(binaryIn, size); - - char *bin = binaryIn; - size_t sz = size; - - int encryptCode; - - char *decryptedBin; - size_t decryptedSize; - if (!clBinary()->decryptElf(binaryIn, size, - &decryptedBin, &decryptedSize, &encryptCode)) { - return false; - } - if (decryptedBin != NULL) { - // It is decrypted binary. - bin = decryptedBin; - sz = decryptedSize; - } - - // Both 32-bit and 64-bit are allowed! - if (!amd::isElfMagic(bin)) { - // Invalid binary. - if (decryptedBin != NULL) { - delete[]decryptedBin; - } - return false; - } - - clBinary()->setFlags(encryptCode); - - return clBinary()->setBinary(bin, sz, (decryptedBin != NULL)); - } - - bool FSAILProgram::initBuild(amd::option::Options *options) { - if (!device::Program::initBuild(options)) { - return false; - } - - // Need to get device information from CAL !?!? - // Needs the device pointer from CAL to send to options class - // - // Shreyas: Commenting this might cause a bug - keeping this fro now - // options->setPerBuildInfo("hsa", - // binary_.getEncryptCode() - // ); - - // Elf Binary setup - std::string outFileName; - - // true means fsail required - clBinary()->init(options, true); - if (options->isDumpFlagSet(amd::option::DUMP_BIF)) { - outFileName = options->getDumpFileName(".bin"); - } - - bool useELF64 = getCompilerOptions()->oVariables->EnableGpuElf64; - if (!clBinary()->setElfOut(useELF64 ? ELFCLASS64 : ELFCLASS32, - (outFileName.size() > - 0) ? outFileName.c_str() : NULL)) { - LogError("Setup elf out for gpu failed"); - return false; - } - return true; - } - - // ! post-compile setup for GPU - bool FSAILProgram::finiBuild(bool isBuildGood) { - clBinary()->resetElfOut(); - clBinary()->resetElfIn(); - - if (!isBuildGood) { - // Prevent the encrypted binary form leaking out - clBinary()->setBinary(NULL, 0); - - } - - return device::Program::finiBuild(isBuildGood); - } - - static char *readFile(std::string source_filename, size_t &size) { - FILE *fp = ::fopen(source_filename.c_str(), "rb"); - unsigned int length; - size_t offset = 0; - char *ptr; - - if (!fp) { - return NULL; - } - - // obtain file size. - ::fseek(fp, 0, SEEK_END); - length = ::ftell(fp); - ::rewind(fp); - - ptr = reinterpret_cast(malloc(offset + length + 1)); - if (length != fread(&ptr[offset], 1, length, fp)) { - free(ptr); - return NULL; - } - - ptr[offset + length] = '\0'; - size = offset + length; - ::fclose(fp); - return ptr; - } - - aclType FSAILProgram::getNextCompilationStageFromBinary() { - acl_error errorCode; - size_t secSize = 0; - aclType from = ACL_TYPE_DEFAULT; - // Checking llvmir in .llvmir section - bool isLlvmirText = true; - const void *llvmirText = g_complibApi._aclExtractSection(device().compiler(), - binaryElf_, - &secSize, - aclLLVMIR, - &errorCode); - if (errorCode != ACL_SUCCESS) { - isLlvmirText = false; - } - // Checking compile & link options in .comment section - bool isOpts = true; - const void *opts = g_complibApi._aclExtractSection(device().compiler(), - binaryElf_, - &secSize, - aclCOMMENT, - &errorCode); - if (errorCode != ACL_SUCCESS) { - isOpts = false; - } - if (isLlvmirText) { - from = ACL_TYPE_LLVMIR_BINARY; - } else { - if (!isLlvmirText) { - buildLog_ +="Error while linking : \ - Invalid binary (Missing LLVMIR section)\n" ; - } - if (!isOpts) { - buildLog_ +="Warning while linking : \ - Invalid binary (Missing COMMENT section)\n" ; - } - return ACL_TYPE_DEFAULT; - } - bool isHsailText = true; - // Checking HSAIL in .cg section - const void *hsailText = g_complibApi._aclExtractSection(device().compiler(), - binaryElf_, - &secSize, - aclCODEGEN, - &errorCode); - if (errorCode != ACL_SUCCESS) { - isHsailText = false; - } - // Checking BRIG STRTAB in .brig_strtab section - bool isBrigStrtab = true; - const void *brigStrtab = g_complibApi._aclExtractSection(device().compiler(), - binaryElf_, - &secSize, - aclBRIGstrs, - &errorCode); - if (errorCode != ACL_SUCCESS) { - isBrigStrtab = false; - } - // Checking BRIG CODE in .brig_code section - bool isBrigCode = true; - const void *brigCode = g_complibApi._aclExtractSection(device().compiler(), - binaryElf_, - &secSize, - aclBRIGcode, - &errorCode); - if (errorCode != ACL_SUCCESS) { - isBrigCode = false; - } - // Checking BRIG OPERANDS in .brig_operands section - bool isBrigOps = true; - const void *brigOps = g_complibApi._aclExtractSection(device().compiler(), - binaryElf_, - &secSize, - aclBRIGoprs, - &errorCode); - if (errorCode != ACL_SUCCESS) { - isBrigOps = false; - } - if (isHsailText && isBrigStrtab && isBrigCode && isBrigOps) { - from = ACL_TYPE_HSAIL_BINARY; - } else if (!isHsailText && !isBrigStrtab && !isBrigCode && !isBrigOps) { - from = ACL_TYPE_LLVMIR_BINARY; - } else { - if (!isHsailText) { - buildLog_ +="Error while linking : \ - Invalid binary (Missing CG section)\n" ; - } - if (!isBrigStrtab) { - buildLog_ +="Error while linking : \ - Invalid binary (Missing BRIG_STRTAB section)\n" ; - } - if (!isBrigCode) { - buildLog_ +="Error while linking : \ - Invalid binary (Missing BRIG_CODE section)\n" ; - } - if (!isBrigOps) { - buildLog_ +="Error while linking : \ - Invalid binary (Missing BRIG_OPERANDS section)\n" ; - } - return ACL_TYPE_DEFAULT; - } - // Checking ISA in .text section - bool isShaderIsa = true; - const void *shaderIsa = g_complibApi._aclExtractSection(device().compiler(), - binaryElf_, - &secSize, - aclTEXT, - &errorCode); - if (errorCode != ACL_SUCCESS) { - isShaderIsa = false; - } - if (isShaderIsa && from == ACL_TYPE_LLVMIR_BINARY) { - from = ACL_TYPE_DEFAULT; - } - return from; - } - bool FSAILProgram::updateAclBinaryWithKernelIsaAndDebug(std::string kernelName){ - assert(brig_.loadmap_section != NULL); - aclBinary * internalAclBinary = reinterpret_cast(brig_.loadmap_section); - - std::string openClKernelName("&__OpenCL_" + kernelName + "_kernel"); - const oclBIFSymbolStruct* isaSymbolStruct = findBIF30SymStruct(symISABinary); - assert(isaSymbolStruct && "symbol not found"); - std::string kernelIsaSymbol = isaSymbolStruct->str[bif::PRE] + - openClKernelName + isaSymbolStruct->str[bif::POST]; - - const oclBIFSymbolStruct* debugSymbolStruct = findBIF30SymStruct(symDebugInfo); - assert(debugSymbolStruct && "symbol not found"); - //For debug symbols, the PRE is used for BRIG debug and the POST is used for - //ISA debug - std::string kernelIsaDebugSymbol = debugSymbolStruct->str[bif::POST] + openClKernelName; - - //Extract the ISA section - size_t symbolSize; - acl_error errorCode; - const void* isaSymbol = g_complibApi._aclExtractSymbol(device().compiler(), - internalAclBinary, - &symbolSize, - aclTEXT, - kernelIsaSymbol.c_str(), - &errorCode); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Failed to extract ISA for kernel"; - return false; - } - //Insert the ISA section - errorCode = g_complibApi._aclInsertSymbol(device().compiler(), - binaryElf_, - isaSymbol, - symbolSize, - aclTEXT, - kernelIsaSymbol.c_str()); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Failed to insert ISA for kernel"; - return false; - } - const void* debugSymbol = g_complibApi._aclExtractSymbol(device().compiler(), - internalAclBinary, - &symbolSize, - aclHSADEBUG, - kernelIsaDebugSymbol.c_str(), - &errorCode); - //If debug information is available - if (errorCode == ACL_SUCCESS) { - //Update binary with the debug section for the kernel - errorCode = g_complibApi._aclInsertSymbol(device().compiler(), - binaryElf_, - debugSymbol, - symbolSize, - aclHSADEBUG, - kernelIsaDebugSymbol.c_str()); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Failed to insert debug information for kernel"; - return false; - } - } - return true; - } - bool FSAILProgram::ExtractSymbolAndCopy(aclSections id, - const char *symbol_name, - void** address_to_copy, - size_t* symbol_size_bytes, - bool verify) { - acl_error error_code; - *symbol_size_bytes = 0; - const void* symbol_data = g_complibApi._aclExtractSymbol( - device().compiler(), - binaryElf_, - symbol_size_bytes, - id, - symbol_name, - &error_code); - //If the section is not mandatory and the section does not exist - //skip this section - if (error_code != ACL_SUCCESS) { - if (!verify) { - return true; - } - std::string error = "Could not find Brig Directive in BIFF: "; - error += symbol_name; - LogError(error.c_str()); - buildLog_ += error; - return false; - } - *address_to_copy = malloc(*symbol_size_bytes); - if (*address_to_copy == NULL) { - LogError(" Failed to allocate memory"); - return false; - } - memcpy(*address_to_copy, symbol_data, *symbol_size_bytes); - - return true; - } - - bool FSAILProgram::saveBinaryAndSetType(type_t type) { - //Write binary to memory - void *rawBinary = NULL; - size_t size; - if (g_complibApi._aclWriteToMem(binaryElf_, &rawBinary, &size) - != ACL_SUCCESS) { - buildLog_ += "Failed to write binary to memory \n"; - return false; - } - clBinary()->saveBIFBinary((char*)rawBinary, size); - //Set the type of binary - setType(type); - //Free memory containing rawBinary - binaryElf_->binOpts.dealloc(rawBinary); - return true; - } - - bool FSAILProgram::linkImpl(const std::vector &inputPrograms, - amd::option::Options *options, - bool createLibrary) { - std::vector::const_iterator it - = inputPrograms.begin(); - std::vector::const_iterator itEnd - = inputPrograms.end(); - acl_error errorCode; - - // For each program we need to extract the LLVMIR and create - // aclBinary for each - std::vector binaries_to_link; - - for (size_t i = 0; it != itEnd; ++it, ++i) { - FSAILProgram *program = (FSAILProgram *)*it; - // Check if the program was created with clCreateProgramWIthBinary - binary_t binary = program->binary(); - if ((binary.first != NULL) && (binary.second > 0)) { - // Binary already exists -- we can also check if there is no - // opencl source code - // Need to check if LLVMIR exists in the binary - // If LLVMIR does not exist then is it valid - // We need to pull out all the compiled kernels - // We cannot do this at present because we need at least - // Hsail text to pull the kernels oout - void *mem = const_cast(binary.first); - binaryElf_ = g_complibApi._aclReadFromMem(mem, - binary.second, - &errorCode); - - if (errorCode != ACL_SUCCESS) { - LogWarning("Error while linking : Could not read from raw binary"); - return false; - } - } - // At this stage each FSAILProgram contains a valid binary_elf - // Check if LLVMIR is in the binary - // @TODO - Memory leak , cannot free this buffer - // need to fix this.. File EPR on compiler library - size_t llvmirSize = 0; - const void *llvmirText = g_complibApi._aclExtractSection(device().compiler(), - binaryElf_, - &llvmirSize, - aclLLVMIR, - &errorCode); - if (errorCode != ACL_SUCCESS) { - buildLog_ +="Error while linking : \ - Invalid binary (Missing LLVMIR section)" ; - return false; - } - // Create a new aclBinary for each LLVMIR and save it in a list - aclBIFVersion ver = g_complibApi._aclBinaryVersion(binaryElf_); - aclBinary *bin = g_complibApi._aclCreateFromBinary(binaryElf_, ver); - binaries_to_link.push_back(bin); - } - - // At this stage each FSAILProgram in the list has an aclBinary initialized - // and contains LLVMIR - // We can now go ahead and link them. - if (binaries_to_link.size() > 1) { - errorCode = g_complibApi._aclLink(device().compiler(), - binaries_to_link[0], - binaries_to_link.size() - 1, - &binaries_to_link[1], - ACL_TYPE_LLVMIR_BINARY, - "-create-library", - NULL); - } - else { - errorCode = g_complibApi._aclLink(device().compiler(), - binaries_to_link[0], - 0, - NULL, - ACL_TYPE_LLVMIR_BINARY, - "-create-library", - NULL); - } - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Failed to link programs"; - return false; - } - // Store the newly linked aclBinary for this program. - binaryElf_ = binaries_to_link[0]; - // Free all the other aclBinaries - for (size_t i = 1; i < binaries_to_link.size(); i++) { - g_complibApi._aclBinaryFini(binaries_to_link[i]); - } - if (createLibrary) { - saveBinaryAndSetType(TYPE_LIBRARY); - return true; - } - - // Now call linkImpl with the new options - return linkImpl(options); - } - - bool FSAILProgram::loadBrig() { - //Copy all the sections into BRIG - memset(&brig_, 0 ,sizeof(HsaBrig)); - bool codeStatus = ExtractSymbolAndCopy(aclBRIGcode, - "__BRIG__code", - &brig_.code_section, - &brig_.code_section_byte_size, - true - ); - bool oprStatus = ExtractSymbolAndCopy(aclBRIGoprs, - "__BRIG__operands", - &brig_.operand_section, - &brig_.operand_section_byte_size, - true - ); - bool strStatus = ExtractSymbolAndCopy(aclBRIGstrs, - "__BRIG__strtab", - &brig_.string_section, - &brig_.string_section_byte_size, - true - ); - bool dbgStatus = ExtractSymbolAndCopy(aclHSADEBUG , - "__debug_brig__", - &brig_.debug_section, - &brig_.debug_section_byte_size, - false - ); - if (!codeStatus || !oprStatus || !strStatus || !dbgStatus) { - LogError("Failed to Extract one or more BRIG sections"); - buildLog_ += "Error: Failed to Extract one or more BRIG sections"; - return false; - } - if(hsacoreapi->HsaLoadBrig(device_.getBackendDevice(), &brig_) - != kHsaStatusSuccess){ - return false; - } - isBrigLoaded_ = true; - return true; - } - - bool FSAILProgram::unloadBrig() { - if (isBrigLoaded_ == true) { - HsaStatus status = hsacoreapi->HsaUnloadBrig(&brig_); - if (status != kHsaStatusSuccess){ - return false; - } - //Destroy the BRIG - free(brig_.code_section); - free(brig_.operand_section); - free(brig_.string_section); - free(brig_.debug_section); - } - return true; - } - - bool FSAILProgram::linkImpl(amd::option::Options *options) { - acl_error errorCode; - aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY; - //If the binaryElf_ is not set then program must have been created - // using clCreateProgramWithBinary - if (!binaryElf_) { - binary_t binary = this->binary(); - if ((binary.first != NULL) && (binary.second > 0)) { - // Binary already exists -- we can also check if there is no - // opencl source code - // Need to check if LLVMIR exists in the binary - // If LLVMIR does not exist then is it valid - // We need to pull out all the compiled kernels - // We cannot do this at present because we need at least - // Hsail text to pull the kernels oout - void *mem = const_cast(binary.first); - binaryElf_ = g_complibApi._aclReadFromMem(mem, - binary.second, - &errorCode); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error while converting to BRIG: aclBinary init failure \n" ; - LogWarning("aclBinaryInit failed"); - return false; - } - // Check that all needed section also exist in binaryElf_ - // No any validity checks here - continueCompileFrom = getNextCompilationStageFromBinary(); - if (ACL_TYPE_DEFAULT == continueCompileFrom) { - return false; - } - if (ACL_TYPE_HSAIL_BINARY == continueCompileFrom) { - // Save binary in the interface class - // Also load compile & link options from binary into Program class members: - // compileOptions_ & linkOptions_ - setBinary(static_cast(mem), binary.second); - // Compare options loaded from binary with current ones - // If they differ then recompile from ACL_TYPE_LLVMIR_BINARY - // @TODO It is needed to compare options taking into account that: - // 1. options are order independent; - // 2. (may be not trivial) compare only options that affect binary - std::string curOptions = options->origOptionStr + hsailOptions(); - if (compileOptions_ + linkOptions_ != curOptions) { - continueCompileFrom = ACL_TYPE_LLVMIR_BINARY; - } - } - } - } - // Compilation from ACL_TYPE_LLVMIR_BINARY to ACL_TYPE_CG in cases: - // 1. if the program is not created with binary; - // 2. if the program is created with binary and contains only .llvmir & .comment - // 3. if the program is created with binary, contains all brig sections, - // but the binary's compile & link options differ from current ones (recompilation); - if (ACL_TYPE_LLVMIR_BINARY == continueCompileFrom) { - std::string curOptions = options->origOptionStr + hsailOptions(); - errorCode = g_complibApi._aclCompile(device().compiler(), - binaryElf_, - curOptions.c_str(), - ACL_TYPE_LLVMIR_BINARY, - ACL_TYPE_CG, - logFunction); - } - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error while converting to BRIG: Compiling LLVMIR to BRIG \n" ; - return false; - } - //Stop compilation if it is an offline device - HSA runtime does not - //support ISA compiled offline - if (!dev().isOnline()) { - return true; - } - - const HsaDevice *hsaDevice = dev().getBackendDevice(); - if (!loadBrig()) { - buildLog_ += "Error while loading BRIG" ; - return false; - } - - size_t kernelNamesSize = 0; - errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, NULL, NULL, &kernelNamesSize); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error while Finalization phase: kernel names query from the ELF failed\n"; - return false; - } - if (kernelNamesSize > 0) { - char* kernelNames = new char[kernelNamesSize]; - errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, NULL, kernelNames, &kernelNamesSize); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error while Finalization phase: kernel's Metadata is corrupted in the ELF\n"; - delete kernelNames; - return false; - } - std::vector vKernels = splitSpaceSeparatedString(kernelNames); - delete kernelNames; - std::vector::iterator it = vKernels.begin(); - bool dynamicParallelism = false; - for (it; it != vKernels.end(); ++it) { - std::string kernelName = *it; - Kernel *aKernel = new oclhsa::Kernel(kernelName, - this, - &brig_, - options->origOptionStr + hsailOptions()); - if (!aKernel->init() ) { - return false; - } - aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); - // Update the binary in the FSAILProgram to save the ISA and debug information. - // This is so the debugger and the profiler can use the a single aclBinary for all their needs. - if (!updateAclBinaryWithKernelIsaAndDebug(kernelName)) { - return false; - } - kernels()[kernelName] = aKernel; - } - } - saveBinaryAndSetType(TYPE_EXECUTABLE); - buildLog_ += g_complibApi._aclGetCompilerLog(device().compiler()); - return true; - } - - bool FSAILProgram::createBinary(amd::option::Options *options) { - return false; - } - - bool FSAILProgram::initClBinary() { - if (clBinary_ == NULL) { - clBinary_ = new ClBinary(static_cast(device())); - if (clBinary_ == NULL) { - return false; - } - } - return true; - } - - void FSAILProgram::releaseClBinary() { - if (clBinary_ != NULL) { - delete clBinary_; - clBinary_ = NULL; - } - } - - std::string FSAILProgram::hsailOptions() { - std::string hsailOptions; - //Set options for the standard device specific options - //This is just for legacy compiler code - // All our devices support these options now - hsailOptions.append(" -DFP_FAST_FMAF=1"); - hsailOptions.append(" -DFP_FAST_FMA=1"); - //TODO(sramalin) : Query the device for opencl version - // and only set if -cl-std wasn't specified in - // original build options (app) - //hsailOptions.append(" -cl-std=CL1.2"); - //check if the host is 64 bit or 32 bit - LP64_ONLY(hsailOptions.append(" -m64")); - //Now append each extension supported by the device - // one by one - std::string token; - std::istringstream iss(""); - iss.str(device().info().extensions_); - while (getline(iss, token, ' ')) { - if (!token.empty()) { - hsailOptions.append(" -D"); - hsailOptions.append(token); - hsailOptions.append("=1"); - } - } - return hsailOptions; - } - -#endif // WITHOUT_FSA_BACKEND -} // namespace hsa - diff --git a/rocclr/runtime/device/hsa/hsaprogram.hpp b/rocclr/runtime/device/hsa/hsaprogram.hpp deleted file mode 100644 index e1d96f1515..0000000000 --- a/rocclr/runtime/device/hsa/hsaprogram.hpp +++ /dev/null @@ -1,160 +0,0 @@ -// -// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. -// - -#ifndef HSAPROGRAM_HPP_ -#define HSAPROGRAM_HPP_ - -#ifndef WITHOUT_FSA_BACKEND - -#include "hsabinary.hpp" -#include "hsacompilerlib.hpp" -#include "services.h" -#include "acl.h" -#include "oclhsa_common.hpp" -#include -#include -#include -#include -#include "hsadevice.hpp" - -//! \namespace oclhsa HSA Device Implementation -namespace oclhsa { - - //! \class empty program - class FSAILProgram : public device::Program - { - friend class ClBinary; - public: - //! Default constructor - FSAILProgram(oclhsa::NullDevice& device); - //! Default destructor - ~FSAILProgram(); - - // Initialize Binary for GPU (used only for clCreateProgramWithBinary()). - virtual bool initClBinary(char *binaryIn, size_t size); - - //! Returns the aclBinary associated with the progrm - const aclBinary* binaryElf() const { - return static_cast(binaryElf_); } - - //! Returns the brig associated with the progrm - const HsaBrig* brig() { - return static_cast(&brig_); } - - const NullDevice& dev() const { return device_; } - //! Returns the hsaBinary associated with the progrm - const HsaDevice* hsaDevice() const { - return dev().getBackendDevice(); - } - - protected: - //! pre-compile setup for GPU - virtual bool initBuild(amd::option::Options* options); - - //! post-compile setup for GPU - virtual bool finiBuild(bool isBuildGood); - - /*! \brief Compiles GPU CL program to LLVM binary (compiler frontend) - * - * \return True if we successefully compiled a GPU program - */ - virtual bool compileImpl( - const std::string& sourceCode, //!< the program's source code - const std::vector& headers, - const char** headerIncludeNames, - amd::option::Options* options //!< compile options's object - ); - - /*! \brief Compiles LLVM binary to FSAIL code (compiler backend: link+opt+codegen) - * - * \return The build error code - */ - int compileBinaryToFSAIL( - amd::option::Options* options //!< options for compilation - ); - - - virtual bool linkImpl(amd::option::Options* options); - - //! Link the device programs. - virtual bool linkImpl (const std::vector& inputPrograms, - amd::option::Options* options, - bool createLibrary); - - virtual bool createBinary(amd::option::Options* options); - - //! Initialize Binary - virtual bool initClBinary(); - - //! Release the Binary - virtual void releaseClBinary(); - - virtual const aclTargetInfo & info(const char * str = ""){ - return info_; - } - - virtual bool isElf(const char* bin) const { - return amd::isElfMagic(bin); - //return false; - } - - //! Returns the binary - // This should ensure that the binary is updated with all the kernels - // ClBinary& clBinary() { return binary_; } - ClBinary* clBinary() { - return static_cast(device::Program::clBinary()); - } - const ClBinary* clBinary() const { - return static_cast(device::Program::clBinary()); - } - - private: - - //! Extracts a symbol from the binaryElf_ - // and copies it to a buffer allocated - // by the function - bool ExtractSymbolAndCopy(aclSections id, - const char *symbol_name, - void** address_to_copy, - size_t* symbol_size_bytes, - bool verify); - //! Extracts the aclBinary used internally within the brig - // and pulls the debug and ISA section for a particular kernel - // and inserts it into aclBinary contained in the program - bool updateAclBinaryWithKernelIsaAndDebug(std::string kernelName); - //! Checks the existence of sections in binaryElf_ - // and calculates the next stage of compilation; - // if set of the section is impossible, then - // binary is invalid and function returns ACL_TYPE_DEFAULT - aclType getNextCompilationStageFromBinary(); - //! Loads the global variables for the BRIG - bool loadBrig(); - //! Unloads the global variables for the BRIG - bool unloadBrig(); - bool saveBinaryAndSetType(type_t type); - //! Disable default copy constructor - FSAILProgram(const FSAILProgram&); - - //! Disable operator= - FSAILProgram& operator=(const FSAILProgram&); - - //! Returns all the options to be appended while passing to the - //compiler library - std::string hsailOptions(); - - std::string openCLSource_; //!< Original OpenCL source - std::string fsailProgram_; //!< FSAIL program after compilation. - std::string llvmBinary_; //!< LLVM IR binary code - //!< aclBinary and aclCompiler - for the compiler libray - aclBinary* binaryElf_; //! -#include - -namespace oclhsa { - -Timestamp::~Timestamp() { - if (signal_ != 0) { - hsacoreapi->HsaDestroySignal(signal_); - } -} - -HsaSignal Timestamp::createSignal() { - start_ = 0; - end_ = 0; - - HsaStatus status = hsacoreapi->HsaCreateSignal(&signal_); - if (status != kHsaStatusSuccess) { - LogError("HsaCreateSignal failed, could not create signal for timestamp"); - return 0; - } - return signal_; -} - -void Timestamp::start() { - start_ = amd::Os::timeNanos(); - signal_ = 0; -} - -void Timestamp::end() { - end_ = amd::Os::timeNanos(); -} - -/** - * @brief Waits on an outstanding kernel without regard to how - * it was dispatched - with or without a signal - * - * @return bool true if Wait returned successfully, false - * otherwise - */ -bool VirtualGPU::releaseGpuMemoryFence() { - - // Return if there is no pending dispatch - if (!hasPendingDispatch_) { - return false; - } - - // Reset the wait on dispatch flag - HsaStatus status; - hasPendingDispatch_ = false; - - // This is the first call to wait on a kernel, issue - // a End Of Pipe - Release_Mem command - HsaQueue *hsaQueue; - hsaQueue = (lastSubmitQueue_ == kHsaQueueTypeCompute) ? - gpu_queue_ : interopQueue_; - if (hsaQueue != NULL) { - status = hsacoreapi->HsaAmdReleaseGpuFence(hsaQueue); - if (status == kHsaStatusSuccess) { - return true; - } - } - - LogError("Call to HsaAmdReleaseGpuFence() failed.\n"); - return false; -} - -VirtualGPU::VirtualGPU(Device &device) - : device::VirtualDevice(device), oclhsa_device_(device) -{ - lastSubmitQueue_ = static_cast(0xFFFF); - gpu_device_ = const_cast(device.getBackendDevice()); - interopQueue_ = NULL; - timestamp_ = NULL; - - // Initialize the last signal and dispatch flags - hasPendingDispatch_ = false; -} - -VirtualGPU::~VirtualGPU() -{ - if (timestamp_ != NULL) { - delete timestamp_; - timestamp_ = NULL; - LogError("There was a timestamp that was not used; deleting."); - } -} - -/* profilingBegin, when profiling is enabled, creates a timestamp to save in - * virtualgpu's timestamp_, and calls start() to get the current host - * timestamp. - */ -void VirtualGPU::profilingBegin(amd::Command &command, bool drmProfiling) -{ - if (command.profilingInfo().enabled_) { - if (timestamp_ != NULL) { - LogWarning("Trying to create a second timestamp in VirtualGPU. \ - This could have unintended consequences."); - return; - } - timestamp_ = new Timestamp; - timestamp_->start(); - } -} - -/* profilingEnd, when profiling is enabled, checks to see if a signal was - * created for whatever command we are running and calls end() to get the - * current host timestamp if no signal is available. It then saves the pointer - * timestamp_ to the command's data. - */ -void VirtualGPU::profilingEnd(amd::Command &command) -{ - if (command.profilingInfo().enabled_) { - if (timestamp_->getSignal() == 0) { - timestamp_->end(); - } - command.setData(reinterpret_cast(timestamp_)); - timestamp_ = NULL; - } -} - -bool VirtualGPU::profilingCollectResults(amd::Command *list) -{ - uint32_t cmdType; - HsaAmdProfileObject profileObj; - Timestamp *ts = NULL; - HsaStatus status; - - amd::Command* current = list; - amd::Command* next = NULL; - - // If the command list is, empty then exit - if (current == NULL) { - return true; - } - - // Determine profiling has been enabled. - if (!current->profilingInfo().enabled_) { - return false; - } - - // This block gets the current device and system clock counters, and uses - // the delta between the two to adjust the device clock to the host domain. - uint64_t endTimeStampGPU = 0; - uint64_t endTimeStamp = 0; - // Device frequency - double deviceNsPerTick = 0; - HsaDeviceClockCounterInfo clockCounterInfo; - if (kHsaStatusSuccess == hsacoreapi->HsaDeviceGetClockCounters(gpu_device_, &clockCounterInfo)) { - // Device frequency - deviceNsPerTick = 1000000000.0 / - clockCounterInfo.device_clock_frequency_hz; - endTimeStampGPU = clockCounterInfo.device_clock_counter * deviceNsPerTick; - // keep this order of operations for accuracy - endTimeStamp = clockCounterInfo.system_clock_counter * - (1000000000.0 / clockCounterInfo.system_clock_frequency_hz); - } else { - LogWarning("Could not get device/system counters. Device times could be off."); - endTimeStamp = amd::Os::timeNanos(); - } - - uint64_t startTimeStamp = endTimeStamp; - uint64_t readjustTimeGPU = 0; - if (endTimeStampGPU != 0) { - readjustTimeGPU = endTimeStampGPU - endTimeStamp; - } - - // This block gets the first valid timestamp from the first command that has - // one. This timestamp is used below to mark any command that came before - // it to start and end with this first valid start time. - current = list; - while (current != NULL) { - cmdType = current->type(); - if (current->data() != NULL) { - ts = reinterpret_cast(current->data()); - if (ts->getSignal() != 0) { - status = hsacoreapi->HsaAmdGetProfileObject(ts->getSignal(), &profileObj); - if (status != kHsaStatusSuccess) { - LogError("Error reading profile data."); - continue; - } - startTimeStamp = *profileObj.launch_time_ * deviceNsPerTick; - startTimeStamp -= readjustTimeGPU; - endTimeStamp = startTimeStamp; - } else { - startTimeStamp = ts->getStart(); - endTimeStamp = ts->getStart(); - } - break; - } - current = current->getNext(); - } - - // Iterate through the list of commands, and set timestamps as appropriate - // Note, if a command does not have a timestamp, it does one of two things: - // - if the command (without a timestamp), A, precedes another command, C, - // that _does_ contain a valid timestamp, command A will set RUNNING and - // COMPLETE with the RUNNING (start) timestamp from command C. This would - // also be true for command B, which is between A and C. These timestamps - // are actually retrieved in the block above (startTimeStamp, endTimeStamp). - // - if the command (without a timestamp), C, follows another command, A, - // that has a valid timestamp, command C will be set RUNNING and COMPLETE - // with the COMPLETE (end) timestamp of the previous command, A. This is - // also true for any command B, which falls between A and C. - current = list; - while (current != NULL) { - cmdType = current->type(); - if (current->data() != NULL) { - // Since this is a valid command to get a timestamp, we use the - // timestamp provided by the runtime (saved in the data()) - ts = reinterpret_cast(current->data()); - if (ts->getSignal() != 0) { - status = hsacoreapi->HsaAmdGetProfileObject(ts->getSignal(), &profileObj); - if (status != kHsaStatusSuccess) { - LogError("Error reading profile data."); - continue; - } - startTimeStamp = *profileObj.launch_time_ * deviceNsPerTick; - endTimeStamp = *profileObj.completion_time_ * deviceNsPerTick; - startTimeStamp -= readjustTimeGPU; - endTimeStamp -= readjustTimeGPU; - } else { - startTimeStamp = ts->getStart(); - endTimeStamp = ts->getEnd(); - } - delete ts; - current->setData(NULL); - } else { - // If we don't have a command that contains a valid timestamp, we - // simply use the end timestamp of the previous command. - // Note, if this is a command before the first valid timestamp, - // this will be equal to the start timestamp of the first valid - // timestamp at this point. - startTimeStamp = endTimeStamp; - } - - if (current->status() == CL_SUBMITTED) { - current->setStatus(CL_RUNNING, startTimeStamp); - current->setStatus(CL_COMPLETE, endTimeStamp); - } - else if (current->status() != CL_COMPLETE) { - LogPrintfError("Unexpected command status - %d.", current->status()); - } - - next = current->getNext(); - current->release(); - current = next; - } - - // Release the memory blocks allocated for the various - // struct arguments of one or more kernel submissions - std::for_each(kernelArgList_.begin(), - kernelArgList_.end(), - std::ptr_fun(servicesapi->HsaFreeSystemMemory)); - kernelArgList_.clear(); - - // Reset the queue parameter - lastSubmitQueue_ = static_cast(0xFFFF); - - // Return True so that OpenCL commands are - // not processed again - return true; -} - -bool -VirtualGPU::create(HsaQueueType queueType) -{ - //context was created with d3d11 or d3d10 or gl - //extension enabled, RT still needs to create - //two queues even for an interop application. - bool isInterop = (queueType == kHsaQueueTypeInterop); - if (kHsaStatusSuccess != - hsacoreapi->HsaCreateUserModeQueue(gpu_device_, - NULL, - 0, - kHsaQueueTypeCompute, - kHsaQueuePriorityMaximum, - kHsaQueueFractionTen, - &gpu_queue_)) { - LogError("Error creating hsa queue"); - return false; - } - - if ((dev().settings().enableLocalMemory_ || isInterop) && - kHsaStatusSuccess != - hsacoreapi->HsaCreateUserModeQueue(gpu_device_, - NULL, - 0, - kHsaQueueTypeInterop, - kHsaQueuePriorityMaximum, - kHsaQueueFractionTen, - &interopQueue_)) { - LogError("Error creating hsa interop queue"); - return false; - } - - device::BlitManager::Setup blitSetup; - blitMgr_ = new KernelBlitManager(*this, blitSetup); - if ((NULL == blitMgr_) || !blitMgr_->create(oclhsa_device_)) { - LogError("Could not create BlitManager!"); - return false; - } - - return true; -} - -bool -VirtualGPU::terminate() -{ - delete blitMgr_; - - // Release the resources of signal - releaseGpuMemoryFence(); - - // Close the user mode queue - if (interopQueue_) { - hsacoreapi->HsaDestroyUserModeQueue(interopQueue_); - } - hsacoreapi->HsaDestroyUserModeQueue(gpu_queue_); - - return true; -} - -void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand &cmd) -{ - device::Memory *devMem = cmd.source().getDeviceMemory(dev()); - void *dst = cmd.destination(); - amd::Coord3D size = cmd.size(); - - //! @todo: add multi-devices synchronization when supported. - - cl_command_type type = cmd.type(); - bool result = false; - bool imageBuffer = false; - - // Force buffer read for IMAGE1D_BUFFER - if ((type == CL_COMMAND_READ_IMAGE) && - (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - type = CL_COMMAND_READ_BUFFER; - imageBuffer = true; - } - - profilingBegin(cmd); - - switch (type) { - case CL_COMMAND_READ_BUFFER: { - amd::Coord3D origin(cmd.origin()[0]); - if (imageBuffer) { - size_t elemSize = - cmd.source().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - result = blitMgr().readBuffer( - *devMem, dst, origin, size, - cmd.isEntireMemory()); - break; - } - case CL_COMMAND_READ_BUFFER_RECT: { - result = blitMgr().readBufferRect( - *devMem, dst, cmd.bufRect(), cmd.hostRect(), size, - cmd.isEntireMemory()); - break; - } - case CL_COMMAND_READ_IMAGE: { - result = blitMgr().readImage( - *devMem, dst, cmd.origin(), size, cmd.rowPitch(), - cmd.slicePitch(), cmd.isEntireMemory()); - break; - } - default: - ShouldNotReachHere(); - break; - } - - profilingEnd(cmd); - - if (!result) { - LogError("submitReadMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); - } -} - -void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand &cmd) -{ - device::Memory *devMem = cmd.destination().getDeviceMemory(dev()); - const char *src = static_cast(cmd.source()); - amd::Coord3D size = cmd.size(); - - //! @todo add multi-devices synchronization when supported. - - cl_command_type type = cmd.type(); - bool result = false; - bool imageBuffer = false; - - // Force buffer write for IMAGE1D_BUFFER - if ((type == CL_COMMAND_WRITE_IMAGE) && - (cmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - type = CL_COMMAND_WRITE_BUFFER; - imageBuffer = true; - } - - profilingBegin(cmd); - - switch (type) { - case CL_COMMAND_WRITE_BUFFER: { - amd::Coord3D origin(cmd.origin()[0]); - if (imageBuffer) { - size_t elemSize = - cmd.destination().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - result = blitMgr().writeBuffer( - src, *devMem , origin, size, - cmd.isEntireMemory()); - break; - } - case CL_COMMAND_WRITE_BUFFER_RECT: { - result = blitMgr().writeBufferRect( - src, *devMem, cmd.hostRect(), cmd.bufRect(), size, - cmd.isEntireMemory()); - break; - } - case CL_COMMAND_WRITE_IMAGE: { - result = blitMgr().writeImage( - src, *devMem, cmd.origin(), size, cmd.rowPitch(), - cmd.slicePitch(), cmd.isEntireMemory()); - break; - } - default: - ShouldNotReachHere(); - break; - } - - if (!result) { - LogError("submitWriteMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); - } - else { - cmd.destination().signalWrite(&dev()); - } - - profilingEnd(cmd); -} - -void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand &cmd) -{ - device::Memory *srcDevMem = cmd.source().getDeviceMemory(dev()); - device::Memory *destDevMem = cmd.destination().getDeviceMemory(dev()); - amd::Coord3D size = cmd.size(); - - //! @todo add multi-devices synchronization when supported. - - cl_command_type type = cmd.type(); - bool result = false; - bool srcImageBuffer = false; - bool dstImageBuffer = false; - - // Force buffer copy for IMAGE1D_BUFFER - if (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) { - srcImageBuffer = true; - type = CL_COMMAND_COPY_BUFFER; - } - if (cmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) { - dstImageBuffer = true; - type = CL_COMMAND_COPY_BUFFER; - } - - profilingBegin(cmd); - - switch (cmd.type()) { - case CL_COMMAND_COPY_BUFFER: { - amd::Coord3D srcOrigin(cmd.srcOrigin()[0]); - amd::Coord3D dstOrigin(cmd.dstOrigin()[0]); - - if (srcImageBuffer) { - const size_t elemSize = - cmd.source().asImage()->getImageFormat().getElementSize(); - srcOrigin.c[0] *= elemSize; - if (dstImageBuffer) { - dstOrigin.c[0] *= elemSize; - } - size.c[0] *= elemSize; - } - else if (dstImageBuffer) { - const size_t elemSize = - cmd.destination().asImage()->getImageFormat().getElementSize(); - dstOrigin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - - result = blitMgr().copyBuffer( - *srcDevMem, *destDevMem, srcOrigin, - dstOrigin, size, cmd.isEntireMemory()); - break; - } - case CL_COMMAND_COPY_BUFFER_RECT: { - result = blitMgr().copyBufferRect( - *srcDevMem, *destDevMem, cmd.srcRect(), - cmd.dstRect(), size, cmd.isEntireMemory()); - break; - } - case CL_COMMAND_COPY_IMAGE: { - result = blitMgr().copyImage( - *srcDevMem, *destDevMem, cmd.srcOrigin(), - cmd.dstOrigin(), size, cmd.isEntireMemory()); - break; - } - case CL_COMMAND_COPY_IMAGE_TO_BUFFER: { - result = blitMgr().copyImageToBuffer( - *srcDevMem, *destDevMem, cmd.srcOrigin(), - cmd.dstOrigin(), size, cmd.isEntireMemory()); - break; - } - case CL_COMMAND_COPY_BUFFER_TO_IMAGE: { - result = blitMgr().copyBufferToImage( - *srcDevMem, *destDevMem, cmd.srcOrigin(), - cmd.dstOrigin(), size, cmd.isEntireMemory()); - break; - } - default: - ShouldNotReachHere(); - break; - } - - if (!result) { - LogError("submitCopyMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); - } - - profilingEnd(cmd); - - cmd.destination().signalWrite(&dev()); -} - -void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd) -{ - //! @todo add multi-devices synchronization when supported. - - profilingBegin(cmd); - - device::Memory *devMemory = cmd.memory().getDeviceMemory(dev(), false); - - cl_command_type type = cmd.type(); - bool imageBuffer = false; - - // Force buffer read for IMAGE1D_BUFFER - if ((type == CL_COMMAND_MAP_IMAGE) && - (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - type = CL_COMMAND_MAP_BUFFER; - imageBuffer = true; - } - - cl_map_flags mapFlag = cmd.mapFlags(); - - // Treat no map flag as read-write. - if (mapFlag == 0) { - mapFlag = CL_MAP_READ | CL_MAP_WRITE; - } - - // Save map write requirement. - if (mapFlag & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) { - devMemory->saveMapInfo(cmd.origin(), cmd.size(), - mapFlag, cmd.isEntireMemory()); - } - - // Sync to the map target. - if ((!devMemory->isHostMemDirectAccess()) && - (mapFlag & (CL_MAP_READ | CL_MAP_WRITE))) { - bool result = false; - - oclhsa::Memory *hsaMemory = static_cast(devMemory); - - amd::Memory* mapMemory = hsaMemory->mapMemory(); - void *hostPtr = mapMemory == NULL ? - hsaMemory->owner()->getHostMem() : - mapMemory->getHostMem(); - - if (type == CL_COMMAND_MAP_BUFFER) { - amd::Coord3D origin(cmd.origin()[0]); - amd::Coord3D size(cmd.size()[0]); - if (imageBuffer) { - size_t elemSize = - cmd.memory().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - result = blitMgr().readBuffer( - *hsaMemory, - static_cast(hostPtr) + origin[0], - origin, - size, - cmd.isEntireMemory()); - } - else if (type == CL_COMMAND_MAP_IMAGE) { - amd::Image* image = cmd.memory().asImage(); - result = blitMgr().readImage( - *hsaMemory, hostPtr, amd::Coord3D(0), - image->getRegion(), image->getRowPitch(), - image->getSlicePitch(), true); - } - else { - ShouldNotReachHere(); - } - - if (!result) { - LogError("submitMapMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); - } - } - - profilingEnd(cmd); -} - -void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand &cmd) -{ - profilingBegin(cmd); - - device::Memory *devMemory = cmd.memory().getDeviceMemory(dev(), false); - - // Force buffer write for IMAGE1D_BUFFER - bool imageBuffer = - (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER); - - if (devMemory->isUnmapWrite()) { - // Commit the changes made by the user. - if (!devMemory->isHostMemDirectAccess()) { - bool result = false; - - if (cmd.memory().asImage() && !imageBuffer) { - amd::Image *image = cmd.memory().asImage(); - result = blitMgr().writeImage( - cmd.mapPtr(), *devMemory, - devMemory->writeMapInfo()->origin_, - devMemory->writeMapInfo()->region_, - image->getRowPitch(), image->getSlicePitch()); - } - else { - amd::Coord3D origin(devMemory->writeMapInfo()->origin_[0]); - amd::Coord3D size(devMemory->writeMapInfo()->region_[0]); - if (imageBuffer) { - size_t elemSize = - cmd.memory().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - result = blitMgr().writeBuffer( - cmd.mapPtr(), *devMemory, - origin, - size); - } - - if (!result) { - LogError("submitMapMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); - } - } - - devMemory->clearUnmapFlags(); - - cmd.memory().signalWrite(&dev()); - } - - profilingEnd(cmd); -} - -void VirtualGPU::submitFillMemory(amd::FillMemoryCommand &cmd) -{ - device::Memory *devMemory = cmd.memory().getDeviceMemory(dev(), false); - - //! @todo add multi-devices synchronization when supported. - - cl_command_type type = cmd.type(); - bool result = false; - bool imageBuffer = false; - float fillValue[4]; - - // Force fill buffer for IMAGE1D_BUFFER - if ((type == CL_COMMAND_FILL_IMAGE) && - (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - type = CL_COMMAND_FILL_BUFFER; - imageBuffer = true; - } - - profilingBegin(cmd); - - // Find the the right fill operation - switch (type) { - case CL_COMMAND_FILL_BUFFER: { - const void* pattern = cmd.pattern(); - size_t patternSize = cmd.patternSize(); - amd::Coord3D origin(cmd.origin()[0]); - amd::Coord3D size(cmd.size()[0]); - // Reprogram fill parameters if it's an IMAGE1D_BUFFER object - if (imageBuffer) { - size_t elemSize = - cmd.memory().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - memset(fillValue, 0, sizeof(fillValue)); - cmd.memory().asImage()->getImageFormat().formatColor(pattern, fillValue); - pattern = fillValue; - patternSize = elemSize; - } - result = blitMgr().fillBuffer( - *devMemory, pattern, patternSize, origin, size, - cmd.isEntireMemory()); - break; - } - case CL_COMMAND_FILL_IMAGE: { - result = blitMgr().fillImage( - *devMemory, cmd.pattern(), cmd.origin(), cmd.size(), - cmd.isEntireMemory()); - break; - } - default: - ShouldNotReachHere(); - break; - } - - if (!result) { - LogError("submitFillMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); - } - - cmd.memory().signalWrite(&dev()); - - profilingEnd(cmd); -} - -void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand &vcmd) -{ - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - - profilingBegin(vcmd); - - std::vector::const_iterator itr; - - for (itr = vcmd.memObjects().begin(); - itr != vcmd.memObjects().end(); - itr++) { - // Find device memory - device::Memory *m = (*itr)->getDeviceMemory(dev()); - oclhsa::Memory *memory = static_cast(m); - - if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) { - //! @todo revisit this when multi devices is supported. - } else if (vcmd.migrationFlags() & - CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) { - //! @todo revisit this when multi devices is supported. - } else { - LogWarning("Unknown operation for memory migration!"); - } - } - - profilingEnd(vcmd); -} - -HsaStatus VirtualGPU::getDispatchConfig(uint32_t lds_size, - bool profile_enable, - HsaDispatchConfig* config, - const amd::NDRangeContainer& sizes, - const amd::Kernel& kernel) -{ - uint32_t idx; - uint32_t dimensions; - - //Used to detect whether runtime implemetation should - //set up the work group size - bool overrideLwgSize = true; - - device::Kernel *devKernel = const_cast - (kernel.getDeviceKernel(dev())); - - // Initialize the work grid parameter - for (idx = 0; idx < 3; idx++) { - config->local_work_size.dimension[idx] = 1; - config->global_work_size.dimension[idx] = 1; - config->global_work_offset.dimension[idx] = 0; - } - - // Retrieve user provided work grid values - dimensions = sizes.dimensions(); - amd::NDRange local(sizes.local()); - amd::NDRange global(sizes.global()); - amd::NDRange offset(sizes.offset()); - - // Update the work grid with user provided values - for (idx = 0; idx < dimensions; idx++) { - config->global_work_size.dimension[idx] = global[idx]; - - config->global_work_offset.dimension[idx] = offset[idx]; - - //if reqd_work_group_size is set use that - //otherwise use the ones passed into NDRange - //In both cases, no need to further override work group size - if (devKernel->workGroupInfo()->compileSize_[idx]) { - config->local_work_size.dimension[idx] = - devKernel->workGroupInfo()->compileSize_[idx]; - overrideLwgSize = false; - } - else if (local[idx]) { - config->local_work_size.dimension[idx] = local[idx]; - overrideLwgSize = false; - } - } - - //If true, set work group sizes - if (overrideLwgSize) { - if (dimensions == 1) { - config->local_work_size.dimension[0] = - dev().settings().maxWorkGroupSize_; - } - else if (dimensions == 2) { - config->local_work_size.dimension[0] = - dev().settings().maxWorkGroupSize2DX_; - config->local_work_size.dimension[1] = - dev().settings().maxWorkGroupSize2DY_; - } - else if (dimensions == 3) { - config->local_work_size.dimension[0] = - dev().settings().maxWorkGroupSize3DX_; - config->local_work_size.dimension[1] = - dev().settings().maxWorkGroupSize3DY_; - config->local_work_size.dimension[2] = - dev().settings().maxWorkGroupSize3DZ_; - } - else { - assert("Invalid Work Dimensions"); - } - } - // Update Local Data Store and Profiling parameters - config->lds_size = lds_size; - config->work_dimensions = dimensions; - config->profile = profile_enable; - return kHsaStatusSuccess; -} - -HsaStatus VirtualGPU::synchronizeInterQueueKernels(HsaQueue *dispatchQueue) { - - // Determine current kernel type based on queue used to submit - HsaQueueType currQueue = (dispatchQueue == gpu_queue_) ? - kHsaQueueTypeCompute : kHsaQueueTypeInterop; - - // An outstanding kernel exists, a new one can be submitted - // as long as it belongs to the same class of queue type - if (lastSubmitQueue_ == currQueue) { - return kHsaStatusSuccess; - } - - // If there is no outstanding kernel, a new one can be - // submitted unconditionally - if (lastSubmitQueue_ == 0xFFFF) { - lastSubmitQueue_ = currQueue; - return kHsaStatusSuccess; - } - - // Current kernel submit cannot occur until all outstanding - // kernels on the queue type have completed. - releaseGpuMemoryFence(); - lastSubmitQueue_ = currQueue; - return kHsaStatusSuccess; -} - -/*! \brief Writes to the buffer and incrememts the write pointer to the - * buffer. Also, ensures that the argument is written to an - * aligned memory as specified - * - * @param dst The write pointer to the buffer - * @param src The source pointer - * @param size The size in bytes to copy - * @param alignment The alignment to follow while writing to the buffer - */ -static void -addArg(unsigned char** dst, const void* src, - size_t size, uint32_t alignment) -{ - *dst = amd::alignUp(*dst, alignment); - memcpy(*dst, src, size); - *dst += size; -} - -static inline void -addArg(unsigned char** dst, const void* src, size_t size) -{ - assert(size < UINT32_MAX); - addArg(dst, src, size, size); -} - -static void -fillSampleDescriptor(HsaSamplerDescriptor& samplerDescriptor, - const amd::Sampler& sampler) -{ - samplerDescriptor.filterType = sampler.filterMode() == CL_FILTER_NEAREST ? - HSA_SAMP_FILTER_NEAREST : HSA_SAMP_FILTER_LINEAR; - samplerDescriptor.coordinateMode = sampler.normalizedCoords() ? - HSA_SAMP_COORDINATE_NORMALIZED : HSA_SAMP_COORDINATE_UNNORMALIZED; - HsaSamplerAddressMode mode = HSA_SAMP_ADDRESS_NONE; - switch (sampler.addressingMode()) { - case CL_ADDRESS_CLAMP_TO_EDGE: - mode = HSA_SAMP_ADDRESS_CLAMPEDGE; - break; - case CL_ADDRESS_REPEAT: - mode = HSA_SAMP_ADDRESS_WRAP; - break; - case CL_ADDRESS_CLAMP: - mode = HSA_SAMP_ADDRESS_CLAMPBORDER; - break; - case CL_ADDRESS_MIRRORED_REPEAT: - mode = HSA_SAMP_ADDRESS_MIRROR; - break; - case CL_ADDRESS_NONE: - mode = HSA_SAMP_ADDRESS_MIRRORONCE; - break; - default: - return; - } - samplerDescriptor.addressModeX = mode; - samplerDescriptor.addressModeY = mode; - samplerDescriptor.addressModeZ = mode; -} - -bool -VirtualGPU::submitKernelInternal( - const amd::NDRangeContainer& sizes, - const amd::Kernel& kernel, - const_address parameters, - void *eventHandle) -{ - device::Kernel *devKernel = const_cast - (kernel.getDeviceKernel(dev())); - Kernel &gpuKernel = static_cast(*devKernel); - HsaKernelCode *kernelCode = const_cast(gpuKernel.kernelCode()); - const size_t compilerLdsUsage = kernelCode->workgroup_group_segment_byte_size; - size_t ldsUsage = compilerLdsUsage; - bool useInteropQueue = false; - - // Allocate buffer to hold kernel arguments - address argBuffer = NULL; - HsaStatus status = servicesapi->HsaAllocateSystemMemory( - kernelCode->kernarg_segment_byte_size, 256, - kHsaSystemMemoryTypeUncached, reinterpret_cast(&argBuffer)); - if (status != kHsaStatusSuccess) { - LogError("Out of memory"); - return false; - } - kernelArgList_.push_back(argBuffer); - address argPtr = argBuffer; - - // The HLC generates 3 additional arguments for the global offsets - for (uint j = 0; j < Kernel::ExtraArguments; ++j) { - const size_t offset = j < sizes.dimensions() ? sizes.offset()[j] : 0; - addArg(&argPtr, &offset, sizeof(size_t)); - } - - const amd::KernelSignature& signature = kernel.signature(); - const amd::KernelParameters& kernelParams = kernel.parameters(); - - // Find all parameters for the current kernel - for (uint i = 0; i != signature.numParameters(); ++i) { - const HsailKernelArg* arg = gpuKernel.hsailArgAt(i); - const_address srcArgPtr = parameters + signature.at(i).offset_; - - if (arg->type_ == HSAIL_ARGTYPE_POINTER ) { - const size_t size = sizeof(size_t); - if (arg->addrQual_ == HSAIL_ADDRESS_LOCAL) { - ldsUsage = amd::alignUp(ldsUsage, arg->alignment_); //!< do we need this? - addArg(&argPtr, &ldsUsage, size); - ldsUsage += *reinterpret_cast(srcArgPtr); - continue; - } - assert((arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) && - "Unsupported address qualifier"); - if (kernelParams.boundToSvmPointer(dev(), parameters, i)) { - addArg(&argPtr, srcArgPtr, size); - continue; - } - amd::Memory* mem = *reinterpret_cast(srcArgPtr); - if (mem == NULL) { - addArg(&argPtr, srcArgPtr, size); - continue; - } - - Memory *devMem = static_cast(mem->getDeviceMemory(dev())); - //! @todo add multi-devices synchronization when supported. - void* globalAddress = devMem->getDeviceMemory(); - addArg(&argPtr, &globalAddress, size); - - //! @todo Compiler has to return read/write attributes - const cl_mem_flags flags = mem->getMemFlags(); - if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) { - mem->signalWrite(&dev()); - } - - useInteropQueue |= devMem->isHsaLocalMemory(); - } - else if (arg->type_ == HSAIL_ARGTYPE_VALUE) { - if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) { - void *mem = NULL; - if (kHsaStatusSuccess != servicesapi->HsaAllocateSystemMemory( - arg->size_, 0, kHsaSystemMemoryTypeUncached, &mem)) { - LogError("Out of memory"); - return false; - } - memcpy(mem, srcArgPtr, arg->size_); - addArg(&argPtr, &mem, sizeof(void*)); - kernelArgList_.push_back(mem); - continue; - } - for (uint e = 0; e < arg->numElem_; ++e) { - addArg(&argPtr, srcArgPtr, arg->size_); - srcArgPtr += arg->size_; - } - } - else if (arg->type_ == HSAIL_ARGTYPE_IMAGE) { - amd::Memory* mem = *reinterpret_cast(srcArgPtr); - Image* image = static_cast(mem->getDeviceMemory(dev())); - if (image == NULL) { - LogError( "Kernel image argument is not an image object"); - return false; - } - - // Image arguments are of size 48 bytes and are aligned to 16 bytes - addArg(&argPtr, image->getHsaImageObjectAddress(), - HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT); - - //! @todo Compiler has to return read/write attributes - const cl_mem_flags flags = mem->getMemFlags(); - if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) { - mem->signalWrite(&dev()); - } - - useInteropQueue |= image->isHsaLocalMemory(); - } - else { - assert((arg->type_ == HSAIL_ARGTYPE_SAMPLER) && - "Unsupported address type"); - amd::Sampler* sampler = *reinterpret_cast(srcArgPtr); - if (sampler == NULL) { - LogError("Kernel sampler argument is not an sampler object"); - return false; - } - - HsaSamplerDescriptor samplerDescriptor; - fillSampleDescriptor(samplerDescriptor, *sampler); - - argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT); - status = hsacoreapi->HsaCreateDeviceSampler(dev().getBackendDevice(), - &samplerDescriptor, argPtr); - if (status != kHsaStatusSuccess) { - LogError("Error creating device sampler object!"); - return false; - } - argPtr += HSA_SAMPLER_OBJECT_SIZE; - } - } - - // Check there is no arguments' buffer overflow - assert(argPtr <= argBuffer + kernelCode->kernarg_segment_byte_size); - - // Check for group memory overflow - //! @todo Check should be in HSA - here we should have at most an assert - if (ldsUsage > gpu_device_->group_memory_size) { - LogError("No local memory available\n"); - return false; - } - - HsaQueue *queue = useInteropQueue ? interopQueue_ : gpu_queue_; - - // Set the acl_binary and ocl event for possible debugger use - if (eventHandle != NULL) { - const HsaDevice *device = queue->device; - servicesapi->HsaDebuggerCorrelationHandler(device, eventHandle); - assert(gpuKernel.brig()->loadmap_section != NULL); - void * acl_binary = - reinterpret_cast(gpuKernel.brig()->loadmap_section); - servicesapi->HsaSetAclBinary(device, - const_cast(gpuKernel.program()->binaryElf())); - } - - // Obtain handle to an instance of Dispatch configuration object - HsaDispatchConfig config; - bool profilingEnable = timestamp_ != NULL; - status = getDispatchConfig(ldsUsage - compilerLdsUsage, profilingEnable, - &config, sizes, kernel); - if (status != kHsaStatusSuccess) { - LogError("Call to HsaPopulateDispatchConfig failed.\n"); - return false; - } - - // Determine if enqueue must wait on last kernel submit - status = synchronizeInterQueueKernels(queue); - if (status != kHsaStatusSuccess) { - LogError("synchronizeInterQueueKernels failed"); - return false; - } - - // Create a signal object to monitor kernel completion when needed - HsaSignal signal = profilingEnable ? timestamp_->createSignal() : 0; - status = servicesapi->HsaDispatchKernel(queue, signal, kernelCode, &config, - (uint64_t*)argBuffer, 1); - if (status != kHsaStatusSuccess) { - LogError("Call to HsaDispatchKernel failed.\n"); - return false; - } - - // Mark the flag indicating if a dispatch is outstanding - hasPendingDispatch_ = true; - return true; -} -/** - * @brief Api to dispatch a kernel for execution. The implementation - * parses the input object, an instance of virtual command to obtain - * the parameters of global size, work group size, offsets of work - * items, enable/disable profiling, etc. - * - * It also parses the kernel arguments buffer to inject into Hsa Runtime - * the list of kernel parameters. - */ -void VirtualGPU::submitKernel(amd::NDRangeKernelCommand &vcmd) { - profilingBegin(vcmd); - - // Submit kernel to HW - if (!submitKernelInternal( - vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), - static_cast(as_cl(&vcmd.event())))) { - vcmd.setStatus(CL_INVALID_OPERATION); - } - - profilingEnd(vcmd); -} - -void VirtualGPU::submitNativeFn(amd::NativeFnCommand &cmd) { - // std::cout<<__FUNCTION__<<" not implemented"<<"*********"<::const_iterator it = vcmd.getMemList().begin(); - amd::InteropObject *interop; - std::vector d3d10Resources; - std::vector d3d11Resources; - amd::D3D10Object *d3d10Obj; - amd::D3D11Object *d3d11Obj; - - for (std::vector::const_iterator it = - vcmd.getMemList().begin(); - it != vcmd.getMemList().end(); it++) { - // amd::Memory object should never be NULL - assert(*it && "Memory object for interop is NULL"); - - device::Memory *m = (*it)->getDeviceMemory(dev()); - oclhsa::Memory *memory = static_cast(m); - - interop = (*it)->getInteropObj(); - // [TODO]: Check if this is need in case of HSA. - - if (interop) { - d3d10Obj = interop->asD3D10Object(); - if (d3d10Obj != NULL) { - if (d3d10Obj->getD3D10ResOrig() != NULL) { - // Resource is a shared copy of original resource - // Need to copy data from original resource - d3d10Obj->copyOrigToShared(); - } - assert(d3d10Obj->getD3D10Resource() != NULL); - d3d10Resources.push_back(d3d10Obj->getD3D10Resource()); - } - - d3d11Obj = interop->asD3D11Object(); - if (d3d11Obj != NULL) { - if (d3d11Obj->getD3D11ResOrig() != NULL) { - // Resource is a shared copy of original resource - // Need to copy data from original resource - d3d11Obj->copyOrigToShared(); - } - assert(d3d11Obj->getD3D11Resource() != NULL); - d3d11Resources.push_back(d3d11Obj->getD3D11Resource()); - } - } - - } //end of for loop - - if (!d3d10Resources.empty()) { - HsaStatus status = hsacoreapi->HsaAcquireD3D10Resources(gpu_device_, - &d3d10Resources[0], - d3d10Resources.size()); - if (status != kHsaStatusSuccess) { - LogError("HsaAcquireD3D10Resources - failed"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - } - - if (!d3d11Resources.empty()) { - HsaStatus status = hsacoreapi->HsaAcquireD3D11Resources(gpu_device_, - &d3d11Resources[0], - d3d11Resources.size()); - if (status != kHsaStatusSuccess) { - LogError("HsaAcquireD3D11Resources - failed"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - } -#endif - - profilingEnd(vcmd); -} - -void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand &vcmd) { - - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - - profilingBegin(vcmd); - std::vector::const_iterator it = vcmd.getMemList().begin(); - - amd::InteropObject *interop; - -#ifdef _WIN32 - std::vector d3d10Resources; - std::vector d3d11Resources; - - amd::D3D10Object *d3d10Obj; - amd::D3D11Object *d3d11Obj; - - for (std::vector::const_iterator it = - vcmd.getMemList().begin(); - it != vcmd.getMemList().end(); it++) { - // amd::Memory object should never be NULL - assert(*it && "Memory object for interop is NULL"); - - device::Memory *m = (*it)->getDeviceMemory(dev()); - oclhsa::Memory *memory = static_cast(m); - interop = (*it)->getInteropObj(); - - if (interop) { - d3d10Obj = interop->asD3D10Object(); - if (d3d10Obj != NULL) { - if (d3d10Obj->getD3D10ResOrig() != NULL) { - // Resource is a shared copy of original resource - // Need to copy data from original resource - d3d10Obj->copySharedToOrig(); - } - assert(d3d10Obj->getD3D10Resource() != NULL); - d3d10Resources.push_back(d3d10Obj->getD3D10Resource()); - } - - d3d11Obj = interop->asD3D11Object(); - if (d3d11Obj != NULL) { - if (d3d11Obj->getD3D11ResOrig() != NULL) { - // Resource is a shared copy of original resource - // Need to copy data from original resource - d3d11Obj->copySharedToOrig(); - } - assert(d3d11Obj->getD3D11Resource() != NULL); - d3d11Resources.push_back(d3d11Obj->getD3D11Resource()); - } - } - } - - if (!d3d10Resources.empty()) { - HsaStatus status = hsacoreapi->HsaReleaseD3D10Resources(gpu_device_, - &d3d10Resources[0], - d3d10Resources.size()); - if (status != kHsaStatusSuccess) { - LogError("HsaReleaseD3D10Resources - failed"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - } - - if (!d3d11Resources.empty()) { - HsaStatus status = hsacoreapi->HsaReleaseD3D11Resources(gpu_device_, - &d3d11Resources[0], - d3d11Resources.size()); - if (status != kHsaStatusSuccess) { - LogError("HsaReleaseD3D11Resources - failed"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - } -#endif // _WIN32 - - profilingEnd(vcmd); -} - -void -VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) -{ - // in-order semantics: previous commands need to be done before we start - releaseGpuMemoryFence(); - - profilingBegin(cmd); - const std::vector& svmPointers = cmd.svmPointers(); - if (cmd.pfnFreeFunc() == NULL) { - // pointers allocated using clSVMAlloc - for (cl_uint i = 0; i < svmPointers.size(); i++) { - amd::SvmBuffer::free(cmd.context(), svmPointers[i]); - } - } - else { - cmd.pfnFreeFunc()(as_cl(cmd.queue()->asCommandQueue()), svmPointers.size(), - (void**) (&(svmPointers[0])), cmd.userData()); - } - profilingEnd(cmd); -} - -void -VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) -{ - releaseGpuMemoryFence(); - profilingBegin(cmd); - SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1); - profilingEnd(cmd); -} - -void -VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) -{ - releaseGpuMemoryFence(); - profilingBegin(cmd); - SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times()); - profilingEnd(cmd); -} - -void -VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) -{ - // no fence is needed since this is a no-op: the command will be completed - // only after all the previous commands are complete - profilingBegin(cmd); - profilingEnd(cmd); -} - -void -VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) -{ - // no fence is needed since this is a no-op: the command will be completed - // only after all the previous commands are complete - profilingBegin(cmd); - profilingEnd(cmd); -} - -void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand &vcmd) { - - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - - HsaPmu hsaPmu = NULL; - HsaStatus status; - const amd::PerfCounterCommand::PerfCounterList counters = vcmd.getCounters(); - for (uint i = 0; i < vcmd.getNumCounters(); ++i) { - amd::PerfCounter* amdCounter = - static_cast(counters[i]); - const PerfCounter* counter = - reinterpret_cast(amdCounter->getDeviceCounter()); - - // Make sure we have a valid gpu performance counter - if (NULL == counter) { - if (hsaPmu == NULL) { - status = servicesapi->HsaCreatePmu(gpu_device_, &hsaPmu); - if (status != kHsaStatusSuccess) { - LogError("HsaCreatePmu - failed"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - } - - amd::PerfCounter::Properties prop = amdCounter->properties(); - PerfCounter* hsaCounter = new PerfCounter( - gpu_device_, - *this, - prop[CL_PERFCOUNTER_GPU_BLOCK_INDEX], - prop[CL_PERFCOUNTER_GPU_COUNTER_INDEX], - prop[CL_PERFCOUNTER_GPU_EVENT_INDEX]); - if (NULL == hsaCounter) { - LogError("We failed to allocate memory for the GPU perfcounter"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - else if (hsaCounter->create(hsaPmu)) { - amdCounter->setDeviceCounter(hsaCounter); - } - else { - LogPrintfError("We failed to allocate a perfcounter in Hsa.\ - Block: %d, counter: #d, event: %d", - hsaCounter->info()->blockIndex_, - hsaCounter->info()->counterIndex_, - hsaCounter->info()->eventIndex_); - delete hsaCounter; - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - counter = NULL; - } - } - - if (vcmd.getState() == amd::PerfCounterCommand::Begin) { - hsaPmu = NULL; - for (uint i = 0; i < vcmd.getNumCounters(); ++i) { - amd::PerfCounter* amdCounter = - static_cast(counters[i]); - const PerfCounter* counter = - static_cast(amdCounter->getDeviceCounter()); - - if (hsaPmu != counter->getCounterPmu()) { - hsaPmu = counter->getCounterPmu(); - status = servicesapi->HsaPmuBegin(hsaPmu, gpu_queue_, true); - if (status != kHsaStatusSuccess) { - LogError("HsaPmuBegin failed"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - } - } - } - else if (vcmd.getState() == amd::PerfCounterCommand::End) { - hsaPmu = NULL; - for (uint i = 0; i < vcmd.getNumCounters(); ++i) { - amd::PerfCounter* amdCounter = - static_cast(counters[i]); - const PerfCounter* counter = - static_cast(amdCounter->getDeviceCounter()); - - if (hsaPmu != counter->getCounterPmu()) { - hsaPmu = counter->getCounterPmu(); - status = servicesapi->HsaPmuEnd(hsaPmu, gpu_queue_); - if (status != kHsaStatusSuccess) { - LogError("HsaPmuEnd failed"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - - status = servicesapi->HsaPmuWaitForCompletion(hsaPmu, HSA_TIMEOUT_INFINITE); - if (status != kHsaStatusSuccess) { - LogError("HsaPmuWaitForCompletion failed"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } - } - } - } - else { - LogError("Unsupported performance counter state"); - vcmd.setStatus(CL_INVALID_OPERATION); - return; - } -} - -void VirtualGPU::flush(amd::Command *list, bool wait) { - - /** - * VT TODO temporarily setting the status complete at flush - * This is not the correct way of handling completion, the - * correct way is to either register a callback that sets - * command status or tie-in event from higher levels to HSA - * Event. There are no known thread safety issues if an HSA - * event is exposed to OCL level and mapped to its event - * - * list->setStatus(CL_COMPLETE); - */ - amd::Command *current = list; - - // Query the status of openCL kernel task i.e. is still - // running or has completed. - releaseGpuMemoryFence(); - - // If profiling is enabled collect the results - if (profilingCollectResults(list)) { - return; - } - - // The openCL task has completed successfully - while (current != NULL) { - - // @note: Currently Commands coming into Hsa Runtime - // already have their status set as CL_SUBMITTED - // SUBMITTED -> RUNNING -> COMPLETE - if (current->status() == CL_SUBMITTED) { - current->setStatus(CL_RUNNING); - current->setStatus(CL_COMPLETE); - } - else if (current->status() == CL_RUNNING) { - current->setStatus(CL_COMPLETE); - } - - // Get the next command in the list for updates and free current. - amd::Command *next = current->getNext(); - current->release(); - current = next; - } - - // Release the memory blocks allocated for the various - // struct arguments of one or more kernel submissions - std::for_each(kernelArgList_.begin(), - kernelArgList_.end(), - std::ptr_fun(servicesapi->HsaFreeSystemMemory)); - kernelArgList_.clear(); - - // Reset the queue parameter - lastSubmitQueue_ = static_cast(0xFFFF); -} -} // End of oclhsa namespace diff --git a/rocclr/runtime/device/hsa/hsavirtual.hpp b/rocclr/runtime/device/hsa/hsavirtual.hpp deleted file mode 100644 index 8ab98c05c3..0000000000 --- a/rocclr/runtime/device/hsa/hsavirtual.hpp +++ /dev/null @@ -1,181 +0,0 @@ -// -// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. -// - -#ifndef HSAVIRTUAL_HPP_ -#define HSAVIRTUAL_HPP_ -#include "hsadevice.hpp" -#include "services.h" -#include "utils/util.hpp" - -namespace oclhsa { -class Device; - -// Timestamp for keeping track of some profiling information for various events -// including EnqueueNDRangeKernel and clEnqueueCopyBuffer. -class Timestamp { -private: - HsaSignal signal_; - uint64_t start_; - uint64_t end_; - -public: - // get-ers - uint64_t getStart() const { return start_; } - uint64_t getEnd() const { return end_; } - HsaSignal getSignal() const { return signal_; } - - // Default constructor - Timestamp() - : signal_(0), - start_(0), - end_(0) {} - - // Deconstructor, which will delete the signal if we created one - ~Timestamp(); - - // Creates a signal for the timestamp, saves it, and returns it - HsaSignal createSignal(); - - // Start a timestamp (get timestamp from OS) - void start(); - - // End a timestamp (get timestamp from OS) - void end(); -}; - -class VirtualGPU : public device::VirtualDevice { -public: - VirtualGPU(Device &device); - ~VirtualGPU(); - - bool create(HsaQueueType queueType); - bool terminate(); - - void profilingBegin(amd::Command &command, bool drmProfiling = false); - const Device& dev() const { return oclhsa_device_; } - //! End the command profiling - void profilingEnd(amd::Command &command); - - //! Collect the profiling results - bool profilingCollectResults( - amd::Command* list //!< List of all commands in the batch. - ); - void submitReadMemory(amd::ReadMemoryCommand& cmd); - void submitWriteMemory(amd::WriteMemoryCommand& cmd); - void submitCopyMemory(amd::CopyMemoryCommand& cmd); - void submitMapMemory(amd::MapMemoryCommand& cmd); - void submitUnmapMemory(amd::UnmapMemoryCommand& cmd); - void submitKernel(amd::NDRangeKernelCommand& cmd); - bool submitKernelInternal( - const amd::NDRangeContainer& sizes, //!< Workload sizes - const amd::Kernel& kernel, //!< Kernel for execution - const_address parameters, //!< Parameters for the kernel - void *event_handle //!< Handle to OCL event for debugging - ); - void submitNativeFn(amd::NativeFnCommand& cmd); - void submitMarker(amd::Marker& cmd); - void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd); - void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd); - void submitPerfCounter(amd::PerfCounterCommand& cmd); - void flush(amd::Command* list = NULL, bool wait = false); - void submitFillMemory(amd::FillMemoryCommand& cmd); - void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd); - -// { oclhsa OpenCL integration -// Added these stub (no-ops) implementation of pure virtual methods, -// when integrating HSA and OpenCL branches. -// TODO: After inegration, whoever is working on VirtualGPU should write -// actual implemention. - virtual void submitSignal(amd::SignalCommand &cmd) {} - virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand &cmd) {} - virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd); - virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd); - virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd); - virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd); - virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd); - void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand &cmd) {} - void submitThreadTrace(amd::ThreadTraceCommand &vcmd) {} - - /** - * @brief Waits on an outstanding kernel without regard to how - * it was dispatched - with or without a signal - * - * @return bool true if Wait returned successfully, false - * otherwise - */ - bool releaseGpuMemoryFence(); -// } oclhsa OpenCL integration -private: - /** - * @brief Retrieves the various configuration parameters that could - * be used to execute a kernel - Enable Profiling, Sizes of Global, - * Local work spaces, offsets for global Id, etc. - * - * @note: The implementation currently does not verify if the input - * parameters for global, local and offset arrays are valid. For - * example, it assumes that the values that are passed in conform to - * openCL properties such as: CL_DEVICE_MAX_WORK_ITEM_SIZES, - * CL_DEVICE_MAX_WORK_GROUP_SIZE, etc - * - * @param lds_size The amount of LDS memory used in the kernel. - * - * @param profile_enable Flag to enable kernel profiling. - * - * @param config Output parameter updated with various execution - * policy paramters. - * - * @param sizes The work item and work group size. - * - * @return HsaStatus ::kHsaStatusSuccess or ::kHsaStatusError - */ - HsaStatus getDispatchConfig( - uint32_t lds_size, - bool profile_enable, - HsaDispatchConfig* config, - const amd::NDRangeContainer& sizes, - const amd::Kernel& kernel); - - /** - * @brief Synchronize kernel submits across different queue types - * i.e. a submit to compute kernel should determine that there is no - * outstanding kernel to another queue type, e.g. interop queue. - * The same applies for submits to interop queues or queues of - * another type. - * - * @param dispatch_queue Queue object into which the current kernel - * would be submitted. - * - * @return HsaStatus ::kHsaStatusSuccess or ::kHsaStatusError - */ - HsaStatus synchronizeInterQueueKernels(HsaQueue *dispatchQueue); - - /** - * @brief Maintains the list of memory blocks allocated - * for one or more kernel submissions - */ - std::vector kernelArgList_; - - /** - * @brief Indicates if a kernel dispatch is outstanding. This flag is - * used to synchronized on kernel outputs. - */ - bool hasPendingDispatch_; - - /** - * @brief Maintains the queue type of the last kernel submit. - * Submission of kernels across queue types must be coordinated - * i.e. all outstanding kernels on one queue type must be finished - * before kernels can be submitted onto a different queue type. - */ - HsaQueueType lastSubmitQueue_; - - Timestamp* timestamp_; - HsaDevice* gpu_device_; //!< Physical device - HsaQueue* gpu_queue_; //!< Queue associated with a gpu - HsaQueue* interopQueue_; //!< Interop queue associated with a gpu - uint32_t dispatch_id_; //!< This variable must be updated atomically. - Device& oclhsa_device_; //!< oclhsa device object -}; -} -#endif diff --git a/rocclr/runtime/device/hsa/oclhsa.def b/rocclr/runtime/device/hsa/oclhsa.def deleted file mode 100644 index ad704b16e8..0000000000 --- a/rocclr/runtime/device/hsa/oclhsa.def +++ /dev/null @@ -1,3 +0,0 @@ -LIBRARY OCLHSA -EXPORTS - diff --git a/rocclr/runtime/device/hsa/oclhsa_common.hpp b/rocclr/runtime/device/hsa/oclhsa_common.hpp deleted file mode 100644 index 741cebce5b..0000000000 --- a/rocclr/runtime/device/hsa/oclhsa_common.hpp +++ /dev/null @@ -1,26 +0,0 @@ -// -// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved. -// - -#ifndef _OPENCL_RUNTIME_DEVICE_HSA_OCLHSA_COMMON_HPP_ -#define _OPENCL_RUNTIME_DEVICE_HSA_OCLHSA_COMMON_HPP_ - -#include "hsacore_symbol_loader.hpp" -#include "services_symbol_loader.hpp" - -#include "hsacoreagent.h" -#include "hsaagent.h" - -#ifdef __cplusplus -extern "C" { -#endif - -extern const HsaCoreApiTable *hsacoreapi; -extern const HsaServicesApiTable *servicesapi; - - -#ifdef __cplusplus -} -#endif - -#endif // header guard diff --git a/rocclr/runtime/device/hsa/services_symbol_loader.cpp b/rocclr/runtime/device/hsa/services_symbol_loader.cpp deleted file mode 100644 index 308a9a3b79..0000000000 --- a/rocclr/runtime/device/hsa/services_symbol_loader.cpp +++ /dev/null @@ -1,52 +0,0 @@ -// -// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved. -// - -// Implementation of the the loading of dll and loading of all the exported -// function symbols. - -#include "device/hsa/services_symbol_loader.hpp" - -#include "runtime/thread/thread.hpp" -#include "runtime/utils/debug.hpp" -#include "runtime/os/os.hpp" - -#include - -#include - -ServicesApiSymbols* ServicesApiSymbols::instance_ = NULL; -// services_dll_handle_ is defined in ServicesApiSymbols class. -// This macro must be used only in member functions of ServicesApiSymbols -// class. -#define LOADSYMBOL(api) \ - api = (pfn_ ## api) amd::Os::getSymbol(services_dll_handle_, # api); \ - if (api == NULL) { \ - amd::log_printf(amd::LOG_ERROR, __FILE__, __LINE__, \ - "amd::Os::getSymbol() for exported func " # api " failed."); \ - amd::Os::unloadLibrary(services_dll_handle_); \ - abort(); \ - } - -ServicesApiSymbols::ServicesApiSymbols() - : services_dll_name_(SERVICES_DLL_NAME) { - services_dll_handle_ = amd::Os::loadLibrary(services_dll_name_.c_str()); - if (services_dll_handle_ == NULL) { -// Do not print, otherwise tests fail when HSA core and services DLLs are -// not installed, in which case only ORCA stack is initialized and it is -// not an error -// amd::log_printf(amd::LOG_INFO, __FILE__, __LINE__, -//"Cannot load hsa servicese dll. HSA DLLs may not be installed on the machine." -//" OpenCL requirement, returning without error."); - return; - } - - LOADSYMBOL(HsaGetServicesApiTable) -} - -ServicesApiSymbols::~ServicesApiSymbols() { - if (services_dll_handle_) { - amd::Os::unloadLibrary(services_dll_handle_); - services_dll_handle_ = NULL; - } -} diff --git a/rocclr/runtime/device/hsa/services_symbol_loader.hpp b/rocclr/runtime/device/hsa/services_symbol_loader.hpp deleted file mode 100644 index 9125d67215..0000000000 --- a/rocclr/runtime/device/hsa/services_symbol_loader.hpp +++ /dev/null @@ -1,78 +0,0 @@ -// -// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved. -// - -#ifndef _OPENCL_RUNTIME_DEVICE_HSA_SERVICES_SYMBOL_LOADER_HPP_ -#define _OPENCL_RUNTIME_DEVICE_HSA_SERVICES_SYMBOL_LOADER_HPP_ - -// File: services_symbol_loader.hpp -// The main purpose of this file (class ServicesApiSymbols), is to load the HSA -// API function symbol HsaGetServicesApiTable() from hsaservices DLL/so module. -// This function outputs HsaServicesApiTable which has pointers to the rest of the -// hsaservices API functions, which should be used to invoke the API functions. - -#include "services.h" -#include "hsainterop.h" -#include "hsaagent.h" - -#include - -// In case of change in the name of hsaservices dll name, change the -// #define SERVICES_DLL_NAME value. this is the only place the DLL name should -// be changed or referred to. -#define SERVICES_DLL_NAME "hsaservices" LP64_ONLY("64") - -// Convention: The typedefed function name must be prefixed with pfn_ indicating -// it as pointer-to-function. -typedef HsaStatus (*pfn_HsaGetServicesApiTable)(const HsaServicesApiTable **api_table); - -// Singleton ServicesApiSymbols class contains the module handle and loaded -// symbols of one accessor API accessor function. -// To call hsaservices API funciton, instance of this class must be used. -// Example: -// // In initialization code -// const HsaServicesApiTable *servicesapi = NULL; -// ServicesApiSymbols::Instance().HsaGetServicesApiTable(&servicesapi); -// ... -// ... -// // Calling the services api. -// servicesapi->HsaGetDevices(...); -// servicesapi->HsaRegisterMemory(...); -class ServicesApiSymbols { - public: - // Only the access function symbol is loaded, which in turn has pointers to - // rest of the hsaservices api. - pfn_HsaGetServicesApiTable HsaGetServicesApiTable; - static ServicesApiSymbols& Instance() { - if (instance_ == NULL) { - instance_ = new ServicesApiSymbols(); - } - - return *instance_; - } - static void teardown(){ - if (instance_ != NULL){ - delete instance_; - } - } - static bool IsDllLoaded(){ - return Instance().services_dll_handle_ ? true : false; - }; - - - private: - - static ServicesApiSymbols* instance_; - // Force singleton pattern. - explicit ServicesApiSymbols(); - ~ServicesApiSymbols(); - ServicesApiSymbols(const ServicesApiSymbols &) {} - const ServicesApiSymbols &operator=(const ServicesApiSymbols &) { - return *this; - } - - // Data. - void *services_dll_handle_; - const std::string services_dll_name_; -}; -#endif // _OPENCL_RUNTIME_DEVICE_HSA_SERVICES_SYMBOL_LOADER_HPP_ diff --git a/rocclr/runtime/device/hsa/system_memory.h b/rocclr/runtime/device/hsa/system_memory.h deleted file mode 100644 index 55602044db..0000000000 --- a/rocclr/runtime/device/hsa/system_memory.h +++ /dev/null @@ -1,97 +0,0 @@ -// -// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved. -// - -/** @file */ - -#ifndef _OPENCL_RUNTIME_DEVICE_HSA_SYSTEM_MEMORY_H_ -#define _OPENCL_RUNTIME_DEVICE_HSA_SYSTEM_MEMORY_H_ - -#include "newcore.h" - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -/** - ******************************************************************************* - * @brief System memory types. - * @details The memory option enumerations are used for specifying the various - * configurable global system memory allocation options. - ******************************************************************************* - */ -typedef enum { - /** - * Memory option used for requesting cacheable system memory. - */ - kHsaAmdSystemMemoryTypeDefault = 0, - - /** - * Memory option used for requesting system memory with caching disabled. - */ - kHsaAmdSystemMemoryTypeUncached = 1, - - /** - * Memory option used for requesting write-combined system memory. - */ - kHsaAmdSystemMemoryTypeWriteCombined = 2, - - /** - * Shortcut to get the number of supported memory type. - */ - kHsaAmdSystemMemoryTypeCount = 3 -} HsaAmdSystemMemoryType; - -/** - **************************************************************************** - * @brief Allocate system memory accessible by all AMD devices in the platform. - * @details The HsaAmdAllocateSystemMemory() interface is used for allocating - * global system memory accessible (read and write) by the host and all AMD - * devices in the platform. - * - * @param size The allocation size in bytes. - * @param alignment The alignment size in bytes for the address of resulting - * allocation. If the value is zero, no particular alignment will be applied. - * If the value is not zero, it needs to be a power of two and minimum of - * sizeof(void*). - * @param type Type of system memory. - * @param address A pointer to the location of where to return the pointer to - * the base of the allocated region of memory. - * - * @return HsaStatus - * @retval kHsaStatusSuccess The requested amount of memory was successfully - * allocated. - * @retval kHsaStatusOutOfMemory The implementation was unable to allocate the - * requested amount of device memory due to memory constraints. - * @retval kHsaStatusInvalidArgument An address of NULL was specified, the size - * is 0 or the alignment is invalid. - * - * @see HsaAmdFreeSystemMemory, HsaAmdSystemMemoryType - **************************************************************************/ -COREAPI HsaStatus HsaAmdAllocateSystemMemory(size_t size, - size_t alignment, - HsaAmdSystemMemoryType type, - void **address); - -/** - **************************************************************************** - * @brief Deallocate system memory. - * @details The HsaAmdFreeSystemMemory() interface is used for - * deallocating global system memory that was previously allocated with - * HsaAmdAllocateSystemMemory(). - * - * @param address A pointer to the address to be deallocated. - * - * @return HsaStatus - * @retval kHsaStatusSuccess The requested memory was successfully deallocated. - * @retval kHsaStatusInvalidArguement An address of NULL was specified. - * - * @see HsaAmdAllocateSystemMemory - *************************************************************************** - */ -COREAPI HsaStatus HsaAmdFreeSystemMemory(void *address); - -#ifdef __cplusplus -} -#endif // __cplusplus -#endif // header guard