diff --git a/projects/clr/rocclr/runtime/device/rocm/mesa_glinterop.h b/projects/clr/rocclr/runtime/device/rocm/mesa_glinterop.h deleted file mode 100644 index 0b9cb4e9f7..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/mesa_glinterop.h +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Mesa 3-D graphics library - * - * Copyright 2016 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/* Mesa OpenGL inter-driver interoperability interface designed for but not - * limited to OpenCL. - * - * This is a driver-agnostic, backward-compatible interface. The structures - * are only allowed to grow. They can never shrink and their members can - * never be removed, renamed, or redefined. - * - * The interface doesn't return a lot of static texture parameters like - * width, height, etc. 
It mainly returns mutable buffer and texture view - * parameters that can't be part of the texture allocation (because they are - * mutable). If drivers want to return more data or want to return static - * allocation parameters, they can do it in one of these two ways: - * - attaching the data to the DMABUF handle in a driver-specific way - * - passing the data via "out_driver_data" in the "in" structure. - * - * Mesa is expected to do a lot of error checking on behalf of OpenCL, such - * as checking the target, miplevel, and texture completeness. - * - * OpenCL, on the other hand, needs to check if the display+context combo - * is compatible with the OpenCL driver by querying the device information. - * It also needs to check if the texture internal format and channel ordering - * (returned in a driver-specific way) is supported by OpenCL, among other - * things. - */ - -#ifndef MESA_GLINTEROP_H -#define MESA_GLINTEROP_H - -#include - -#if !defined(MESA_GLINTEROP_NO_GLX) -#include -#include -#else -#include -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -#define MESA_GLINTEROP_VERSION 1 - -/** Returned error codes. */ -enum { - MESA_GLINTEROP_SUCCESS = 0, - MESA_GLINTEROP_OUT_OF_RESOURCES, - MESA_GLINTEROP_OUT_OF_HOST_MEMORY, - MESA_GLINTEROP_INVALID_OPERATION, - MESA_GLINTEROP_INVALID_VALUE, - MESA_GLINTEROP_INVALID_DISPLAY, - MESA_GLINTEROP_INVALID_CONTEXT, - MESA_GLINTEROP_INVALID_TARGET, - MESA_GLINTEROP_INVALID_OBJECT, - MESA_GLINTEROP_INVALID_MIP_LEVEL, - MESA_GLINTEROP_UNSUPPORTED -}; - -/** Access flags. */ -enum { - MESA_GLINTEROP_ACCESS_READ_WRITE = 0, - MESA_GLINTEROP_ACCESS_READ_ONLY, - MESA_GLINTEROP_ACCESS_WRITE_ONLY -}; - - -/** - * Device information returned by Mesa. 
- */ -typedef struct _mesa_glinterop_device_info { - uint32_t size; /* size of this structure */ - - /* PCI location */ - uint32_t pci_segment_group; - uint32_t pci_bus; - uint32_t pci_device; - uint32_t pci_function; - - /* Device identification */ - uint32_t vendor_id; - uint32_t device_id; -} mesa_glinterop_device_info; - - -/** - * Input parameters to Mesa interop export functions. - */ -typedef struct _mesa_glinterop_export_in { - uint32_t size; /* size of this structure */ - - /* One of the following: - * - GL_TEXTURE_BUFFER - * - GL_TEXTURE_1D - * - GL_TEXTURE_2D - * - GL_TEXTURE_3D - * - GL_TEXTURE_RECTANGLE - * - GL_TEXTURE_1D_ARRAY - * - GL_TEXTURE_2D_ARRAY - * - GL_TEXTURE_CUBE_MAP_ARRAY - * - GL_TEXTURE_CUBE_MAP - * - GL_TEXTURE_CUBE_MAP_POSITIVE_X - * - GL_TEXTURE_CUBE_MAP_NEGATIVE_X - * - GL_TEXTURE_CUBE_MAP_POSITIVE_Y - * - GL_TEXTURE_CUBE_MAP_NEGATIVE_Y - * - GL_TEXTURE_CUBE_MAP_POSITIVE_Z - * - GL_TEXTURE_CUBE_MAP_NEGATIVE_Z - * - GL_TEXTURE_2D_MULTISAMPLE - * - GL_TEXTURE_2D_MULTISAMPLE_ARRAY - * - GL_TEXTURE_EXTERNAL_OES - * - GL_RENDERBUFFER - * - GL_ARRAY_BUFFER - */ - GLenum target; - - /* If target is GL_ARRAY_BUFFER, it's a buffer object. - * If target is GL_RENDERBUFFER, it's a renderbuffer object. - * If target is GL_TEXTURE_*, it's a texture object. - */ - GLuint obj; - - /* Mipmap level. Ignored for non-texture objects. */ - GLuint miplevel; - - /* One of MESA_GLINTEROP_ACCESS_* flags. This describes how the exported - * object is going to be used. - */ - uint32_t access; - - /* Size of memory pointed to by out_driver_data. */ - uint32_t out_driver_data_size; - - /* If the caller wants to query driver-specific data about the OpenGL - * object, this should point to the memory where that data will be stored. - */ - void *out_driver_data; -} mesa_glinterop_export_in; - - -/** - * Outputs of Mesa interop export functions. - */ -typedef struct _mesa_glinterop_export_out { - uint32_t size; /* size of this structure */ - - /* The DMABUF handle. 
It must be closed by the caller using the POSIX - * close() function when it's not needed anymore. Mesa is not responsible - * for closing the handle. - * - * Not closing the handle by the caller will lead to a resource leak, - * prevents releasing the GPU buffer, and may prevent creating new DMABUF - * handles until the process termination. - */ - int dmabuf_fd; - - /* The mutable OpenGL internal format specified by glTextureView or - * glTexBuffer. If the object is not one of those, the original internal - * format specified by glTexStorage, glTexImage, or glRenderbufferStorage - * will be returned. - */ - GLenum internalformat; - - /* Parameters specified by glTexBufferRange for GL_TEXTURE_BUFFER. */ - GLintptr buf_offset; - GLsizeiptr buf_size; - - /* Parameters specified by glTextureView. If the object is not a texture - * view, default parameters covering the whole texture will be returned. - */ - GLuint view_minlevel; - GLuint view_numlevels; - GLuint view_minlayer; - GLuint view_numlayers; -} mesa_glinterop_export_out; - -#if !defined(MESA_GLINTEROP_NO_GLX) -/** - * Query device information. - * - * \param dpy GLX display - * \param context GLX context - * \param out where to return the information - * - * \return MESA_GLINTEROP_SUCCESS or MESA_GLINTEROP_* != 0 on error - */ -GLAPI int GLAPIENTRY -MesaGLInteropGLXQueryDeviceInfo(Display *dpy, GLXContext context, - mesa_glinterop_device_info *out); -#endif - -/** - * Same as MesaGLInteropGLXQueryDeviceInfo except that it accepts EGLDisplay - * and EGLContext. - */ -GLAPI int GLAPIENTRY -MesaGLInteropEGLQueryDeviceInfo(EGLDisplay dpy, EGLContext context, - mesa_glinterop_device_info *out); - - -#if !defined(MESA_GLINTEROP_NO_GLX) -/** - * Create and return a DMABUF handle corresponding to the given OpenGL - * object, and return other parameters about the OpenGL object. 
- * - * \param dpy GLX display - * \param context GLX context - * \param in input parameters - * \param out return values - * - * \return MESA_GLINTEROP_SUCCESS or MESA_GLINTEROP_* != 0 on error - */ -GLAPI int GLAPIENTRY -MesaGLInteropGLXExportObject(Display *dpy, GLXContext context, - mesa_glinterop_export_in *in, - mesa_glinterop_export_out *out); -#endif - -/** - * Same as MesaGLInteropGLXExportObject except that it accepts - * EGLDisplay and EGLContext. - */ -GLAPI int GLAPIENTRY -MesaGLInteropEGLExportObject(EGLDisplay dpy, EGLContext context, - mesa_glinterop_export_in *in, - mesa_glinterop_export_out *out); - - -#if !defined(MESA_GLINTEROP_NO_GLX) -typedef int (APIENTRYP PFNMESAGLINTEROPGLXQUERYDEVICEINFOPROC)(Display *dpy, GLXContext context, - mesa_glinterop_device_info *out); -#endif -typedef int (APIENTRYP PFNMESAGLINTEROPEGLQUERYDEVICEINFOPROC)(EGLDisplay dpy, EGLContext context, - mesa_glinterop_device_info *out); -#if !defined(MESA_GLINTEROP_NO_GLX) -typedef int (APIENTRYP PFNMESAGLINTEROPGLXEXPORTOBJECTPROC)(Display *dpy, GLXContext context, - mesa_glinterop_export_in *in, - mesa_glinterop_export_out *out); -#endif -typedef int (APIENTRYP PFNMESAGLINTEROPEGLEXPORTOBJECTPROC)(EGLDisplay dpy, EGLContext context, - mesa_glinterop_export_in *in, - mesa_glinterop_export_out *out); - -#ifdef __cplusplus -} -#endif - -#endif /* MESA_GLINTEROP_H */ diff --git a/projects/clr/rocclr/runtime/device/rocm/rocappprofile.cpp b/projects/clr/rocclr/runtime/device/rocm/rocappprofile.cpp deleted file mode 100644 index 0e0ac15113..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/rocappprofile.cpp +++ /dev/null @@ -1,61 +0,0 @@ -// -// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved. 
-// - - -#ifndef WITHOUT_HSA_BACKEND - -#include "top.hpp" -#include "device/device.hpp" -#include "device/appprofile.hpp" -#include "device/rocm/rocappprofile.hpp" - -#include - -amd::AppProfile* rocCreateAppProfile() -{ - amd::AppProfile* appProfile = new roc::AppProfile; - - if ((appProfile == NULL) || !appProfile->init()) { - return NULL; - } - - return appProfile; -} - -namespace roc { - -bool AppProfile::ParseApplicationProfile() -{ - std::string appName("Explorer"); - - std::transform(appName.begin(), appName.end(), appName.begin(), ::tolower); - std::transform(appFileName_.begin(), appFileName_.end(), appFileName_.begin(), ::tolower); - - if (appFileName_.compare(appName) == 0 ) { - hsaDeviceHint_ = CL_HSA_DISABLED_AMD; - gpuvmHighAddr_ = false; - noHsaInit_ = true; - profileOverridesAllSettings_ = true; - } - - // Setting both bits is invalid, make it niether. - if (hsaDeviceHint_ & CL_HSA_ENABLED_AMD - && hsaDeviceHint_ & CL_HSA_DISABLED_AMD) { - hsaDeviceHint_ = 0; - } - - if (noHsaInit_) { - // If no HSA initialization, then force hint flag to non-HSA device. - // Even if this is not forced, the device selection logic will endure it. - // After all hint flags are treated as hint only - depending on - // availibility. - hsaDeviceHint_ = CL_HSA_DISABLED_AMD; - } - - return true; -} - -} - -#endif diff --git a/projects/clr/rocclr/runtime/device/rocm/rocappprofile.hpp b/projects/clr/rocclr/runtime/device/rocm/rocappprofile.hpp deleted file mode 100644 index 00221dda2c..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/rocappprofile.hpp +++ /dev/null @@ -1,23 +0,0 @@ -// -// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved. -// -#pragma once - -#ifndef WITHOUT_HSA_BACKEND - -namespace roc { - -class AppProfile : public amd::AppProfile -{ -public: - AppProfile(): amd::AppProfile() {} - -protected: - //! 
parse application profile based on application file name - virtual bool ParseApplicationProfile(); -}; - -} - -#endif - diff --git a/projects/clr/rocclr/runtime/device/rocm/rocbinary.hpp b/projects/clr/rocclr/runtime/device/rocm/rocbinary.hpp deleted file mode 100644 index cdb910f622..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/rocbinary.hpp +++ /dev/null @@ -1,51 +0,0 @@ -// -// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved. -// -#pragma once - -#include "top.hpp" -#include "rocdevice.hpp" - -#ifndef WITHOUT_HSA_BACKEND - -namespace roc { - -typedef std::map NameKernelMap; - -class ClBinary : public device::ClBinary -{ -public: - ClBinary(const Device& dev, BinaryImageFormat bifVer = BIF_VERSION3) - : device::ClBinary(dev, bifVer) - {} - - //! Destructor - ~ClBinary() {} - - -protected: - bool setElfTarget() { - uint32_t target = static_cast(21);//dev().calTarget()); - assert (((0xFFFF8000 & target) == 0) && "ASIC target ID >= 2^15"); - uint16_t elf_target = (uint16_t)(0x7FFF & target); - return elfOut()->setTarget(elf_target, amd::OclElf::CAL_PLATFORM); - return true; - } - -private: - //! Disable default copy constructor - ClBinary(const ClBinary&); - - //! Disable default operator= - ClBinary& operator=(const ClBinary&); - - //! Returns the HSA device for this object - const Device& dev() const { return static_cast(dev_); } - -}; - -} // namespace roc - -#endif // WITHOUT_HSA_BACKEND - - diff --git a/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp b/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp deleted file mode 100644 index c4824d8bf7..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp +++ /dev/null @@ -1,1532 +0,0 @@ -// -// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved. 
-// - -#include "platform/commandqueue.hpp" -#include "device/rocm/rocdevice.hpp" -#include "device/rocm/rocblit.hpp" -#include "device/rocm/rocmemory.hpp" -#include "device/rocm/rocvirtual.hpp" -#include "utils/debug.hpp" - -namespace roc { - - -void -FindPinSize( - size_t& pinSize, const amd::Coord3D& size, - size_t& rowPitch, size_t& slicePitch, const Image& image) -{ - size_t elementSize = image.owner()->asImage()->getImageFormat().getElementSize(); - pinSize = size[0] * elementSize; - if ((rowPitch == 0) || (rowPitch == pinSize)) { - rowPitch = 0; - } - else { - pinSize = rowPitch; - } - - // Calculate the pin size, which should be equal to the copy size - for (uint i = 1; i < 3; ++i) { - pinSize *= size[i]; - if (i == 1) { - if ((slicePitch == 0) || (slicePitch == pinSize)) { - slicePitch = 0; - } - else { - if (image.getHsaImageDescriptor().geometry != HSA_EXT_IMAGE_GEOMETRY_1DA) { - pinSize = slicePitch; - } - else { - pinSize = slicePitch * size[i]; - } - } - } - } -} - -HsaBlitManager::HsaBlitManager(device::VirtualDevice& vDev, Setup setup) - : HostBlitManager(vDev, setup), - roc_device_(reinterpret_cast(dev_)) { - completion_signal_.handle = 0; -} - -bool HsaBlitManager::hsaCopy(const void *hostSrc, void *hostDst, - uint32_t size, bool hostToDev) const { - - // No allocation is necessary for Full Profile - hsa_status_t status; - if (roc_device_.agent_profile() == HSA_PROFILE_FULL) { - status = hsa_memory_copy(hostDst, hostSrc, size); - if (status != HSA_STATUS_SUCCESS) { - LogPrintfError("Hsa copy of data failed with code %d", status); - } - return (status == HSA_STATUS_SUCCESS); - } - - // Allocate requested size of memory - size_t align = 0x04; - bool atomics = false; - void *hsaBuffer = NULL; - hsaBuffer = roc_device_.hostAlloc(size, align, false); - if (hsaBuffer == NULL) { - LogError("Hsa buffer allocation failed with code"); - return false; - } - - const hsa_signal_value_t kInitVal = 1; - hsa_signal_store_relaxed(completion_signal_, kInitVal); - - 
// Copy data from Host to Device - if (hostToDev) { - memcpy(hsaBuffer, hostSrc, size); - status = hsa_amd_memory_async_copy( - hostDst, roc_device_.getBackendDevice(), hsaBuffer, - roc_device_.getCpuAgent(), size, 0, NULL, completion_signal_); - if (status == HSA_STATUS_SUCCESS) { - hsa_signal_value_t val = - hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, - uint64_t(-1), HSA_WAIT_STATE_ACTIVE); - - if (val != (kInitVal - 1)) { - LogError("Async copy failed"); - status = HSA_STATUS_ERROR; - } - } - else { - LogPrintfError("Hsa copy from host to device failed with code %d", status); - } - - roc_device_.hostFree(hsaBuffer, size); - return (status == HSA_STATUS_SUCCESS); - } - - // Copy data from Device to Host - status = hsa_amd_memory_async_copy(hsaBuffer, roc_device_.getCpuAgent(), - hostSrc, roc_device_.getBackendDevice(), - size, 0, NULL, completion_signal_); - if (status == HSA_STATUS_SUCCESS) { - hsa_signal_value_t val = hsa_signal_wait_acquire( - completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1), - HSA_WAIT_STATE_ACTIVE); - - if (val != (kInitVal - 1)) { - LogError("Async copy failed"); - status = HSA_STATUS_ERROR; - } - - if (status == HSA_STATUS_SUCCESS) { - memcpy(hostDst, hsaBuffer, size); - } - } else { - LogPrintfError("Hsa copy from device to host failed with code %d", status); - } - - roc_device_.hostFree(hsaBuffer, size); - return (status == HSA_STATUS_SUCCESS); -} - -bool HsaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, bool entire) const { - hsa_memory_register(dstHost, size[0]); - void* src = static_cast(srcMemory).getDeviceMemory(); - - // Copy data from device to host - const void *srcDev = reinterpret_cast(src) + origin[0]; - bool retval = hsaCopy(srcDev, dstHost, size[0], false); - - hsa_memory_deregister(dstHost, size[0]); - return retval; -} - -bool HsaBlitManager::readBufferRect(device::Memory& srcMemory, void* dst, - const 
amd::BufferRect& bufRect, - const amd::BufferRect& hostRect, - const amd::Coord3D& size, - bool entire) const { - void* src = static_cast(srcMemory).getDeviceMemory(); - - size_t srcOffset; - size_t dstOffset; - - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - srcOffset = bufRect.offset(0, y, z); - dstOffset = hostRect.offset(0, y, z); - - // Copy data from device to host - line by line - void *dstHost = reinterpret_cast
(dst) + dstOffset; - const void *srcDev = reinterpret_cast(src) + srcOffset; - bool retval = hsaCopy(srcDev, dstHost, size[0], false); - if (!retval) { - return retval; - } - } - } - - return true; -} - -static bool hsaCopyImageToBuffer(hsa_agent_t agent, - hsa_ext_image_t srcImage, - void* dstBuffer, const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, bool entire, - size_t rowPitch, size_t slicePitch) { - hsa_ext_image_region_t image_region; - image_region.offset.x = srcOrigin[0]; - image_region.offset.y = srcOrigin[1]; - image_region.offset.z = srcOrigin[2]; - image_region.range.x = size[0]; - image_region.range.y = size[1]; - image_region.range.z = size[2]; - - char *dstHost = ((char*)dstBuffer) + dstOrigin[0]; - - hsa_status_t status = hsa_ext_image_export(agent, srcImage, dstHost, rowPitch, - slicePitch, &image_region); - return (status == HSA_STATUS_SUCCESS); -} - -bool HsaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, size_t rowPitch, - size_t slicePitch, bool entire) const { - roc::Image* srcImage = (roc::Image*)&srcMemory; - - void* svmDstHost = NULL; - size_t pinSize = 0; - FindPinSize(pinSize, size, rowPitch, slicePitch, *srcImage); - - hsa_agent_t agent = gpu().gpu_device(); - - hsa_status_t status = hsa_amd_memory_lock(dstHost, pinSize, - &agent, 1, &svmDstHost); - - if (status != HSA_STATUS_SUCCESS) { - return false; - } - - bool retval = hsaCopyImageToBuffer(agent, srcImage->getHsaImageObject(), - svmDstHost, origin, amd::Coord3D(0), size, entire, - rowPitch, slicePitch); - hsa_amd_memory_unlock(dstHost); - return retval; -} - -bool HsaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, bool entire) const { - hsa_memory_register(const_cast(srcHost), size[0]); - void* dst = static_cast(dstMemory).getDeviceMemory(); - - // Copy data from host to device - void 
*dstDev = reinterpret_cast
(dst) + origin[0]; - bool retval = hsaCopy(srcHost, dstDev, size[0], true); - - hsa_memory_deregister(const_cast(srcHost), size[0]); - return retval; -} - -bool HsaBlitManager::writeBufferRect(const void* src, - device::Memory& dstMemory, - const amd::BufferRect& hostRect, - const amd::BufferRect& bufRect, - const amd::Coord3D& size, - bool entire) const { - void* dst = static_cast(dstMemory).getDeviceMemory(); - - size_t srcOffset; - size_t dstOffset; - - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - srcOffset = hostRect.offset(0, y, z); - dstOffset = bufRect.offset(0, y, z); - - // Copy data from host to device - line by line - void *dstDev = reinterpret_cast
(dst) + dstOffset; - const void *srcHost = reinterpret_cast(src) + srcOffset; - bool retval = hsaCopy(srcHost, dstDev, size[0], true); - if (!retval) { - return retval; - } - } - } - - return true; -} - -bool hsaCopyBufferToImage(hsa_agent_t agent, const void* srcBuffer, - hsa_ext_image_t dstImage, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, bool entire, - size_t rowPitch, size_t slicePitch) { - char* srcHost = ((char*)srcBuffer) + srcOrigin[0]; - - hsa_ext_image_region_t image_region; - image_region.offset.x = dstOrigin[0]; - image_region.offset.y = dstOrigin[1]; - image_region.offset.z = dstOrigin[2]; - image_region.range.x = size[0]; - image_region.range.y = size[1]; - image_region.range.z = size[2]; - - hsa_status_t status = hsa_ext_image_import( - agent, srcHost, rowPitch, slicePitch, dstImage, &image_region); - return (status == HSA_STATUS_SUCCESS); -} - -bool HsaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, size_t rowPitch, - size_t slicePitch, bool entire) const { - roc::Image* image = (roc::Image*)&dstMemory; - - void* svmSrcHost = NULL; - size_t pinSize = 0; - FindPinSize(pinSize, size, rowPitch, slicePitch, *image); - - hsa_agent_t agent = gpu().gpu_device(); - - hsa_status_t status = hsa_amd_memory_lock(const_cast(srcHost), pinSize, - &agent, 1, &svmSrcHost); - - if (status != HSA_STATUS_SUCCESS) { - return false; - } - - bool retval = hsaCopyBufferToImage(agent, svmSrcHost, - image->getHsaImageObject(), amd::Coord3D(0), - origin, size, entire, rowPitch, slicePitch); - - hsa_amd_memory_unlock(const_cast(srcHost)); - - return retval; -} - -bool HsaBlitManager::copyBuffer(device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, bool entire) const { - void* src = static_cast(srcMemory).getDeviceMemory(); - void* dst = 
static_cast(dstMemory).getDeviceMemory(); - - if (srcMemory.isHostMemDirectAccess() && dstMemory.isHostMemDirectAccess()) { - if (srcMemory.owner()->getMemFlags() & CL_MEM_USE_HOST_PTR) { - src = srcMemory.owner()->getHostMem(); - } - - if (dstMemory.owner()->getMemFlags() & CL_MEM_USE_HOST_PTR) { - dst = dstMemory.owner()->getHostMem(); - } - } - - const hsa_agent_t src_agent = (srcMemory.isHostMemDirectAccess()) - ? roc_device_.getCpuAgent() - : roc_device_.getBackendDevice(); - - const hsa_agent_t dst_agent = (dstMemory.isHostMemDirectAccess()) - ? roc_device_.getCpuAgent() - : roc_device_.getBackendDevice(); - - // Straight forward buffer copy - const hsa_signal_value_t kInitVal = 1; - hsa_signal_store_relaxed(completion_signal_, kInitVal); - hsa_status_t status = hsa_amd_memory_async_copy( - (reinterpret_cast
(dst) + dstOrigin[0]), dst_agent, - (reinterpret_cast(src) + srcOrigin[0]), src_agent, size[0], - 0, NULL, completion_signal_); - if (status != HSA_STATUS_SUCCESS) { - LogPrintfError("DMA buffer failed with code %d", status); - return false; - } - - hsa_signal_value_t val = - hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, - uint64_t(-1), HSA_WAIT_STATE_ACTIVE); - - if (val != (kInitVal - 1)) { - LogError("Async copy failed"); - return false; - } - - return true; -} - -bool HsaBlitManager::copyBufferRect(device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::BufferRect& srcRect, - const amd::BufferRect& dstRect, - const amd::Coord3D& size, - bool entire) const { - void* src = static_cast(srcMemory).getDeviceMemory(); - void* dst = static_cast(dstMemory).getDeviceMemory(); - - const hsa_signal_value_t kInitVal = size[2] * size[1]; - hsa_signal_store_relaxed(completion_signal_, kInitVal); - - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - size_t srcOffset = srcRect.offset(0, y, z); - size_t dstOffset = dstRect.offset(0, y, z); - - // Copy memory line by line - hsa_status_t status = hsa_amd_memory_async_copy( - (reinterpret_cast
(dst) + dstOffset), - roc_device_.getBackendDevice(), - (reinterpret_cast(src) + srcOffset), - roc_device_.getBackendDevice(), size[0], 0, NULL, - completion_signal_); - if (status != HSA_STATUS_SUCCESS) { - LogPrintfError("DMA buffer failed with code %d", status); - return false; - } - } - } - - hsa_signal_value_t val = - hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, - 0, uint64_t(-1), HSA_WAIT_STATE_ACTIVE); - - if (val != 0) { - LogError("Async copy failed"); - return false; - } - - return true; -} - -bool HsaBlitManager::copyImageToBuffer(device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, bool entire, - size_t rowPitch, - size_t slicePitch) const { - roc::Image& srcImage = (roc::Image&)srcMemory; - roc::Buffer& dstBuffer = (roc::Buffer&)dstMemory; - - return hsaCopyImageToBuffer(gpu().gpu_device(), srcImage.getHsaImageObject(), - dstBuffer.getDeviceMemory(), srcOrigin, dstOrigin, - size, entire, rowPitch, slicePitch); -} - -bool HsaBlitManager::copyBufferToImage(device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, bool entire, - size_t rowPitch, - size_t slicePitch) const { - roc::Buffer& srcBuffer = (roc::Buffer&)srcMemory; - roc::Image& dstImage = (roc::Image&)dstMemory; - - return hsaCopyBufferToImage(gpu().gpu_device(), srcBuffer.getDeviceMemory(), - dstImage.getHsaImageObject(), srcOrigin, - dstOrigin, size, entire, rowPitch, slicePitch); -} - -bool HsaBlitManager::copyImage(device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, bool entire) const { - if (srcMemory.isHostMemDirectAccess() && - dstMemory.isHostMemDirectAccess()) { - return device::HostBlitManager::copyImage( - srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); - } - - 
roc::Image *srcImage = (roc::Image *)&srcMemory; - roc::Image *dstImage = (roc::Image *)&dstMemory; - - hsa_dim3_t src_offset = { 0 }; - src_offset.x = srcOrigin[0]; - src_offset.y = srcOrigin[1]; - src_offset.z = srcOrigin[2]; - - hsa_dim3_t dst_offset = { 0 }; - dst_offset.x = dstOrigin[0]; - dst_offset.y = dstOrigin[1]; - dst_offset.z = dstOrigin[2]; - - hsa_dim3_t copy_size = { 0 }; - copy_size.x = size[0]; - copy_size.y = size[1]; - copy_size.z = size[2]; - - hsa_status_t status = hsa_ext_image_copy( - gpu().gpu_device(), srcImage->getHsaImageObject(), &src_offset, - dstImage->getHsaImageObject(), &dst_offset, ©_size); - return (status == HSA_STATUS_SUCCESS); -} - -bool HsaBlitManager::fillBuffer(device::Memory& memory, const void* pattern, - size_t patternSize, const amd::Coord3D& origin, - const amd::Coord3D& size, bool entire) const { - void* fillMem = static_cast(memory).getDeviceMemory(); - - size_t offset = origin[0]; - size_t fillSize = size[0]; - - if ((fillSize % patternSize) != 0) { - LogError("Misaligned buffer size and pattern size!"); - } - - // Fill the buffer memory with a pattern - for (size_t i = 0; i < (fillSize / patternSize); i++) { - void *dstDev = reinterpret_cast
(fillMem) + offset; - bool retval = hsaCopy(pattern, dstDev, patternSize, true); - if (!retval) { - LogError("DMA buffer failed with code"); - return retval; - } - - offset += patternSize; - } - - return true; -} - -bool HsaBlitManager::fillImage(device::Memory& memory, const void* pattern, - const amd::Coord3D& origin, - const amd::Coord3D& size, bool entire) const { - if (memory.isHostMemDirectAccess()) { - return device::HostBlitManager::fillImage(memory, pattern, origin, size, entire); - } - - roc::Image *image = (roc::Image*)&memory; - hsa_ext_image_region_t image_region; - image_region.offset.x = origin[0]; - image_region.offset.y = origin[1]; - image_region.offset.z = origin[2]; - image_region.range.x = size[0]; - image_region.range.y = size[1]; - image_region.range.z = size[2]; - - hsa_status_t status = hsa_ext_image_clear( - gpu().gpu_device(), image->getHsaImageObject(), - pattern, &image_region); - return (status == HSA_STATUS_SUCCESS); -} - -static void -CalcRowSlicePitches( - cl_ulong* pitch, const cl_int* copySize, - size_t rowPitch, size_t slicePitch, const Memory& mem) -{ - const roc::Image &hsaImage = static_cast< const roc::Image &>(mem); - bool img1Darray = - (mem.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? true : false; - size_t memFmtSize = mem.owner()->asImage()->getImageFormat().getElementSize(); - - if (rowPitch == 0) { - pitch[0] = copySize[0]; - } - else { - pitch[0] = rowPitch / memFmtSize; - } - if (slicePitch == 0) { - pitch[1] = pitch[0] * (img1Darray ? 
1 : copySize[1]); - } - else { - pitch[1] = slicePitch / memFmtSize; - } - assert((pitch[0] <= pitch[1]) && "rowPitch must be <= slicePitch"); - - if (img1Darray) { - // For 1D array rowRitch = slicePitch - pitch[0] = pitch[1]; - } -} - -KernelBlitManager::KernelBlitManager(device::VirtualDevice& vDev, Setup setup) - : HsaBlitManager(vDev, setup), - context_(NULL), - program_(NULL) -{ - for (uint i = 0; i < BlitTotal; ++i) { - kernels_[i] = NULL; - } -} - -KernelBlitManager::~KernelBlitManager() -{ - for (uint i = 0; i < BlitTotal; ++i) { - if (NULL != kernels_[i]) { - kernels_[i]->release(); - } - } - - if (NULL != program_) { - program_->release(); - } - - if (NULL != context_) { - // Release a dummy context - context_->release(); - } -} - -bool -KernelBlitManager::readBuffer( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - //if (setup_.disableReadBuffer_ || srcMemory.isHostMemDirectAccess()) { - // return device::HostBlitManager::readBuffer(srcMemory, dstHost, origin, - // size, entire); - //} - // Exercise HSA path for now. - return HsaBlitManager::readBuffer(srcMemory, dstHost, origin, - size, entire); - - amd::Buffer *dstMemory = new (*context_) amd::Buffer( - *context_, CL_MEM_USE_HOST_PTR, size[0]); - - if (!dstMemory->create(const_cast(dstHost))) { - LogError("[OCL] Fail to create mem object for destination"); - return false; - } - - device::Memory *devDstMemory = dstMemory->getDeviceMemory(dev_); - if (devDstMemory== NULL) { - LogError("[OCL] Fail to create device mem object for destination"); - return false; - } - - bool result = copyBuffer( - srcMemory, *devDstMemory, origin, amd::Coord3D(0), size, entire); - - // Wait for the transfer to finish so that we could safely release the - // destination memory object. - // TODO: we could remove this if issue on implicit memory registration is - // fixed by KFD, so that we could pass the pattern as SVM. 
- gpu().releaseGpuMemoryFence(); - - dstMemory->release(); - - return result; -} - -bool -KernelBlitManager::readBufferRect( - device::Memory& srcMemory, - void* dstHost, - const amd::BufferRect& bufRect, - const amd::BufferRect& hostRect, - const amd::Coord3D& size, - bool entire) const -{ - // if (setup_.disableReadBufferRect_ || srcMemory.isHostMemDirectAccess()) { - //return device::HostBlitManager::readBufferRect( - // srcMemory, dstHost, bufRect, hostRect, size, entire); - // } - - // Exercise HSA path for now. - return HsaBlitManager::readBufferRect( - srcMemory, dstHost, bufRect, hostRect, size, entire); - - size_t dstSize = hostRect.start_ + hostRect.end_; - amd::Buffer *dstMemory = - new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, dstSize); - - if (!dstMemory->create(const_cast(dstHost))) { - LogError("[OCL] Fail to create mem object for destination"); - return false; - } - - device::Memory *devDstMemory = dstMemory->getDeviceMemory(dev_); - if (devDstMemory== NULL) { - LogError("[OCL] Fail to create device mem object for destination"); - return false; - } - - bool result = copyBufferRect( - srcMemory, *devDstMemory, bufRect, hostRect, size, entire); - - // Wait for the transfer to finish so that we could safely release the - // destination memory object. - // TODO: we could remove this if issue on implicit memory registration is - // fixed by KFD, so that we could pass the pattern as SVM. 
- gpu().releaseGpuMemoryFence(); - - dstMemory->release(); - - return result; -} - -void -FindLinearSize( - size_t& linearSize, const amd::Coord3D& size, - size_t& rowPitch, size_t& slicePitch, const device::Memory& mem) -{ - const roc::Image &image = static_cast(mem); - size_t elementSize = mem.owner()->asImage()->getImageFormat().getElementSize(); - - linearSize = size[0] * elementSize; - if ((rowPitch == 0) || (rowPitch == linearSize)) { - rowPitch = 0; - } - else { - linearSize = rowPitch; - } - - // Calculate the pin size, which should be equal to the copy size - for (uint i = 1; i < mem.owner()->asImage()->getDims(); ++i) { - linearSize *= size[i]; - if (i == 1) { - if ((slicePitch == 0) || (slicePitch == linearSize)) { - slicePitch = 0; - } - else { - if (mem.owner()->getType() != CL_MEM_OBJECT_IMAGE1D_ARRAY) { - linearSize = slicePitch; - } - else { - linearSize = slicePitch * size[i]; - } - } - } - } -} - -// The following data structures will be used for the view creations. -// Some formats has to be converted before a kernel blit operation -struct FormatConvertion { - cl_uint clOldType_; - cl_uint clNewType_; -}; - -// The list of rejected data formats and corresponding conversion -static const FormatConvertion RejectedData[] = -{ - { CL_UNORM_INT8, CL_UNSIGNED_INT8 }, - { CL_UNORM_INT16, CL_UNSIGNED_INT16 }, - { CL_SNORM_INT8, CL_UNSIGNED_INT8 }, - { CL_SNORM_INT16, CL_UNSIGNED_INT16 }, - { CL_HALF_FLOAT, CL_UNSIGNED_INT16 }, - { CL_FLOAT, CL_UNSIGNED_INT32 }, - { CL_SIGNED_INT8, CL_UNSIGNED_INT8 }, - { CL_SIGNED_INT16, CL_UNSIGNED_INT16 }, - { CL_UNORM_INT_101010, CL_UNSIGNED_INT8 }, - { CL_SIGNED_INT32, CL_UNSIGNED_INT32 } -}; - -// The list of rejected channel's order and corresponding conversion -static const FormatConvertion RejectedOrder[] = -{ - { CL_A, CL_R }, - { CL_RA, CL_RG }, - { CL_LUMINANCE, CL_R }, - { CL_INTENSITY, CL_R }, - { CL_RGB, CL_RGBA }, - { CL_BGRA, CL_RGBA }, - { CL_ARGB, CL_RGBA }, - { CL_sRGB, CL_RGBA }, - { CL_sRGBx, CL_RGBA 
}, - { CL_sRGBA, CL_RGBA }, - { CL_sBGRA, CL_RGBA }, - { CL_DEPTH, CL_R} -}; - -const uint RejectedFormatDataTotal = - sizeof(RejectedData) / sizeof(FormatConvertion); -const uint RejectedFormatChannelTotal = - sizeof(RejectedOrder) / sizeof(FormatConvertion); - -amd::Image::Format -KernelBlitManager::filterFormat(amd::Image::Format oldFormat) const -{ - cl_image_format newFormat; - newFormat.image_channel_data_type = oldFormat.image_channel_data_type; - newFormat.image_channel_order = oldFormat.image_channel_order; - - // Find unsupported formats - for (uint i = 0; i < RejectedFormatDataTotal; ++i) { - if (RejectedData[i].clOldType_ == oldFormat.image_channel_data_type) { - newFormat.image_channel_data_type = RejectedData[i].clNewType_; - break; - } - } - - // Find unsupported channel's order - for (uint i = 0; i < RejectedFormatChannelTotal; ++i) { - if (RejectedOrder[i].clOldType_ == oldFormat.image_channel_order) { - newFormat.image_channel_order = RejectedOrder[i].clNewType_; - break; - } - } - - return amd::Image::Format(newFormat); -} - -device::Memory * -KernelBlitManager::createImageView( - device::Memory &parent, - amd::Image::Format newFormat) const -{ - amd::Image *image = - parent.owner()->asImage()->createView(parent.owner()->getContext(), newFormat, &gpu()); - - if (image == NULL) { - LogError("[OCL] Fail to allocate view of image object"); - return NULL; - } - - Image* devImage = new roc::Image(static_cast(dev_), *image); - if (devImage == NULL) { - LogError("[OCL] Fail to allocate device mem object for the view"); - image->release(); - return NULL; - } - - if (!devImage->createView(static_cast(parent))) { - LogError("[OCL] Fail to create device mem object for the view"); - delete devImage; - image->release(); - return NULL; - } - - image->replaceDeviceMemory(&dev_, devImage); - - return devImage; -} - -bool -KernelBlitManager::readImage( - device::Memory& srcMemory, - void* dstHost, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t 
rowPitch, - size_t slicePitch, - bool entire) const -{ - return HsaBlitManager::readImage( - srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire); -} - -bool -KernelBlitManager::writeBuffer( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire) const -{ - // if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess()) { - //return device::HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, - // entire); - // } - - // Exercise HSA path for now. - return HsaBlitManager::writeBuffer(srcHost, dstMemory, origin, size, - entire); - - amd::Buffer *srcMemory = - new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, size[0]); - - if (!srcMemory->create(const_cast(srcHost))) { - LogError("[OCL] Fail to create mem object for destination"); - return false; - } - - device::Memory *devSrcMemory = srcMemory->getDeviceMemory(dev_); - if (devSrcMemory== NULL) { - LogError("[OCL] Fail to create device mem object for destination"); - return false; - } - - bool result = - copyBuffer(*devSrcMemory, dstMemory, amd::Coord3D(0), origin, size, entire); - - // Wait for the transfer to finish so that we could safely release the - // source memory object. - // TODO: we could remove this if issue on implicit memory registration is - // fixed by KFD, so that we could pass the pattern as SVM. - gpu().releaseGpuMemoryFence(); - - srcMemory->release(); - - return result; -} - -bool -KernelBlitManager::writeBufferRect( - const void* srcHost, - device::Memory& dstMemory, - const amd::BufferRect& hostRect, - const amd::BufferRect& bufRect, - const amd::Coord3D& size, - bool entire) const -{ - // if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess()) { - //return device::HostBlitManager::writeBufferRect( - // srcHost, dstMemory, hostRect, bufRect, size, entire); - // } - - // Exercise HSA path for now. 
- return HsaBlitManager::writeBufferRect( - srcHost, dstMemory, hostRect, bufRect, size, entire); - - size_t srcSize = hostRect.start_ + hostRect.end_; - amd::Buffer *srcMemory = - new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, srcSize); - - if (!srcMemory->create(const_cast(srcHost))) { - LogError("[OCL] Fail to create mem object for destination"); - return false; - } - - device::Memory *devSrcMemory = srcMemory->getDeviceMemory(dev_); - if (devSrcMemory== NULL) { - LogError("[OCL] Fail to create device mem object for destination"); - return false; - } - - bool result = copyBufferRect( - *devSrcMemory, dstMemory, hostRect, bufRect, size, entire); - - // Wait for the transfer to finish so that we could safely release the - // destination memory object. - // TODO: we could remove this if issue on implicit memory registration is - // fixed by KFD, so that we could pass the pattern as SVM. - gpu().releaseGpuMemoryFence(); - - srcMemory->release(); - - return result; -} - -bool -KernelBlitManager::writeImage( - const void* srcHost, - device::Memory& dstMemory, - const amd::Coord3D& origin, - const amd::Coord3D& size, - size_t rowPitch, - size_t slicePitch, - bool entire) const -{ - return HsaBlitManager::writeImage( - srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); -} - -bool -KernelBlitManager::copyBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& sizeIn, - bool entire) const -{ - // if (setup_.disableCopyBuffer_ || - // (srcMemory.isHostMemDirectAccess() && - // dstMemory.isHostMemDirectAccess())) { - //return HsaBlitManager::copyBuffer( - // srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire); - // } - - // Exercise HSA path for now. 
- return HsaBlitManager::copyBuffer( - srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire); - - uint blitType = BlitCopyBuffer; - size_t dim = 1; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize = 0; - size_t localWorkSize = 0; - - const static uint CopyBuffAlignment[3] = { 16, 4, 1 }; - amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); - - bool aligned; - uint i; - for (i = 0; i < 3; ++i) { - // Check source alignments - aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0); - // Check destination alignments - aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0); - // Check copy size alignment in the first dimension - aligned &= ((sizeIn[0] % CopyBuffAlignment[i]) == 0); - - if (aligned) { - if (CopyBuffAlignment[i] != 1) { - blitType = BlitCopyBufferAligned; - } - break; - } - } - - cl_uint remain; - if (blitType == BlitCopyBufferAligned) { - size.c[0] /= CopyBuffAlignment[i]; - } - else { - remain = size[0] % 4; - size.c[0] /= 4; - size.c[0] += 1; - } - - // Program the dispatch dimensions - localWorkSize = 256; - globalWorkSize = amd::alignUp(size[0] , 256); - - // Program kernels arguments for the blit operation - cl_mem clmem = ((cl_mem) as_cl(srcMemory.owner())); - kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem); - clmem = ((cl_mem) as_cl(dstMemory.owner())); - kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem); - // Program source origin - cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i]; - kernels_[blitType]->parameters().set(2, sizeof(srcOffset), &srcOffset); - - // Program destination origin - cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i]; - kernels_[blitType]->parameters().set(3, sizeof(dstOffset), &dstOffset); - - cl_ulong copySize = size[0]; - kernels_[blitType]->parameters().set(4, sizeof(copySize), ©Size); - - if (blitType == BlitCopyBufferAligned) { - cl_int alignment = CopyBuffAlignment[i]; - kernels_[blitType]->parameters().set(5, sizeof(alignment), &alignment); - } 
- else { - kernels_[blitType]->parameters().set(5, sizeof(remain), &remain); - } - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange( - 1, globalWorkOffset, &globalWorkSize, &localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().capture(dev_); - bool result = gpu().submitKernelInternal( - ndrange, *kernels_[blitType], parameters, NULL); - kernels_[blitType]->parameters().release(const_cast
(parameters), dev_); - return result; -} - -bool -KernelBlitManager::copyBufferRect( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::BufferRect& srcRectIn, - const amd::BufferRect& dstRectIn, - const amd::Coord3D& sizeIn, - bool entire) const -{ - // if (setup_.disableCopyBuffer_ || - // (srcMemory.isHostMemDirectAccess() && dstMemory.isHostMemDirectAccess())) { - //return HsaBlitManager::copyBufferRect( - // srcMemory, dstMemory, srcRectIn, dstRectIn, sizeIn, entire); - // } - - // Exercise HSA path for now. - return HsaBlitManager::copyBufferRect( - srcMemory, dstMemory, srcRectIn, dstRectIn, sizeIn, entire); - - uint blitType = BlitCopyBufferRect; - size_t dim = 3; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - const static uint CopyRectAlignment[3] = { 16, 4, 1 }; - - bool aligned; - uint i; - for (i = 0; i < sizeof(CopyRectAlignment) / sizeof(uint); i++) { - // Check source alignments - aligned = ((srcRectIn.rowPitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((srcRectIn.slicePitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((srcRectIn.start_ % CopyRectAlignment[i]) == 0); - - // Check destination alignments - aligned &= ((dstRectIn.rowPitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((dstRectIn.slicePitch_ % CopyRectAlignment[i]) == 0); - aligned &= ((dstRectIn.start_ % CopyRectAlignment[i]) == 0); - - // Check copy size alignment in the first dimension - aligned &= ((sizeIn[0] % CopyRectAlignment[i]) == 0); - - if (aligned) { - if (CopyRectAlignment[i] != 1) { - blitType = BlitCopyBufferRectAligned; - } - break; - } - } - - amd::BufferRect srcRect; - amd::BufferRect dstRect; - amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]); - - srcRect.rowPitch_ = srcRectIn.rowPitch_ / CopyRectAlignment[i]; - srcRect.slicePitch_ = srcRectIn.slicePitch_ / CopyRectAlignment[i]; - srcRect.start_ = srcRectIn.start_ / CopyRectAlignment[i]; - srcRect.end_ = srcRectIn.end_ / 
CopyRectAlignment[i]; - - dstRect.rowPitch_ = dstRectIn.rowPitch_ / CopyRectAlignment[i]; - dstRect.slicePitch_ = dstRectIn.slicePitch_ / CopyRectAlignment[i]; - dstRect.start_ = dstRectIn.start_ / CopyRectAlignment[i]; - dstRect.end_ = dstRectIn.end_ / CopyRectAlignment[i]; - - size.c[0] /= CopyRectAlignment[i]; - - // Program the kernel's workload depending on the transfer dimensions - if ((size[1] == 1) && (size[2] == 1)) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = 1; - globalWorkSize[2] = 1; - localWorkSize[0] = 256; - localWorkSize[1] = 1; - localWorkSize[2] = 1; - } - else if (size[2] == 1) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = 1; - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } - else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - - // Program kernels arguments for the blit operation - cl_mem clmem = ((cl_mem) as_cl(srcMemory.owner())); - kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem); - clmem = ((cl_mem) as_cl(dstMemory.owner())); - kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem); - cl_ulong src[4] = {srcRect.rowPitch_, - srcRect.slicePitch_, - srcRect.start_, 0 }; - kernels_[blitType]->parameters().set(2, sizeof(src), src); - cl_ulong dst[4] = {dstRect.rowPitch_, - dstRect.slicePitch_, - dstRect.start_, 0 }; - kernels_[blitType]->parameters().set(3, sizeof(dst), dst); - cl_ulong copySize[4] = {size[0], - size[1], - size[2], - CopyRectAlignment[i] }; - kernels_[blitType]->parameters().set(4, sizeof(copySize), copySize); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, - globalWorkOffset, globalWorkSize, localWorkSize); - - // Execute the blit - address parameters = 
kernels_[blitType]->parameters().capture(dev_); - bool result = gpu().submitKernelInternal( - ndrange, *kernels_[blitType], parameters, NULL); - kernels_[blitType]->parameters().release(const_cast
(parameters), dev_); - return result; -} - -bool -KernelBlitManager::copyImageToBuffer( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire, - size_t rowPitch, - size_t slicePitch) const -{ - if (dstMemory.isHostMemDirectAccess()) { - return HsaBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin, - dstOrigin, size, entire, rowPitch, - slicePitch); - } - - amd::Image::Format oldFormat = srcMemory.owner()->asImage()->getImageFormat(); - amd::Image::Format newFormat = filterFormat(oldFormat); - bool useView = false; - - device::Memory* srcView = &srcMemory; - if (oldFormat != newFormat) { - srcView = createImageView(srcMemory, newFormat); - useView = true; - } - - roc::Image& srcImage = static_cast(*srcView); - - amd::Image* image = srcImage.owner()->asImage(); - uint blitType = 0; - blitType = BlitCopyImageToBuffer; - size_t dim = 0; - size_t globalWorkOffset[3] = {0, 0, 0}; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - // Program the kernels workload depending on the blit dimensions - const size_t imageDims = srcImage.owner()->asImage()->getDims(); - dim = 3; - // Find the current blit type - if (imageDims == 1) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; - } else if (imageDims == 2) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - // Program kernels arguments for the 
blit operation - cl_mem clmem = ((cl_mem)as_cl(srcImage.owner())); - kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem); - clmem = ((cl_mem)as_cl(dstMemory.owner())); - kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem); - - // Update extra paramters for USHORT and UBYTE pointers. - // Only then compiler can optimize the kernel to use - // UAV Raw for other writes - kernels_[blitType]->parameters().set(2, sizeof(cl_mem), &clmem); - kernels_[blitType]->parameters().set(3, sizeof(cl_mem), &clmem); - - cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1], - (cl_int)srcOrigin[2], 0}; - cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; - - kernels_[blitType]->parameters().set(4, sizeof(srcOrg), srcOrg); - - const size_t elementSize = - srcImage.owner()->asImage()->getImageFormat().getElementSize(); - const size_t numChannels = - srcImage.owner()->asImage()->getImageFormat().getNumChannels(); - - // 1 element granularity for writes by default - cl_int granularity = 1; - if (elementSize == 2) { - granularity = 2; - } else if (elementSize >= 4) { - granularity = 4; - } - CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); - cl_ulong dstOrg[4] = {dstOrigin[0] / granularity, dstOrigin[1], dstOrigin[2], - 0}; - kernels_[blitType]->parameters().set(5, sizeof(dstOrg), dstOrg); - kernels_[blitType]->parameters().set(6, sizeof(copySize), copySize); - - // Program memory format - uint multiplier = elementSize / sizeof(uint32_t); - multiplier = (multiplier == 0) ? 
1 : multiplier; - cl_uint format[4] = {(cl_uint)numChannels, - (cl_uint)(elementSize / numChannels), multiplier, 0}; - kernels_[blitType]->parameters().set(7, sizeof(format), format); - - // Program row and slice pitches - cl_ulong pitch[4] = {0}; - CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, srcImage); - kernels_[blitType]->parameters().set(8, sizeof(pitch), pitch); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, - localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().capture(dev_); - bool result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], - parameters, NULL); - kernels_[blitType]->parameters().release(const_cast
(parameters), - dev_); - - if (useView) { - srcView->owner()->release(); - } - - return result; -} - -bool KernelBlitManager::copyBufferToImage(device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, bool entire, - size_t rowPitch, - size_t slicePitch) const { - if (srcMemory.isHostMemDirectAccess()) { - return HsaBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin, - dstOrigin, size, entire, rowPitch, - slicePitch); - } - - amd::Image::Format oldFormat = dstMemory.owner()->asImage()->getImageFormat(); - amd::Image::Format newFormat = filterFormat(oldFormat); - bool useView = false; - - device::Memory* dstView = &dstMemory; - if (oldFormat != newFormat) { - dstView = createImageView(dstMemory, newFormat); - useView = true; - } - - roc::Image& dstImage = static_cast(*dstView); - - // Use a common blit type with three dimensions by default - uint blitType = BlitCopyBufferToImage; - size_t dim = 0; - size_t globalWorkOffset[3] = {0, 0, 0}; - size_t globalWorkSize[3]; - size_t localWorkSize[3]; - - // Program the kernels workload depending on the blit dimensions - const size_t imageDims = dstImage.owner()->asImage()->getDims(); - dim = 3; - if (imageDims == 1) { - globalWorkSize[0] = amd::alignUp(size[0], 256); - globalWorkSize[1] = amd::alignUp(size[1], 1); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = 256; - localWorkSize[1] = localWorkSize[2] = 1; - } else if (imageDims == 2) { - globalWorkSize[0] = amd::alignUp(size[0], 16); - globalWorkSize[1] = amd::alignUp(size[1], 16); - globalWorkSize[2] = amd::alignUp(size[2], 1); - localWorkSize[0] = localWorkSize[1] = 16; - localWorkSize[2] = 1; - } else { - globalWorkSize[0] = amd::alignUp(size[0], 8); - globalWorkSize[1] = amd::alignUp(size[1], 8); - globalWorkSize[2] = amd::alignUp(size[2], 4); - localWorkSize[0] = localWorkSize[1] = 8; - localWorkSize[2] = 4; - } - - // Program kernels 
arguments for the blit operation - cl_mem clmem = ((cl_mem)as_cl(srcMemory.owner())); - kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem); - clmem = ((cl_mem)as_cl(dstImage.owner())); - kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem); - - const size_t elementSize = - dstImage.owner()->asImage()->getImageFormat().getElementSize(); - const size_t numChannels = - dstImage.owner()->asImage()->getImageFormat().getNumChannels(); - - // 1 element granularity for writes by default - cl_int granularity = 1; - if (elementSize == 2) { - granularity = 2; - } else if (elementSize >= 4) { - granularity = 4; - } - CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!"); - cl_ulong srcOrg[4] = {srcOrigin[0] / granularity, srcOrigin[1], srcOrigin[2], - 0}; - kernels_[blitType]->parameters().set(2, sizeof(srcOrg), srcOrg); - - cl_int dstOrg[4] = {(cl_int)dstOrigin[0], (cl_int)dstOrigin[1], - (cl_int)dstOrigin[2], 0}; - cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0}; - - kernels_[blitType]->parameters().set(3, sizeof(dstOrg), dstOrg); - kernels_[blitType]->parameters().set(4, sizeof(copySize), copySize); - - // Program memory format - uint multiplier = elementSize / sizeof(uint32_t); - multiplier = (multiplier == 0) ? 
1 : multiplier; - cl_uint format[4] = {(cl_uint)numChannels, - (cl_uint)(elementSize / numChannels), multiplier, 0}; - kernels_[blitType]->parameters().set(5, sizeof(format), format); - - // Program row and slice pitches - cl_ulong pitch[4] = {0}; - CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, dstImage); - kernels_[blitType]->parameters().set(6, sizeof(pitch), pitch); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, - localWorkSize); - - // Execute the blit - address parameters = kernels_[blitType]->parameters().capture(dev_); - bool result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], - parameters, NULL); - kernels_[blitType]->parameters().release(const_cast
(parameters), - dev_); - - if (useView) { - dstView->owner()->release(); - } - - return result; -} - -bool -KernelBlitManager::copyImage( - device::Memory& srcMemory, - device::Memory& dstMemory, - const amd::Coord3D& srcOrigin, - const amd::Coord3D& dstOrigin, - const amd::Coord3D& size, - bool entire) const -{ - return HsaBlitManager::copyImage( - srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire); -} - -bool -KernelBlitManager::fillBuffer( - device::Memory& memory, - const void* pattern, - size_t patternSize, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire - ) const -{ - if (setup_.disableFillBuffer_ || memory.isHostMemDirectAccess()) { - return HostBlitManager::fillBuffer(memory, pattern, patternSize, origin, - size, entire); - } - - uint fillType = FillBuffer; - size_t globalWorkOffset[3] = { 0, 0, 0 }; - cl_ulong fillSize = size[0] / patternSize; - size_t globalWorkSize = amd::alignUp(fillSize, 256); - size_t localWorkSize = 256; - bool dwordAligned = - ((patternSize % sizeof(uint32_t)) == 0) ? 
true : false; - - // Program kernels arguments for the fill operation - if (dwordAligned) { - kernels_[fillType]->parameters().set(0, sizeof(cl_mem), NULL); - cl_mem clmem = ((cl_mem) as_cl(memory.owner())); - kernels_[fillType]->parameters().set(1, sizeof(cl_mem), &clmem); - } - else { - cl_mem clmem = ((cl_mem) as_cl(memory.owner())); - kernels_[fillType]->parameters().set(0, sizeof(cl_mem), &clmem); - kernels_[fillType]->parameters().set(1, sizeof(cl_mem), NULL); - } - - amd::Buffer *fillMemory = - new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, patternSize); - - if (!fillMemory->create(const_cast(pattern))) { - LogError("[OCL] Fail to create mem object for destination"); - return false; - } - - if (fillMemory->getDeviceMemory(dev_) == NULL) { - LogError("[OCL] Fail to create device mem object for destination"); - return false; - } - - cl_mem clmem = ((cl_mem) as_cl(fillMemory)); - kernels_[fillType]->parameters().set(2, sizeof(cl_mem), &clmem); - cl_ulong offset = origin[0]; - if (dwordAligned) { - patternSize /= sizeof(uint32_t); - offset /= sizeof(uint32_t); - } - kernels_[fillType]->parameters().set(3, sizeof(cl_uint), &patternSize); - kernels_[fillType]->parameters().set(4, sizeof(offset), &offset); - kernels_[fillType]->parameters().set(5, sizeof(fillSize), &fillSize); - - // Create ND range object for the kernel's execution - amd::NDRangeContainer ndrange(1, - globalWorkOffset, &globalWorkSize, &localWorkSize); - - // Execute the blit - address parameters = kernels_[fillType]->parameters().capture(dev_); - bool result = gpu().submitKernelInternal( - ndrange, *kernels_[fillType], parameters, NULL); - kernels_[fillType]->parameters().release(const_cast
(parameters), dev_); - - // Wait for the transfer to finish so that we could safely release the - // fill memory object. - // TODO: we could remove this if issue on implicit memory registration is - // fixed by KFD, so that we could pass the pattern as SVM. - gpu().releaseGpuMemoryFence(); - - fillMemory->release(); - - return result; -} - -bool -KernelBlitManager::fillImage( - device::Memory& memory, - const void* pattern, - const amd::Coord3D& origin, - const amd::Coord3D& size, - bool entire - ) const -{ - return HsaBlitManager::fillImage(memory, pattern, origin, size, entire); -} - -bool -KernelBlitManager::create(amd::Device& device) -{ - if (!HsaBlitManager::create(device)) { - return false; - } - if (!createProgram(static_cast(device))) { - return false; - } - - return true; -} - -bool -KernelBlitManager::createProgram(Device& device) -{ - // Save context and program for this device - context_ = device.blitProgram()->context_; - context_->retain(); - program_ = device.blitProgram()->program_; - program_->retain(); - - bool result = false; - do { - // Create kernel objects for all blits - for (uint i = 0; i < BlitTotal; ++i) { - const amd::Symbol* symbol = program_->findSymbol(BlitName[i]); - if (symbol == NULL) { - break; - } - kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]); - if (kernels_[i] == NULL) { - break; - } - } - - result = true; - } while(!result); - - return result; -} - -} // namespace roc diff --git a/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp b/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp deleted file mode 100644 index 16d1ef2363..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp +++ /dev/null @@ -1,412 +0,0 @@ -// -// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved. -// -#pragma once - -#include "top.hpp" -#include "platform/command.hpp" -#include "platform/commandqueue.hpp" -#include "device/device.hpp" -#include "device/blit.hpp" - -/*! 
\addtogroup HSA Blit Implementation - * @{ - */ - -//! HSA Blit Manager Implementation -namespace roc { - -class Device; -class Kernel; -class Memory; -class VirtualGPU; - -//! DMA Blit Manager -class HsaBlitManager : public device::HostBlitManager -{ -public: - //! Constructor - HsaBlitManager( - device::VirtualDevice& vdev, //!< Virtual GPU to be used for blits - Setup setup = Setup() //!< Specifies HW accelerated blits - ); - - //! Destructor - virtual ~HsaBlitManager() { - if (completion_signal_.handle != 0) { - hsa_signal_destroy(completion_signal_); - } - } - - //! Creates HostBlitManager object - virtual bool create(amd::Device& device) { - if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, NULL, &completion_signal_)) { - return false; - } - return true; - } - - //! Copies a buffer object to system memory - virtual bool readBuffer( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies a buffer object to system memory - virtual bool readBufferRect( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destinaiton host memory - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! 
Copies an image object to system memory - virtual bool readImage( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies system memory to a buffer object - virtual bool writeBuffer( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies system memory to a buffer object - virtual bool writeBufferRect( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies system memory to an image object - virtual bool writeImage( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; - - //! 
Copies a buffer object to another buffer object - virtual bool copyBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies a buffer object to another buffer object - virtual bool copyBufferRect( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& srcRect, //!< Source rectangle - const amd::BufferRect& dstRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies an image object to a buffer object - virtual bool copyImageToBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; - - //! Copies a buffer object to an image object - virtual bool copyBufferToImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; - - //! 
Copies an image object to another image object - virtual bool copyImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Fills a buffer memory with a pattern data - virtual bool fillBuffer( - device::Memory& memory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - size_t patternSize, //!< Pattern size - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Fills an image memory with a pattern data - virtual bool fillImage( - device::Memory& dstMemory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - -protected: - //! Returns the virtual GPU object - VirtualGPU& gpu() const { return static_cast(vDev_); } - -private: - //! Handle of Hsa Device object - const roc::Device& roc_device_; - - hsa_signal_t completion_signal_; - - //! Assits in transferring data from Host to Local or vice versa - //! taking into account the Hsail profile supported by Hsa Agent - bool hsaCopy( - const void *hostSrc, //!< Contains source data to be copied - void *hostDst, //!< Destination buffer address for copying - uint32_t size, //!< Size of data to copy in bytes - bool hostToDev //!< True if data is copied from Host To Device - ) const; - - //! Disable copy constructor - HsaBlitManager(const HsaBlitManager&); - - //! Disable operator= - HsaBlitManager& operator=(const HsaBlitManager&); -}; - -//! 
Kernel Blit Manager -//class KernelBlitManager : public HsaBlitManager -class KernelBlitManager : public HsaBlitManager -{ -private: - VirtualGPU& gpu() const { return static_cast(vDev_); } -public: - enum { - BlitCopyImage = 0, - BlitCopyImage1DA, - BlitCopyImageToBuffer, - BlitCopyBufferToImage, - BlitCopyBufferRect, - BlitCopyBufferRectAligned, - BlitCopyBuffer, - BlitCopyBufferAligned, - FillBuffer, - FillImage, - BlitTotal - }; - - //! Constructor - KernelBlitManager( - device::VirtualDevice& vdev, //!< Virtual GPU to be used for blits - Setup setup = Setup() //!< Specifies HW accelerated blits - ); - - //! Destructor - virtual ~KernelBlitManager(); - - //! Creates HostBlitManager object - virtual bool create(amd::Device& device); - - //! Copies a buffer object to system memory - virtual bool readBuffer( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies a buffer object to system memory - virtual bool readBufferRect( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destinaiton host memory - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies an image object to system memory - virtual bool readImage( - device::Memory& srcMemory, //!< Source memory object - void* dstHost, //!< Destination host memory - const amd::Coord3D& origin, //!< Source origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; - - //! 
Copies system memory to a buffer object - virtual bool writeBuffer( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies system memory to a buffer object - virtual bool writeBufferRect( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& hostRect, //!< Destination rectangle - const amd::BufferRect& bufRect, //!< Source rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies system memory to an image object - virtual bool writeImage( - const void* srcHost, //!< Source host memory - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - size_t rowPitch, //!< Row pitch for host memory - size_t slicePitch, //!< Slice pitch for host memory - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies a buffer object to another buffer object - virtual bool copyBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! 
Copies a buffer object to another buffer object - virtual bool copyBufferRect( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::BufferRect& srcRect, //!< Source rectangle - const amd::BufferRect& dstRect, //!< Destination rectangle - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Copies an image object to a buffer object - virtual bool copyImageToBuffer( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; - - //! Copies a buffer object to an image object - virtual bool copyBufferToImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false, //!< Entire buffer will be updated - size_t rowPitch = 0, //!< Pitch for buffer - size_t slicePitch = 0 //!< Slice for buffer - ) const; - - //! Copies an image object to another image object - virtual bool copyImage( - device::Memory& srcMemory, //!< Source memory object - device::Memory& dstMemory, //!< Destination memory object - const amd::Coord3D& srcOrigin, //!< Source origin - const amd::Coord3D& dstOrigin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! 
Fills a buffer memory with a pattern data - virtual bool fillBuffer( - device::Memory& memory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - size_t patternSize, //!< Pattern size - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - - //! Fills an image memory with a pattern data - virtual bool fillImage( - device::Memory& dstMemory, //!< Memory object to fill with pattern - const void* pattern, //!< Pattern data - const amd::Coord3D& origin, //!< Destination origin - const amd::Coord3D& size, //!< Size of the copy region - bool entire = false //!< Entire buffer will be updated - ) const; - -private: - //! Disable copy constructor - KernelBlitManager(const KernelBlitManager&); - - //! Disable operator= - KernelBlitManager& operator=(const KernelBlitManager&); - - //! Creates a program for all blit operations - bool createProgram( - Device& device //!< Device object - ); - - amd::Image::Format filterFormat(amd::Image::Format oldFormat) const; - - device::Memory *createImageView( - device::Memory &parent, - amd::Image::Format newFormat) const; - - amd::Context *context_; //!< A dummy context - amd::Program *program_; //!< GPU program obejct - amd::Kernel *kernels_[BlitTotal]; //!< GPU kernels for blit -}; - -static const char* BlitName[KernelBlitManager::BlitTotal] = { - "copyImage", - "copyImage1DA", - "copyImageToBuffer", - "copyBufferToImage", - "copyBufferRect", - "copyBufferRectAligned", - "copyBuffer", - "copyBufferAligned", - "fillBuffer", - "fillImage" - }; - -/*@}*/ -} // namespace roc - diff --git a/projects/clr/rocclr/runtime/device/rocm/roccompiler.cpp b/projects/clr/rocclr/runtime/device/rocm/roccompiler.cpp deleted file mode 100644 index 6e89f63b08..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/roccompiler.cpp +++ /dev/null @@ -1,160 +0,0 @@ -// -// Copyright (c) 2008 Advanced 
Micro Devices, Inc. All rights reserved. -// -#ifndef WITHOUT_HSA_BACKEND - -#include -#include -#include -#include - -#include "os/os.hpp" -#include "rocdevice.hpp" -#include "rocprogram.hpp" -#include "roccompilerlib.hpp" -#include "utils/options.hpp" -#include - -//CLC_IN_PROCESS_CHANGE -extern int openclFrontEnd(const char* cmdline, std::string*, std::string* typeInfo = NULL); - -namespace roc { - -/* Temporary log function for the compiler library */ -static void logFunction(const char* msg, size_t size) -{ - std::cout<< "Compiler Log: " << msg << std::endl; -} - -static int programsCount = 0; - -bool -HSAILProgram::compileImpl(const std::string& sourceCode, - const std::vector& headers, - const char** headerIncludeNames, - amd::option::Options* options) -{ - acl_error errorCode; - aclTargetInfo target; - - //Defaulting to bonaire - //Todo (sramalin) : Query the device for asic type- - //Defaulting to Bonair for now. - target = g_complibApi._aclGetTargetInfo(LP64_SWITCH("hsail","hsail64"), "Bonaire", - &errorCode); - - //end if asic info is ready - // We dump the source code for each program (param: headers) - // into their filenames (headerIncludeNames) into the TEMP - // folder specific to the OS and add the include path while - // compiling - - //Find the temp folder for the OS - std::string tempFolder = amd::Os::getEnvironment("TEMP"); - if (tempFolder.empty()) { - tempFolder = amd::Os::getEnvironment("TMP"); - if (tempFolder.empty()) { - tempFolder = WINDOWS_SWITCH(".","/tmp");; - } - } - //Iterate through each source code and dump it into tmp - std::fstream f; - std::vector headerFileNames(headers.size()); - std::vector newDirs; - for (size_t i = 0; i < headers.size(); ++i) { - std::string headerPath = tempFolder; - std::string headerIncludeName(headerIncludeNames[i]); - // replace / in path with current os's file separator - if ( amd::Os::fileSeparator() != '/') { - for (std::string::iterator it = headerIncludeName.begin(), - end = 
headerIncludeName.end(); - it != end; - ++it) { - if (*it == '/') *it = amd::Os::fileSeparator(); - } - } - size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator()); - if (pos != std::string::npos) { - headerPath += amd::Os::fileSeparator(); - headerPath += headerIncludeName.substr(0, pos); - headerIncludeName = headerIncludeName.substr(pos+1); - } - if (!amd::Os::pathExists(headerPath)) { - bool ret = amd::Os::createPath(headerPath); - assert(ret && "failed creating path!"); - newDirs.push_back(headerPath); - } - std::string headerFullName - = headerPath + amd::Os::fileSeparator() + headerIncludeName; - headerFileNames[i] = headerFullName; - f.open(headerFullName.c_str(), std::fstream::out); - //Should we allow asserts - assert(!f.fail() && "failed creating header file!"); - f.write(headers[i]->c_str(), headers[i]->length()); - f.close(); - } - - //Create Binary - binaryElf_ = g_complibApi._aclBinaryInit(sizeof(aclBinary), - &target, - &binOpts_, - &errorCode); - - if( errorCode!=ACL_SUCCESS ) { - buildLog_ += "Error while compiling opencl source:\ - aclBinary init failure \n"; - LogWarning("aclBinaryInit failed"); - return false; - } - - //Insert opencl into binary - errorCode = g_complibApi._aclInsertSection(device().compiler(), - binaryElf_, - sourceCode.c_str(), - strlen(sourceCode.c_str()), - aclSOURCE); - - if ( errorCode != ACL_SUCCESS ) { - buildLog_ += "Error while converting to BRIG: \ - Inserting openCl Source \n"; - } - - //Set the options for the compiler - //Set the include path for the temp folder that contains the includes - if(!headers.empty()) { - this->compileOptions_.append(" -I"); - this->compileOptions_.append(tempFolder); - } - - //Add only for CL2.0 and later - if (options->oVariables->CLStd[2] >= '2') { - std::stringstream opts; - opts << " -D" << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=" - << device().info().maxGlobalVariableSize_; - compileOptions_.append(opts.str()); - } - - //Compile source to IR - 
this->compileOptions_.append(hsailOptions()); - - errorCode = g_complibApi._aclCompile(device().compiler(), - binaryElf_, - //"-Wf,--support_all_extensions", - this->compileOptions_.c_str(), - ACL_TYPE_OPENCL, - ACL_TYPE_LLVMIR_BINARY, - logFunction); - buildLog_ += g_complibApi._aclGetCompilerLog(device().compiler()); - if( errorCode!=ACL_SUCCESS ) { - LogWarning("aclCompile failed"); - buildLog_ += "Error while compiling \ - opencl source: Compiling CL to IR"; - return false; - } - // Save the binary in the interface class - saveBinaryAndSetType(TYPE_COMPILED); - return true; - -} -} -#endif // WITHOUT_GPU_BACKEND diff --git a/projects/clr/rocclr/runtime/device/rocm/roccompilerlib.cpp b/projects/clr/rocclr/runtime/device/rocm/roccompilerlib.cpp deleted file mode 100644 index e933d5c393..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/roccompilerlib.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include "roccompilerlib.hpp" -#include "utils/flags.hpp" - -#include "acl.h" - -namespace roc { - -void* g_complibModule = NULL; -struct CompLibApi g_complibApi; - -// -// g_complibModule is defined in LoadCompLib(). This macro must be used only in LoadCompLib() function. 
-// -#define LOADSYMBOL(api) \ - g_complibApi._##api = (pfn_##api) amd::Os::getSymbol(g_complibModule, #api); \ - if( g_complibApi._##api == NULL ) { \ - LogError ("amd::Os::getSymbol() for exported func " #api " failed."); \ - amd::Os::unloadLibrary(g_complibModule); \ - return false; \ - } - -bool LoadCompLib(bool offline) -{ - g_complibModule = amd::Os::loadLibrary("amdhsacl" LP64_SWITCH(LINUX_SWITCH("32",""), "64")); - if( g_complibModule == NULL ) { - if (!offline) { - LogError( "amd::Os::loadLibrary() for loading of amdhsacl.dll failed."); - } - return false; - } - - LOADSYMBOL(aclCompilerInit) - LOADSYMBOL(aclGetTargetInfo) - LOADSYMBOL(aclBinaryInit) - LOADSYMBOL(aclInsertSection) - LOADSYMBOL(aclCompile) - LOADSYMBOL(aclCompilerFini) - LOADSYMBOL(aclBinaryFini) - LOADSYMBOL(aclWriteToMem) - LOADSYMBOL(aclQueryInfo) - LOADSYMBOL(aclExtractSymbol) - LOADSYMBOL(aclGetCompilerLog) - LOADSYMBOL(aclCreateFromBinary) - LOADSYMBOL(aclReadFromMem) - LOADSYMBOL(aclBinaryVersion) - LOADSYMBOL(aclLink) - - return true; -} - -void UnloadCompLib() -{ - if( g_complibModule ) - { - amd::Os::unloadLibrary(g_complibModule); - } -} - -} // namespace roc diff --git a/projects/clr/rocclr/runtime/device/rocm/roccompilerlib.hpp b/projects/clr/rocclr/runtime/device/rocm/roccompilerlib.hpp deleted file mode 100644 index ca31bd9216..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/roccompilerlib.hpp +++ /dev/null @@ -1,77 +0,0 @@ -#pragma once - -// -// This file hsa the code for explicity loading amdoclcl.dll. -// Exported functions from amdoclcl.dll can be added for usage as need-basis. -// With explicit/dynamic loading roc will not have any linkage to amdoclcl.lib. 
-// - -#include "thread/thread.hpp" -#include "acl.h" -#include "utils/debug.hpp" - -using namespace amd; - -namespace roc { - -// -// To use any new exported function from amdhsacl.dll please add/make that function specific changes -// in typedef below, struct CompLibApi and in hsacompilerLib.cpp::LoadCompLib() function. -// - -// -// Convention: The typedefed function name must be prefixed with pfn_ -// -typedef aclCompiler* (ACL_API_ENTRY *pfn_aclCompilerInit) (aclCompilerOptions *opts, acl_error *error_code); -typedef aclTargetInfo (ACL_API_ENTRY *pfn_aclGetTargetInfo) (const char*, const char*, acl_error*); -typedef aclBinary* (ACL_API_ENTRY *pfn_aclBinaryInit) (size_t, const aclTargetInfo*, const aclBinaryOptions*, acl_error*); -typedef acl_error (ACL_API_ENTRY *pfn_aclInsertSection) (aclCompiler *cl, aclBinary *binary, const void *data, size_t data_size, aclSections id); -typedef acl_error (ACL_API_ENTRY *pfn_aclCompile) (aclCompiler *cl, aclBinary *bin, const char *options, aclType from, aclType to, aclLogFunction compile_callback); -typedef acl_error (ACL_API_ENTRY *pfn_aclCompilerFini) (aclCompiler *cl); -typedef acl_error (ACL_API_ENTRY *pfn_aclBinaryFini) (aclBinary *bin); -typedef acl_error (ACL_API_ENTRY *pfn_aclWriteToMem) (aclBinary *bin,void **mem, size_t *size); -typedef acl_error (ACL_API_ENTRY *pfn_aclQueryInfo) (aclCompiler *cl, const aclBinary *binary, aclQueryType query, const char *kernel, void *data_ptr, size_t *ptr_size); -typedef const void* (ACL_API_ENTRY *pfn_aclExtractSymbol) (aclCompiler *cl,const aclBinary *binary,size_t *size,aclSections id,const char *symbol,acl_error *error_code); -typedef aclBinary* (ACL_API_ENTRY *pfn_aclReadFromMem) (void *mem,size_t size, acl_error *error_code); -typedef char* (ACL_API_ENTRY *pfn_aclGetCompilerLog) (aclCompiler* cl); -typedef aclBinary* (ACL_API_ENTRY *pfn_aclCreateFromBinary) (const aclBinary *binary,aclBIFVersion version); -typedef aclBIFVersion (ACL_API_ENTRY *pfn_aclBinaryVersion) (const 
aclBinary *binary); -typedef acl_error (ACL_API_ENTRY *pfn_aclLink) (aclCompiler* cl, aclBinary *src_bin, unsigned int num_libs, aclBinary **libs, aclType link_mode,const char* options, aclLogFunction link_callback); -// -// Convention: prefix struct member variable with with underscore '_' -// would be nice if there was no underscore prfix, but on Linux the token -// pasting in the macro is srtict and his is the workaround. -// -struct CompLibApi -{ - pfn_aclCompilerInit _aclCompilerInit; - pfn_aclGetTargetInfo _aclGetTargetInfo; - pfn_aclBinaryInit _aclBinaryInit; - pfn_aclInsertSection _aclInsertSection; - pfn_aclCompile _aclCompile; - pfn_aclCompilerFini _aclCompilerFini; - pfn_aclBinaryFini _aclBinaryFini; - pfn_aclWriteToMem _aclWriteToMem; - pfn_aclQueryInfo _aclQueryInfo; - pfn_aclExtractSymbol _aclExtractSymbol; - pfn_aclReadFromMem _aclReadFromMem; - pfn_aclGetCompilerLog _aclGetCompilerLog; - pfn_aclCreateFromBinary _aclCreateFromBinary; - pfn_aclBinaryVersion _aclBinaryVersion; - pfn_aclLink _aclLink; -}; - - -// -// Use g_ prefix for all global variables. -// -extern void* g_complibModule; -extern CompLibApi g_complibApi; - -// Note: initializes global variable g_complibApi. -// Not sure what error values we have, for now returning false on failure. 
-bool LoadCompLib(bool isOfflineDevice=false); -void UnloadCompLib(); - -} // namespace roc - diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp deleted file mode 100644 index d94a10a1d7..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp +++ /dev/null @@ -1,49 +0,0 @@ -#pragma once - -#ifndef WITHOUT_HSA_BACKEND - -namespace roc { - -typedef uint HsaDeviceId; - -struct AMDDeviceInfo { - HsaDeviceId hsaDeviceId_; //!< Machine id - const char* targetName_; //!< Target name for compilation - const char* machineTarget_; //!< Machine target - uint simdPerCU_; //!< Number of SIMDs per CU - uint simdWidth_; //!< Number of workitems processed per SIMD - uint simdInstructionWidth_; //!< Number of instructions processed per SIMD - uint memChannelBankWidth_; //!< Memory channel bank width - uint localMemSizePerCU_; //!< Local memory size per CU - uint localMemBanks_; //!< Number of banks of local memory - uint pciDeviceId; //!< PCIe device id -}; - -//The device ID must match with the device's index into DeviceInfo -const HsaDeviceId HSA_SPECTRE_ID = 0; -const HsaDeviceId HSA_SPOOKY_ID = 1; -const HsaDeviceId HSA_TONGA_ID = 2; -const HsaDeviceId HSA_CARRIZO_ID = 3; -const HsaDeviceId HSA_ICELAND_ID = 4; -const HsaDeviceId HSA_FIJI_ID = 5; -const HsaDeviceId HSA_HAWAII_ID = 6; -const HsaDeviceId HSA_ELLESMERE_ID = 7; -const HsaDeviceId HSA_BAFFIN_ID = 8; -const HsaDeviceId HSA_INVALID_DEVICE_ID = -1; - -static const AMDDeviceInfo DeviceInfo[] = { - // targetName machineTarget - /* TARGET_KAVERI_SPECTRE */ {HSA_SPECTRE_ID, "Spectre", "Spectre", 4, 16, 1, 256, 64 * Ki, 32, 0 }, - /* TARGET_KAVERI_SPOOKY */ {HSA_SPOOKY_ID, "Spooky", "Spooky", 4, 16, 1, 256, 64 * Ki, 32, 0 }, - /* TARGET_TONGA */ {HSA_TONGA_ID, "Tonga", "Tonga", 4, 16, 1, 256, 64 * Ki, 32, 0}, - /* TARGET_CARRIZO */ {HSA_CARRIZO_ID, "Carrizo", "Carrizo", 4, 16, 1, 256, 64 * Ki, 32, 0}, - /* TARGET_ICELAND */ {HSA_ICELAND_ID, 
"Topaz", "Topaz", 4, 16, 1, 256, 64 * Ki, 32, 0}, - /* TARGET_FIJI */ {HSA_FIJI_ID, "Fiji", "Fiji", 4, 16, 1, 256, 64 * Ki, 32, 0 }, - /* TARGET HAWAII */ {HSA_HAWAII_ID, "Hawaii", "Hawaii", 4, 16, 1, 256, 64 * Ki, 32, 0 }, - /* TARGET ELLESMERE */ {HSA_ELLESMERE_ID, "Ellesmere", "Ellesmere", 4, 16, 1, 256, 64 * Ki, 32, 0 }, - /* TARGET BAFFIN */ {HSA_BAFFIN_ID, "Baffin", "Baffin", 4, 16, 1, 256, 64 * Ki, 32, 0 } -}; - -} -#endif - diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp deleted file mode 100644 index 71a0f7d1c5..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp +++ /dev/null @@ -1,1296 +0,0 @@ -// -// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. -// - -#ifndef WITHOUT_HSA_BACKEND - -#include "platform/program.hpp" -#include "platform/kernel.hpp" -#include "os/os.hpp" -#include "utils/debug.hpp" -#include "utils/flags.hpp" -#include "utils/versions.hpp" -#include "thread/monitor.hpp" -#include "CL/cl_ext.h" - -#include "amdocl/cl_common.hpp" -#include "device/rocm/rocdevice.hpp" -#include "device/rocm/rocblit.hpp" -#include "device/rocm/rocvirtual.hpp" -#include "device/rocm/rocprogram.hpp" -#include "device/rocm/roccompilerlib.hpp" -#include "device/rocm/rocmemory.hpp" -#include "device/rocm/rocglinterop.hpp" -#include "kv_id.h" -#include "vi_id.h" -#include "cz_id.h" -#include "ci_id.h" -#include -#include -#include -#include -#include -#include -#endif // WITHOUT_HSA_BACKEND - -#define OPENCL_VERSION_STR XSTR(OPENCL_MAJOR) "." 
XSTR(OPENCL_MINOR) - -#ifndef WITHOUT_HSA_BACKEND -namespace device { -extern const char* BlitSourceCode; -} - -namespace roc { -aclCompiler* NullDevice::compilerHandle_; -bool roc::Device::isHsaInitialized_ = false; -hsa_agent_t roc::Device::cpu_agent_ = { 0 }; -std::vector roc::Device::gpu_agents_; -const bool roc::Device::offlineDevice_ = false; -const bool roc::NullDevice::offlineDevice_= true; - - -static HsaDeviceId getHsaDeviceId(hsa_agent_t device, uint32_t& pci_id) { - /* - * Use the device id to determine the ASIC family - */ - // TODO: translate from hsa_agent to internal AMD device id. - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - device, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CHIP_ID, - &pci_id)) { - return HSA_INVALID_DEVICE_ID; - } - - switch (pci_id) { - case DEVICE_ID_SPECTRE_MOBILE: - case DEVICE_ID_SPECTRE_DESKTOP: - case DEVICE_ID_SPECTRE_LITE_MOBILE_1309: - case DEVICE_ID_SPECTRE_LITE_MOBILE_130A: - case DEVICE_ID_SPECTRE_SL_MOBILE_130B: - case DEVICE_ID_SPECTRE_MOBILE_130C: - case DEVICE_ID_SPECTRE_LITE_MOBILE_130D: - case DEVICE_ID_SPECTRE_SL_MOBILE_130E: - case DEVICE_ID_SPECTRE_DESKTOP_130F: - case DEVICE_ID_SPECTRE_WORKSTATION_1310: - case DEVICE_ID_SPECTRE_WORKSTATION_1311: - case DEVICE_ID_SPECTRE_LITE_DESKTOP_1313: - case DEVICE_ID_SPECTRE_SL_DESKTOP_1315: - case DEVICE_ID_SPECTRE_SL_MOBILE_1318: - case DEVICE_ID_SPECTRE_SL_EMBEDDED_131B: - case DEVICE_ID_SPECTRE_EMBEDDED_131C: - case DEVICE_ID_SPECTRE_LITE_EMBEDDED_131D: - return HSA_SPECTRE_ID; - case DEVICE_ID_SPOOKY_MOBILE: - case DEVICE_ID_SPOOKY_DESKTOP: - case DEVICE_ID_SPOOKY_DESKTOP_1312: - case DEVICE_ID_SPOOKY_DESKTOP_1316: - case DEVICE_ID_SPOOKY_MOBILE_1317: - return HSA_SPOOKY_ID; - case DEVICE_ID_VI_TONGA_P_6920: - case DEVICE_ID_VI_TONGA_P_6921: - case DEVICE_ID_VI_TONGA_P_6928: - case DEVICE_ID_VI_TONGA_P_692B: - case DEVICE_ID_VI_TONGA_P_692F: - case DEVICE_ID_VI_TONGA_P_6938: - case DEVICE_ID_VI_TONGA_P_6939: - return HSA_TONGA_ID; - case DEVICE_ID_VI_FIJI_P_7300: 
- return HSA_FIJI_ID; - case DEVICE_ID_CZ_9870: - case DEVICE_ID_CZ_9874: - case DEVICE_ID_CZ_9875: - case DEVICE_ID_CZ_9876: - case DEVICE_ID_CZ_9877: - return HSA_CARRIZO_ID; - case DEVICE_ID_VI_ICELAND_M_6900: - case DEVICE_ID_VI_ICELAND_M_6901: - case DEVICE_ID_VI_ICELAND_M_6902: - case DEVICE_ID_VI_ICELAND_M_6903: - case DEVICE_ID_VI_ICELAND_M_6907: - return HSA_ICELAND_ID; - case DEVICE_ID_CI_HAWAII_P_67A0: - case DEVICE_ID_CI_HAWAII_P_67A1: - case DEVICE_ID_CI_HAWAII_P_67A2: - case DEVICE_ID_CI_HAWAII_P_67A8: - case DEVICE_ID_CI_HAWAII_P_67A9: - case DEVICE_ID_CI_HAWAII_P_67AA: - case DEVICE_ID_CI_HAWAII_P_67B0: - case DEVICE_ID_CI_HAWAII_P_67B1: - case DEVICE_ID_CI_HAWAII_P_67B8: - case DEVICE_ID_CI_HAWAII_P_67B9: - case DEVICE_ID_CI_HAWAII_P_67BE: - return HSA_HAWAII_ID; - case DEVICE_ID_VI_ELLESMERE_P_67C0: - case DEVICE_ID_VI_ELLESMERE_P_67C1: - case DEVICE_ID_VI_ELLESMERE_P_67C2: - case DEVICE_ID_VI_ELLESMERE_P_67C4: - case DEVICE_ID_VI_ELLESMERE_P_67C7: - case DEVICE_ID_VI_ELLESMERE_P_67DF: - case DEVICE_ID_VI_ELLESMERE_P_67D0: - case DEVICE_ID_VI_ELLESMERE_P_67C8: - case DEVICE_ID_VI_ELLESMERE_P_67C9: - case DEVICE_ID_VI_ELLESMERE_P_67CA: - case DEVICE_ID_VI_ELLESMERE_P_67CC: - case DEVICE_ID_VI_ELLESMERE_P_67CF: - return HSA_ELLESMERE_ID; - case DEVICE_ID_VI_BAFFIN_M_67E0: - case DEVICE_ID_VI_BAFFIN_M_67E3: - case DEVICE_ID_VI_BAFFIN_M_67E8: - case DEVICE_ID_VI_BAFFIN_M_67EB: - case DEVICE_ID_VI_BAFFIN_M_67EF: - case DEVICE_ID_VI_BAFFIN_M_67FF: - case DEVICE_ID_VI_BAFFIN_M_67E1: - case DEVICE_ID_VI_BAFFIN_M_67E7: - case DEVICE_ID_VI_BAFFIN_M_67E9: - return HSA_BAFFIN_ID; - default: - return HSA_INVALID_DEVICE_ID; - } -} - -bool NullDevice::create(const AMDDeviceInfo& deviceInfo) { - online_ = false; - deviceInfo_ = deviceInfo; - // Mark the device as GPU type - info_.type_ = CL_DEVICE_TYPE_GPU | CL_HSA_ENABLED_AMD; - info_.vendorId_ = 0x1002; - - settings_ = new Settings(); - roc::Settings* hsaSettings = static_cast(settings_); - if ((hsaSettings == 
NULL) || - // @Todo sramalin Use double precision from constsant - !hsaSettings->create((true) & 0x1)) { - LogError("Error creating settings for NULL HSA device"); - return false; - } - // Report the device name - ::strcpy(info_.name_, "AMD HSA Device"); - info_.extensions_ = getExtensionString(); - info_.maxWorkGroupSize_ = hsaSettings->maxWorkGroupSize_; - ::strcpy(info_.vendor_, "Advanced Micro Devices, Inc."); - info_.oclcVersion_ = "OpenCL C " OPENCL_VERSION_STR " "; - strcpy(info_.driverVersion_, "1.0 Provisional (hsa)"); - info_.version_ = "OpenCL " OPENCL_VERSION_STR " "; - return true; -} - -Device::Device(hsa_agent_t bkendDevice) - : mapCacheOps_(nullptr) - , mapCache_(nullptr) - , _bkendDevice(bkendDevice) - , gpuvm_segment_max_alloc_(0) - , alloc_granularity_(0) - , context_(nullptr) - , xferQueue_(nullptr) -{ - group_segment_.handle = 0; - system_segment_.handle = 0; - system_coarse_segment_.handle = 0; - gpuvm_segment_.handle = 0; -} - -Device::~Device() -{ - // Release cached map targets - for (uint i = 0; mapCache_ != NULL && i < mapCache_->size(); ++i) { - if ((*mapCache_)[i] != NULL) { - (*mapCache_)[i]->release(); - } - } - delete mapCache_; - delete mapCacheOps_; - - // Destroy transfer queue - if (xferQueue_ && xferQueue_->terminate()) { - delete xferQueue_; - xferQueue_ = NULL; - } - - if (blitProgram_) { - delete blitProgram_; - blitProgram_ = NULL; - } - - if (context_ != NULL) { - context_->release(); - } - - if (info_.extensions_) { - delete[]info_.extensions_; - info_.extensions_ = NULL; - } - - if (settings_) { - delete settings_; - settings_ = NULL; - } -} -bool NullDevice::initCompiler(bool isOffline) { - // Initializes g_complibModule and g_complibApi if they were not initialized - if( g_complibModule == NULL ){ - if (!LoadCompLib(isOffline)) { - if (!isOffline) { - LogError("Error - could not find the compiler library"); - } - return false; - } - } - //Initialize the compiler handle if has already not been initialized - //This is 
destroyed in Device::teardown - acl_error error; - if (!compilerHandle_) { - compilerHandle_ = g_complibApi._aclCompilerInit(NULL, &error); - if (error != ACL_SUCCESS) { - LogError("Error initializing the compiler handle"); - return false; - } - } - return true; -} - -bool NullDevice::destroyCompiler() { - if (compilerHandle_ != NULL) { - acl_error error = g_complibApi._aclCompilerFini(compilerHandle_); - if (error != ACL_SUCCESS) { - LogError("Error closing the compiler"); - return false; - } - } - if( g_complibModule != NULL ){ - UnloadCompLib(); - } - return true; -} - -void NullDevice::tearDown() { - destroyCompiler(); -} -bool NullDevice::init() { - //Initialize the compiler - if (!initCompiler(offlineDevice_)){ - return false; - } - //If there is an HSA enabled device online then skip any offline device - std::vector devices; - devices = getDevices(CL_DEVICE_TYPE_GPU | CL_HSA_ENABLED_AMD, false); - - //Load the offline devices - //Iterate through the set of available offline devices - for (uint id = 0; id < sizeof(DeviceInfo)/sizeof(AMDDeviceInfo); id++) { - bool isOnline = false; - //Check if the particular device is online - for (unsigned int i=0; i< devices.size(); i++) { - if (static_cast(devices[i])->deviceInfo_.hsaDeviceId_ == - DeviceInfo[id].hsaDeviceId_){ - isOnline = true; - } - } - if (isOnline) { - continue; - } - NullDevice* nullDevice = new NullDevice(); - if (!nullDevice->create(DeviceInfo[id])) { - LogError("Error creating new instance of Device."); - delete nullDevice; - return false; - } - nullDevice->registerDevice(); - } - return true; -} -NullDevice::~NullDevice() { - if (info_.extensions_) { - delete[]info_.extensions_; - info_.extensions_ = NULL; - } - - if (settings_) { - delete settings_; - settings_ = NULL; - } -} - -hsa_status_t Device::iterateAgentCallback(hsa_agent_t agent, void *data) { - hsa_device_type_t dev_type = HSA_DEVICE_TYPE_CPU; - - hsa_status_t stat = - hsa_agent_get_info( - agent, HSA_AGENT_INFO_DEVICE, &dev_type); - - 
if (stat != HSA_STATUS_SUCCESS) { - return stat; - } - - if (dev_type == HSA_DEVICE_TYPE_CPU) { - Device::cpu_agent_ = agent; - return HSA_STATUS_SUCCESS; - } - - gpu_agents_.push_back(agent); - - assert(dev_type == HSA_DEVICE_TYPE_GPU); - Device *roc_device = new Device(agent); - if (!roc_device) { - LogError("Error creating new instance of Device on then heap."); - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - - if (!roc_device->mapHSADeviceToOpenCLDevice(agent)) { - LogError("Failed mapping of HsaDevice to Device."); - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - - if (!roc_device->create()) { - LogError("Error creating new instance of Device."); - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - - uint32_t pci_id; - HsaDeviceId deviceId = getHsaDeviceId(agent, pci_id); - if (deviceId == HSA_INVALID_DEVICE_ID) { - LogError(" Invalid HSA device"); - return HSA_STATUS_ERROR_INVALID_AGENT; - } - //Find device id in the table - unsigned sizeOfTable = sizeof(DeviceInfo) / sizeof(AMDDeviceInfo); - uint id; - for (id = 0; id < sizeOfTable; id++) { - if (DeviceInfo[id].hsaDeviceId_ == deviceId){ - break; - } - } - //If the AmdDeviceInfo for the HsaDevice Id could not be found return false - if (id == sizeOfTable) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - roc_device->deviceInfo_ = DeviceInfo[id]; - roc_device->deviceInfo_.pciDeviceId = pci_id; - - roc_device->registerDevice(); // no return code for this function - - return HSA_STATUS_SUCCESS; -} - -bool Device::init() { - LogInfo("Initializing HSA stack."); - - //Initialize the compiler - if (!initCompiler(offlineDevice_)){ - return false; - } - - if (HSA_STATUS_SUCCESS != hsa_init()) { - LogError("hsa_init failed."); - return false; - } - - if (HSA_STATUS_SUCCESS != - hsa_iterate_agents(iterateAgentCallback, NULL)) { - return false; - } - - return true; -} - -void -Device::tearDown() -{ - NullDevice::tearDown(); - hsa_shut_down(); -} - -bool -Device::create() -{ - if (!amd::Device::create()) { - return 
false; - } - - amd::Context::Info info = {0}; - std::vector devices; - devices.push_back(this); - - // Create a dummy context - context_ = new amd::Context(devices, info); - if (context_ == NULL) { - return false; - } - - blitProgram_ = new BlitProgram(context_); - // Create blit programs - if (blitProgram_ == NULL || !blitProgram_->create(this)) { - delete blitProgram_; - blitProgram_ = NULL; - LogError("Couldn't create blit kernels!"); - return false; - } - - mapCacheOps_ = new amd::Monitor("Map Cache Lock", true); - if (NULL == mapCacheOps_) { - return false; - } - - mapCache_ = new std::vector(); - if (mapCache_ == NULL) { - return false; - } - // Use just 1 entry by default for the map cache - mapCache_->push_back(NULL); - - xferQueue(); - - return true; -} - -device::Program* -NullDevice::createProgram(amd::option::Options* options) { - return new roc::HSAILProgram(*this); -} - -device::Program* -Device::createProgram(amd::option::Options* options) { - return new roc::HSAILProgram(*this); -} - -bool -Device::mapHSADeviceToOpenCLDevice(hsa_agent_t dev) -{ - // Create HSA settings - settings_ = new Settings(); - roc::Settings* hsaSettings = static_cast(settings_); - if ((hsaSettings == NULL) || - !hsaSettings->create((true) & 0x1)) { - return false; - } - - if (populateOCLDeviceConstants() == false) { - return false; - } - - // Setup System Memory to be Non-Coherent per user - // request via environment variable. 
By default the - // System Memory is setup to be Coherent - if (hsaSettings->enableNCMode_) { - hsa_status_t err = - hsa_amd_coherency_set_type(dev, HSA_AMD_COHERENCY_TYPE_NONCOHERENT); - if (err != HSA_STATUS_SUCCESS) { - LogError("Unable to set NC memory policy!"); - return false; - } - } - - return true; -} - -hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, - void* data) { - if (data == NULL) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - - hsa_region_segment_t segment_type = (hsa_region_segment_t)0; - hsa_status_t stat = - hsa_amd_memory_pool_get_info( - pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment_type); - if (stat != HSA_STATUS_SUCCESS) { - return stat; - } - - // TODO: system and device local segment - Device *dev = reinterpret_cast(data); - switch (segment_type) { - case HSA_REGION_SEGMENT_GLOBAL: { - if (dev->settings().enableLocalMemory_) { - dev->gpuvm_segment_ = pool; - } - break; - } - case HSA_REGION_SEGMENT_GROUP: - dev->group_segment_ = pool; - break; - default: - break; - } - - return HSA_STATUS_SUCCESS; -} - -hsa_status_t Device::iterateCpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, - void* data) { - if (data == NULL) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - - hsa_region_segment_t segment_type = (hsa_region_segment_t)0; - hsa_status_t stat = hsa_amd_memory_pool_get_info( - pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment_type); - if (stat != HSA_STATUS_SUCCESS) { - return stat; - } - - Device* dev = reinterpret_cast(data); - switch (segment_type) { - case HSA_REGION_SEGMENT_GLOBAL: { - uint32_t global_flag = 0; - hsa_status_t stat = hsa_amd_memory_pool_get_info( - pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); - if (stat != HSA_STATUS_SUCCESS) { - return stat; - } - - if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) { - dev->system_segment_ = pool; - } else { - dev->system_coarse_segment_ = pool; - } - break; - } - default: - break; - } - - return HSA_STATUS_SUCCESS; -} 
- -bool -Device::populateOCLDeviceConstants() -{ - info_.available_ = true; - - roc::Settings* hsa_settings = static_cast(settings_); - - strcpy(info_.name_, "AMD HSA Device"); - - char device_name[64] = { 0 }; - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - _bkendDevice, HSA_AGENT_INFO_NAME, device_name)) { - return false; - } - - strcpy(info_.boardName_, device_name); - - if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice, - HSA_AGENT_INFO_PROFILE, - &agent_profile_)) { - return false; - } - - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - _bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, - &info_.maxComputeUnits_)) { - return false; - } - assert(info_.maxComputeUnits_ > 0); - - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - _bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE, - &info_.globalMemCacheLineSize_)) { - return false; - } - assert(info_.globalMemCacheLineSize_ > 0); - - uint32_t cachesize[4] = { 0 }; - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - _bkendDevice, HSA_AGENT_INFO_CACHE_SIZE, cachesize)) { - return false; - } - assert(cachesize[0] > 0); - info_.globalMemCacheSize_ = cachesize[0]; - - info_.globalMemCacheType_ = CL_READ_WRITE_CACHE; - - info_.type_ = CL_DEVICE_TYPE_GPU | CL_HSA_ENABLED_AMD; - - uint32_t hsa_bdf_id = 0; - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - _bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &hsa_bdf_id)) { - return false; - } - - info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD; - info_.deviceTopology_.pcie.bus = (hsa_bdf_id & (0xFF << 8)) >> 8; - info_.deviceTopology_.pcie.device = (hsa_bdf_id & (0x1F << 3)) >> 3; - info_.deviceTopology_.pcie.function = (hsa_bdf_id & 0x07); - info_.extensions_ = getExtensionString(); - info_.nativeVectorWidthDouble_ = - info_.preferredVectorWidthDouble_ = (settings().doublePrecision_) ? 
1 : 0; - - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - _bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, - &info_.maxClockFrequency_)) { - return false; - } - assert(info_.maxClockFrequency_ > 0); - - if (HSA_STATUS_SUCCESS != - hsa_amd_agent_iterate_memory_pools( - cpu_agent_, Device::iterateCpuMemoryPoolCallback, this)) { - return false; - } - - assert(system_segment_.handle != 0); - - if (HSA_STATUS_SUCCESS != - hsa_amd_agent_iterate_memory_pools( - _bkendDevice, Device::iterateGpuMemoryPoolCallback, this)) { - return false; - } - - assert(group_segment_.handle != 0); - - size_t group_segment_size = 0; - if (HSA_STATUS_SUCCESS != - hsa_amd_memory_pool_get_info( - group_segment_, HSA_AMD_MEMORY_POOL_INFO_SIZE, &group_segment_size)) { - return false; - } - assert(group_segment_size > 0); - - info_.localMemSizePerCU_ = group_segment_size; - info_.localMemSize_ = group_segment_size; - - info_.maxWorkItemDimensions_ = 3; - - if (settings().enableLocalMemory_ && gpuvm_segment_.handle != 0) { - size_t global_segment_size = 0; - if (HSA_STATUS_SUCCESS != - hsa_amd_memory_pool_get_info(gpuvm_segment_, - HSA_AMD_MEMORY_POOL_INFO_SIZE, - &global_segment_size)) { - return false; - } - - assert(global_segment_size > 0); - info_.globalMemSize_ = static_cast(global_segment_size); - - gpuvm_segment_max_alloc_ = - cl_ulong(info_.globalMemSize_ * - std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u); - assert(gpuvm_segment_max_alloc_ > 0); - - info_.maxMemAllocSize_ = - static_cast(gpuvm_segment_max_alloc_); - - if (HSA_STATUS_SUCCESS != - hsa_amd_memory_pool_get_info(gpuvm_segment_, - HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, - &alloc_granularity_)) { - return false; - } - - assert(alloc_granularity_ > 0); - } - else { - static const cl_ulong kDefaultGlobalMemSize = cl_ulong(1 * Gi); - info_.globalMemSize_ = kDefaultGlobalMemSize; - info_.maxMemAllocSize_ = info_.globalMemSize_ / 4; - - if (HSA_STATUS_SUCCESS != - 
hsa_amd_memory_pool_get_info(system_segment_, - HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, - &alloc_granularity_)) { - return false; - } - } - - // Make sure the max allocation size is not larger than the available - // memory size. - info_.maxMemAllocSize_ = - std::min(info_.maxMemAllocSize_, info_.globalMemSize_); - - /*make sure we don't run anything over 8 params for now*/ - info_.maxParameterSize_ = 1024; // [TODO]: CAL stack values: 1024* - // constant - - uint32_t max_work_group_size = settings().maxWorkGroupSize_; - /* - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - _bkendDevice, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &max_work_group_size)) { - return false; - } - */ - assert(max_work_group_size > 0); - info_.maxWorkGroupSize_ = max_work_group_size; - - uint16_t max_workgroup_size[3] = { 0, 0, 0 }; - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - _bkendDevice, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, &max_workgroup_size)) { - return false; - } - - assert(max_workgroup_size[0] != 0 && max_workgroup_size[1] != 0 && - max_workgroup_size[2] != 0); - - info_.maxWorkItemSizes_[0] = max_workgroup_size[0]; - info_.maxWorkItemSizes_[1] = max_workgroup_size[1]; - info_.maxWorkItemSizes_[2] = max_workgroup_size[2]; - - info_.nativeVectorWidthChar_ = info_.preferredVectorWidthChar_ = 4; - info_.nativeVectorWidthShort_ = info_.preferredVectorWidthShort_ = 2; - info_.nativeVectorWidthInt_ = info_.preferredVectorWidthInt_ = 1; - info_.nativeVectorWidthLong_ = info_.preferredVectorWidthLong_ = 1; - info_.nativeVectorWidthFloat_ = info_.preferredVectorWidthFloat_ = 1; - - info_.hostUnifiedMemory_ = CL_TRUE; - info_.memBaseAddrAlign_ = 8 * (flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ? 
- sizeof(cl_long16) : MEMOBJ_BASE_ADDR_ALIGN); - info_.minDataTypeAlignSize_ = sizeof(cl_long16); - - info_.maxConstantArgs_ = 8; - info_.maxConstantBufferSize_ = 64 * 1024; - info_.localMemType_ = CL_LOCAL; - info_.errorCorrectionSupport_ = false; - info_.profilingTimerResolution_ = 1; - info_.littleEndian_ = true; - info_.compilerAvailable_ = true; - info_.executionCapabilities_ = CL_EXEC_KERNEL; - info_.queueProperties_ = CL_QUEUE_PROFILING_ENABLE; - info_.platform_ = AMD_PLATFORM; - info_.profile_ = "FULL_PROFILE"; - strcpy(info_.vendor_, "Advanced Micro Devices, Inc."); - - info_.addressBits_ = LP64_SWITCH(32, 64); - info_.maxSamplers_ = 16; - info_.bufferFromImageSupport_ = CL_FALSE; - info_.oclcVersion_ = "OpenCL C " OPENCL_VERSION_STR " "; - strcpy(info_.driverVersion_, "1.0 Provisional (hsa)"); - info_.version_ = "OpenCL " OPENCL_VERSION_STR " "; - - info_.builtInKernels_ = ""; - info_.linkerAvailable_ = true; - info_.preferredInteropUserSync_ = true; - info_.printfBufferSize_ = PrintfDbg::WorkitemDebugSize * info().maxWorkGroupSize_; - info_.vendorId_ = 0x1002; // AMD's PCIe vendor id - - info_.maxGlobalVariableSize_ = static_cast(info_.maxMemAllocSize_); - info_.globalVariablePreferredTotalSize_ = - static_cast(info_.globalMemSize_); - - // Populate the single config setting. 
- info_.singleFPConfig_ = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | - CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_FMA; - - if (hsa_settings->doublePrecision_) { - info_.doubleFPConfig_ = info_.singleFPConfig_ | CL_FP_DENORM; - info_.singleFPConfig_ |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; - } - info_.preferredPlatformAtomicAlignment_ = 0; - info_.preferredGlobalAtomicAlignment_ = 0; - info_.preferredLocalAtomicAlignment_ = 0; - - uint8_t hsa_extensions[128]; - if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice, - HSA_AGENT_INFO_EXTENSIONS, - hsa_extensions)) { - return false; - } - - assert(HSA_EXTENSION_IMAGES < 8); - const bool image_is_supported = - ((hsa_extensions[0] & (1 << HSA_EXTENSION_IMAGES)) != 0); - if (image_is_supported) { - // Images - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info(_bkendDevice, - static_cast( - HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS), - &info_.maxSamplers_)) { - return false; - } - - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info(_bkendDevice, - static_cast( - HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES), - &info_.maxReadImageArgs_)) { - return false; - } - - // TODO: no attribute for write image. 
- info_.maxWriteImageArgs_ = 8; - - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info(_bkendDevice, - static_cast( - HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES), - &info_.maxReadWriteImageArgs_)) { - return false; - } - - uint32_t image_max_dim[3]; - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info(_bkendDevice, - static_cast( - HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS), - &image_max_dim)) { - return false; - } - - info_.image2DMaxWidth_ = image_max_dim[0]; - info_.image2DMaxHeight_ = image_max_dim[1]; - - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info(_bkendDevice, - static_cast( - HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS), - &image_max_dim)) { - return false; - } - - info_.image3DMaxWidth_ = image_max_dim[0]; - info_.image3DMaxHeight_ = image_max_dim[1]; - info_.image3DMaxDepth_ = image_max_dim[2]; - - uint32_t max_array_size = 0; - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info(_bkendDevice, - static_cast( - HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS), - &max_array_size)) { - return false; - } - - info_.imageMaxArraySize_ = max_array_size; - - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info(_bkendDevice, - static_cast( - HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS), - &image_max_dim)) { - return false; - } - info_.imageMaxBufferSize_ = image_max_dim[0]; - - info_.imagePitchAlignment_ = 256; - - info_.imageBaseAddressAlignment_ = 256; - - info_.bufferFromImageSupport_ = CL_FALSE; - - info_.imageSupport_ = - (info_.maxReadWriteImageArgs_ > 0) ? CL_TRUE : CL_FALSE; - } - - // Enable SVM Capabilities of Hsa device. 
Ensure - // user has not setup memory to be non-coherent - info_.svmCapabilities_ = 0; - if (hsa_settings->enableNCMode_ == false) { - info_.svmCapabilities_ = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER; - info_.svmCapabilities_ |= CL_DEVICE_SVM_FINE_GRAIN_BUFFER; - // Report fine-grain system only on full profile - if (agent_profile_ == HSA_PROFILE_FULL) { - info_.svmCapabilities_ |= CL_DEVICE_SVM_FINE_GRAIN_SYSTEM; - } - info_.svmCapabilities_ |= CL_DEVICE_SVM_ATOMICS; - } - - return true; -} - -device::VirtualDevice* -Device::createVirtualDevice(amd::CommandQueue* queue) -{ - bool profiling = (queue != NULL) && - queue->properties().test(CL_QUEUE_PROFILING_ENABLE); - - // Initialization of heap and other resources occur during the command - // queue creation time. - VirtualGPU *virtualDevice = new VirtualGPU(*this); - - if (!virtualDevice->create(profiling)) { - delete virtualDevice; - return NULL; - } - - if(profiling) { - hsa_amd_profiling_set_profiler_enabled(virtualDevice->gpu_queue(), 1); - } - - return virtualDevice; -} - -bool -Device::globalFreeMemory(size_t *freeMemory) const -{ - return false; -} - -bool -Device::bindExternalDevice( - uint flags, - void* const gfxDevice[], - void* gfxContext, - bool validateOnly) -{ -#if defined(_WIN32) - return false; -#else - if((flags&amd::Context::GLDeviceKhr)==0) - return false; - - MesaInterop::MESA_INTEROP_KIND kind=MesaInterop::MESA_INTEROP_NONE; - MesaInterop::DisplayHandle display; - MesaInterop::ContextHandle context; - - if((flags&amd::Context::EGLDeviceKhr)!=0) - { - kind=MesaInterop::MESA_INTEROP_EGL; - display.eglDisplay=reinterpret_cast(gfxDevice[amd::Context::GLDeviceKhrIdx]); - context.eglContext=reinterpret_cast(gfxContext); - } - else - { - kind=MesaInterop::MESA_INTEROP_GLX; - display.glxDisplay=reinterpret_cast(gfxDevice[amd::Context::GLDeviceKhrIdx]); - context.glxContext=reinterpret_cast(gfxContext); - } - - mesa_glinterop_device_info info; - info.size=sizeof(mesa_glinterop_device_info); - MesaInterop 
temp; - if(!temp.Bind(kind, display, context)) - { - assert(false && "Failed mesa interop bind."); - return false; - } - - if(!temp.GetInfo(info)) - { - assert(false && "Failed to get mesa interop device info."); - return false; - } - - bool match=true; - match &= info_.deviceTopology_.pcie.bus==info.pci_bus; - match &= info_.deviceTopology_.pcie.device==info.pci_device; - match &= info_.deviceTopology_.pcie.function==info.pci_function; - match &= info_.vendorId_==info.vendor_id; - match &= deviceInfo_.pciDeviceId==info.device_id; - - if(!validateOnly) - mesa_=temp; - - return match; -#endif -} - -bool -Device::unbindExternalDevice( - uint flags, - void* const gfxDevice[], - void* gfxContext, - bool validateOnly) -{ -#if defined(_WIN32) - return false; -#else - if ((flags&amd::Context::GLDeviceKhr)==0) - return false; - if(!validateOnly) - mesa_.Unbind(); - return true; -#endif -} - -amd::Memory* -Device::findMapTarget(size_t size) const -{ - // Must be serialised for access - amd::ScopedLock lk(*mapCacheOps_); - - amd::Memory* map = NULL; - size_t minSize = 0; - size_t maxSize = 0; - uint mapId = mapCache_->size(); - uint releaseId = mapCache_->size(); - - // Find if the list has a map target of appropriate size - for (uint i = 0; i < mapCache_->size(); i++) { - if ((*mapCache_)[i] != NULL) { - // Requested size is smaller than the entry size - if (size < (*mapCache_)[i]->getSize()) { - if ((minSize == 0) || - (minSize > (*mapCache_)[i]->getSize())) { - minSize = (*mapCache_)[i]->getSize(); - mapId = i; - } - } - // Requeted size matches the entry size - else if (size == (*mapCache_)[i]->getSize()) { - mapId = i; - break; - } - else { - // Find the biggest map target in the list - if (maxSize < (*mapCache_)[i]->getSize()) { - maxSize = (*mapCache_)[i]->getSize(); - releaseId = i; - } - } - } - } - - // Check if we found any map target - if (mapId < mapCache_->size()) { - map = (*mapCache_)[mapId]; - (*mapCache_)[mapId] = NULL; - } - // If cache is full, then 
release the biggest map target - else if (releaseId < mapCache_->size()) { - (*mapCache_)[releaseId]->release(); - (*mapCache_)[releaseId] = NULL; - } - - return map; -} - -bool -Device::addMapTarget(amd::Memory* memory) const -{ - // Must be serialised for access - amd::ScopedLock lk(*mapCacheOps_); - - //the svm memory shouldn't be cached - if (!memory->canBeCached()) { - return false; - } - // Find if the list has a map target of appropriate size - for (uint i = 0; i < mapCache_->size(); ++i) { - if ((*mapCache_)[i] == NULL) { - (*mapCache_)[i] = memory; - return true; - } - } - - // Add a new entry - mapCache_->push_back(memory); - - return true; -} - -device::Memory* -Device::createMemory(amd::Memory &owner) const -{ - roc::Memory* memory = NULL; - if (owner.asBuffer()) { - memory = new roc::Buffer(*this, owner); - } - else if (owner.asImage()) { - memory = new roc::Image(*this, owner); - } - else { - LogError("Unknown memory type"); - } - - if (memory == NULL) { - return NULL; - } - - bool result = memory->create(); - - if (!result) { - LogError("Failed creating memory"); - delete memory; - return NULL; - } - - if (!memory->isHostMemDirectAccess() && owner.asImage() && - owner.parent() == NULL && - (owner.getMemFlags() & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR))) { - // To avoid recurssive call to Device::createMemory, we perform - // data transfer to the view of the image. 
- amd::Image* imageView = owner.asImage()->createView( - owner.getContext(), owner.asImage()->getImageFormat(), xferQueue()); - - if (imageView == NULL) { - LogError("[OCL] Fail to allocate view of image object"); - return NULL; - } - - Image* devImageView = - new roc::Image(static_cast(*this), *imageView); - if (devImageView == NULL) { - LogError("[OCL] Fail to allocate device mem object for the view"); - imageView->release(); - return NULL; - } - - if (devImageView != NULL && - !devImageView->createView(static_cast(*memory))) { - LogError("[OCL] Fail to create device mem object for the view"); - delete devImageView; - imageView->release(); - return NULL; - } - - imageView->replaceDeviceMemory(this, devImageView); - - result = xferMgr().writeImage(owner.getHostMem(), *devImageView, - amd::Coord3D(0), imageView->getRegion(), - imageView->getRowPitch(), - imageView->getSlicePitch(), true); - - imageView->release(); - } - - if (!result) { - delete memory; - return NULL; - } - - return memory; -} - -void* -Device::hostAlloc(size_t size, size_t alignment, bool atomics) const { - void* ptr = NULL; - const hsa_amd_memory_pool_t segment = - (!atomics) - ? (system_coarse_segment_.handle != 0) ? 
system_coarse_segment_ - : system_segment_ - : system_segment_; - assert(segment.handle != 0); - hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr); - if (stat != HSA_STATUS_SUCCESS) { - LogError("Fail allocation host memory"); - return NULL; - } - - stat = hsa_amd_agents_allow_access(gpu_agents_.size(), &gpu_agents_[0], - NULL, ptr); - if (stat != HSA_STATUS_SUCCESS) { - LogError("Fail hsa_amd_agents_allow_access"); - return NULL; - } - - return ptr; -} - -void -Device::hostFree(void* ptr, size_t size) const -{ - hsa_status_t stat = - hsa_amd_memory_pool_free(ptr); - if (stat != HSA_STATUS_SUCCESS) { - LogError("Fail freeing host memory"); - assert(stat == HSA_STATUS_SUCCESS); - } -} - -void * -Device::deviceLocalAlloc(size_t size) const -{ - if (gpuvm_segment_.handle == 0 || gpuvm_segment_max_alloc_ == 0) { - return NULL; - } - - void *ptr = NULL; - hsa_status_t stat = - hsa_amd_memory_pool_allocate(gpuvm_segment_, size, 0, &ptr); - if (stat != HSA_STATUS_SUCCESS) { - LogError("Fail allocation local memory"); - return NULL; - } - - stat = hsa_memory_assign_agent(ptr, _bkendDevice, HSA_ACCESS_PERMISSION_RW); - if (stat != HSA_STATUS_SUCCESS) { - LogError("Fail assigning local memory to agent"); - deviceLocalFree(ptr, size); - return NULL; - } - - return ptr; -} - -void -Device::deviceLocalFree(void *ptr, size_t size) const -{ - hsa_status_t stat = - hsa_amd_memory_pool_free(ptr); - if (stat != HSA_STATUS_SUCCESS) { - LogError("Fail freeing local memory"); - } -} - -void* -Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const -{ - amd::Memory* mem = NULL; - if (NULL == svmPtr) { - bool atomics = (flags & CL_MEM_SVM_ATOMICS) != 0; - void* ptr = hostAlloc(size, alignment, atomics); - - if (ptr != NULL) { - // Copy paste from ORCA code. 
- // create a hidden buffer, which will allocated on the device later - mem = new (context) - amd::Buffer(context, CL_MEM_USE_HOST_PTR, size, ptr); - if (mem == NULL) { - LogError("failed to create a svm mem object!"); - return NULL; - } - - if (!mem->create(ptr)) { - LogError("failed to create a svm hidden buffer!"); - mem->release(); - return NULL; - } - - // add the information to context so that we can use it later. - amd::SvmManager::AddSvmBuffer(ptr, mem); - - return ptr; - } - else { - return NULL; - } - } else { - // Copy paste from ORCA code. - // Find the existing amd::mem object - mem = amd::SvmManager::FindSvmBuffer(svmPtr); - - if (NULL == mem) { - return NULL; - } - - return svmPtr; - } -} - -void -Device::svmFree(void* ptr) const -{ - amd::Memory * svmMem = NULL; - svmMem = amd::SvmManager::FindSvmBuffer(ptr); - if (NULL != svmMem) { - svmMem->release(); - amd::SvmManager::RemoveSvmBuffer(ptr); - hostFree(ptr); - } -} - -VirtualGPU* -Device::xferQueue() const -{ - if (!xferQueue_) { - // Create virtual device for internal memory transfer - Device* thisDevice = const_cast(this); - thisDevice->xferQueue_ = reinterpret_cast( - thisDevice->createVirtualDevice()); - if (!xferQueue_) { - LogError("Couldn't create the device transfer manager!"); - } - } - return xferQueue_; -} - -} -#endif // WITHOUT_HSA_BACKEND diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp deleted file mode 100644 index dcc2aa07ad..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp +++ /dev/null @@ -1,376 +0,0 @@ -// -// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved. 
//
#pragma once

#ifndef WITHOUT_HSA_BACKEND

#include "top.hpp"
#include "CL/cl.h"
#include "device/device.hpp"
#include "platform/command.hpp"
#include "platform/program.hpp"
#include "platform/perfctr.hpp"
#include "platform/memory.hpp"
#include "utils/concurrent.hpp"
#include "thread/thread.hpp"
#include "thread/monitor.hpp"
#include "utils/versions.hpp"
#include "aclTypes.h"

#include "device/rocm/rocsettings.hpp"
#include "device/rocm/rocvirtual.hpp"
#include "device/rocm/rocdefs.hpp"
#include "device/rocm/rocprintf.hpp"
#include "device/rocm/rocglinterop.hpp"

#include "hsa.h"
#include "hsa_ext_image.h"
#include "hsa_ext_finalize.h"
#include "hsa_ext_amd.h"

// NOTE(review): the header names of the next two includes are missing in this
// copy of the file -- presumably standard library headers; confirm against
// the repository.
#include 
#include 

// extern hsa::Runtime* g_hsaruntime;

/*! \addtogroup HSA
 * @{
 */

//! HSA Device Implementation
namespace roc {

/**
 * @brief List of environment variables that could be used to
 * configure the behavior of Hsa Runtime
 */
#define ENVVAR_HSA_POLL_KERNEL_COMPLETION "HSA_POLL_COMPLETION"

//! Forward declarations
class Command;
class Device;
class GpuCommand;
class Heap;
class HeapBlock;
class Program;
class Kernel;
class Memory;
class Resource;
class VirtualDevice;
class PrintfDbg;

//! A NULL Device type used only for offline compilation.
//! Only functions that are used for compilation are implemented; every other
//! virtual calls ShouldNotReachHere() and returns a failure/placeholder value.
class NullDevice : public amd::Device {
public:
    //! Constructor
    NullDevice(){};

    //! Create the device from a device-info table entry
    bool create(const AMDDeviceInfo& deviceInfo);

    //! Initialise all the offline devices that can be used for compilation
    static bool init();
    //! Teardown for offline devices
    static void tearDown();

    //! Destructor for the Null device
    virtual ~NullDevice();

    //! Returns the compiler handle shared by all devices (static member)
    aclCompiler *compiler() const { return compilerHandle_; }

    //! Construct an HSAIL program object from the ELF assuming it is valid
    virtual device::Program *createProgram(amd::option::Options* options = NULL);
    //! Returns the device-info table entry describing this device
    const AMDDeviceInfo& deviceInfo() const {
        return deviceInfo_;
    }
    //! Gets the backend device for the NULL device type.
    //! Must not be called; returns a zero-initialised agent handle.
    virtual hsa_agent_t getBackendDevice() const {
        ShouldNotReachHere();
        const hsa_agent_t kInvalidAgent = { 0 };
        return kInvalidAgent;
    }

    // List of dummy functions which are disabled for NullDevice

    //! Create sub-devices according to the given partition scheme.
    virtual cl_int createSubDevices(
        device::CreateSubDevicesInfo& create_info,
        cl_uint num_entries,
        cl_device_id* devices,
        cl_uint* num_devices) {
            ShouldNotReachHere();
            return CL_INVALID_VALUE; };

    //! Create a new virtual device environment.
    virtual device::VirtualDevice* createVirtualDevice(
        amd::CommandQueue* queue = NULL) {
        ShouldNotReachHere();
        return NULL;
    }

    //! Disabled for the dummy device
    virtual bool registerSvmMemory(void* ptr, size_t size) const {
        ShouldNotReachHere();
        return false;
    }

    //! Disabled for the dummy device
    virtual void deregisterSvmMemory(void* ptr) const {
        ShouldNotReachHere();
    }

    //! Just returns NULL for the dummy device
    virtual device::Memory* createMemory(amd::Memory& owner) const {
        ShouldNotReachHere();
        return NULL; }

    //! Sampler object allocation.
    //! NOTE(review): unlike the other disabled functions this one returns
    //! true after ShouldNotReachHere() -- confirm that is intended.
    virtual bool createSampler(
        const amd::Sampler& owner, //!< abstraction layer sampler object
        device::Sampler** sampler //!< device sampler object
        ) const
    {
        ShouldNotReachHere();
        return true;
    }

    //! Just returns NULL for the dummy device
    virtual device::Memory* createView(
        amd::Memory& owner, //!< Owner memory object
        const device::Memory& parent //!< Parent device memory object for the view
        ) const {
        ShouldNotReachHere();
        return NULL;
    }

    //! Just returns NULL for the dummy device
    virtual void* svmAlloc(
        amd::Context& context, //!< The context used to create a buffer
        size_t size, //!< size of svm spaces
        size_t alignment, //!< alignment requirement of svm spaces
        cl_svm_mem_flags flags, //!< flags of creation svm spaces
        void* svmPtr //!< existing svm pointer for mGPU case
        ) const {
        ShouldNotReachHere();
        return NULL;
    }

    //! Does nothing for the dummy device
    virtual void svmFree(
        void* ptr //!< svm pointer needed to be freed
        ) const {
        ShouldNotReachHere();
        return;
    }

    //! Reallocates the provided buffer object
    virtual bool reallocMemory(amd::Memory& owner) const {
        ShouldNotReachHere();
        return false;
    }

    //! Acquire external graphics API object in the host thread
    //! Needed for OpenGL objects on CPU device

    virtual bool bindExternalDevice(
        uint flags, void* const pDevice[], void* pContext, bool validateOnly) {
        ShouldNotReachHere();
        return false;
    }

    //! Counterpart of bindExternalDevice(); disabled for the dummy device
    virtual bool unbindExternalDevice(
        uint flags, void* const pDevice[], void* pContext, bool validateOnly) {
        ShouldNotReachHere();
        return false;
    }

    //! Releases non-blocking map target memory
    virtual void freeMapTarget(amd::Memory& mem, void* target) { ShouldNotReachHere();}

    //! Empty implementation on Null device
    virtual bool globalFreeMemory(size_t* freeMemory) const {
        ShouldNotReachHere();
        return false;
    }

protected:
    //! Initialize compiler instance and handle
    static bool initCompiler(bool isOffline);
    //! Destroy compiler instance and handle
    static bool destroyCompiler();
    //! Handle to the compiler
    static aclCompiler* compilerHandle_;
    //! Device Id for an HsaDevice
    AMDDeviceInfo deviceInfo_;
private:
    //! Value passed to initCompiler() by NullDevice::init()
    static const bool offlineDevice_;
};

//! A HSA device ordinal (physical HSA device)
class Device : public NullDevice {
public:
    //! Initialise the whole HSA device subsystem (CAL init, device enumeration, etc).
    static bool init();
    //! Tear the HSA device subsystem down again
    static void tearDown();

    //! Lookup all AMD HSA devices and memory regions.
    static hsa_status_t iterateAgentCallback(hsa_agent_t agent, void *data);
    static hsa_status_t iterateGpuMemoryPoolCallback(
        hsa_amd_memory_pool_t region, void* data);
    static hsa_status_t iterateCpuMemoryPoolCallback(
        hsa_amd_memory_pool_t region, void* data);

    static bool loadHsaModules();

    //! Create the per-device helper objects
    bool create();

    //! Construct a new physical HSA device
    Device(hsa_agent_t bkendDevice);
    //! Returns the HSA agent backing this device
    virtual hsa_agent_t getBackendDevice() const { return _bkendDevice; }

    //! Returns the list of GPU agents shared by all devices (static member).
    //! NOTE(review): the element type of the vector is missing in this copy
    //! of the file -- presumably hsa_agent_t; confirm against the repository.
    static const std::vector& getGpuAgents() {
        return gpu_agents_;
    }

    //! Returns the CPU agent shared by all devices (static member)
    static hsa_agent_t getCpuAgent()
    {
        return cpu_agent_;
    }

    //! Destructor for the physical HSA device
    virtual ~Device();

    //! Set up settings and OpenCL device constants for the given HSA agent
    bool mapHSADeviceToOpenCLDevice(hsa_agent_t hsadevice);

    // Temporary, delete it later when HSA Runtime and KFD is fully functional.
    void fake_device();

    ///////////////////////////////////////////////////////////////////////////////
    // TODO: Below are all mocked up virtual functions from amd::Device, they may
    // need real implementation.
    ///////////////////////////////////////////////////////////////////////////////

// #ifdef cl_ext_device_fission
    //! Create sub-devices according to the given partition scheme.
    //! Not supported: always returns CL_INVALID_VALUE.
    virtual cl_int createSubDevices(
        device::CreateSubDevicesInfo &create_inf,
        cl_uint num_entries,
        cl_device_id *devices,
        cl_uint *num_devices)
    { return CL_INVALID_VALUE; }
// #endif // cl_ext_device_fission

    // bool Device::create(CALuint ordinal);

    //! Instantiate a new virtual device
    virtual device::VirtualDevice *createVirtualDevice(
        amd::CommandQueue* queue = NULL);

    //! Construct an HSAIL program object from the ELF assuming it is valid
    virtual device::Program *createProgram(amd::option::Options* options = NULL);

    //! Create the device memory object backing the given memory object
    virtual device::Memory *createMemory(amd::Memory &owner) const;

    //! Sampler object allocation.
    //! Currently a stub: stores NULL in *sampler and reports success.
    virtual bool createSampler(
        const amd::Sampler& owner, //!< abstraction layer sampler object
        device::Sampler** sampler //!< device sampler object
        ) const
    {
        //! \todo HSA team has to implement sampler allocation
        *sampler = NULL;
        return true;
    }


    //! Views are not implemented here: always returns NULL
    virtual device::Memory *createView(
        amd::Memory &owner, //!< Owner memory object
        const device::Memory &parent //!< Parent device memory object for the view
        ) const { return NULL; }

    //! Reallocates the provided buffer object (stub: always reports success)
    virtual bool reallocMemory(amd::Memory &owner) const {return true; }

    //! Acquire external graphics API object in the host thread
    //! Needed for OpenGL objects on CPU device
    virtual bool bindExternalDevice(
        uint flags, void * const pDevice[], void *pContext, bool validateOnly);

    /**
     * @brief Removes the external device as an available device.
     *
     * @note: The current implementation is to avoid build break
     * and does not represent actual / correct implementation. This
     * needs to be done.
     */
    bool unbindExternalDevice(
        uint flags, //!< Enum val. for ext.API type: GL, D3D10, etc.
        void * const gfxDevice[], //!< D3D device for D3D, HDC/Display handle of X Window for GL
        void *gfxContext, //!< HGLRC/GLXContext handle
        bool validateOnly //!< Only validate if the device can inter-operate with
                          //!< pDevice/pContext, do not bind.
        );

    //! Gets free memory on a GPU device
    virtual bool globalFreeMemory(size_t *freeMemory) const;

    //! Allocate host memory that the GPU agents may access
    virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;

    //! Free memory obtained from hostAlloc()
    virtual void hostFree(void* ptr, size_t size = 0) const;

    //! Allocate memory from the device-local pool
    void *deviceLocalAlloc(size_t size) const;

    //! Free memory obtained from deviceLocalAlloc()
    void deviceLocalFree(void *ptr, size_t size) const;

    //! Allocate an SVM region, or validate an existing one when svmPtr is given
    virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags = CL_MEM_READ_WRITE, void* svmPtr = NULL) const;

    //! Free an SVM region obtained from svmAlloc()
    virtual void svmFree(void* ptr) const;

    //! Returns the device settings.
    //! NOTE(review): the target type of the cast is missing in this copy of
    //! the file -- presumably a reference to Settings; confirm.
    const Settings &settings() const { return reinterpret_cast(*settings_); }

    //! Returns transfer engine object
    const device::BlitManager& xferMgr() const { return xferQueue()->blitMgr(); }

    //! Returns the allocation granularity queried from the memory pool
    const size_t alloc_granularity() const { return alloc_granularity_; }

    //! Returns the HSA profile reported by the agent
    const hsa_profile_t agent_profile() const { return agent_profile_; }

    //! Returns the Mesa GL interop binding
    const MesaInterop& mesa() const { return mesa_; }

    //! Finds an appropriate map target
    amd::Memory* findMapTarget(size_t size) const;

    //! Adds a map target to the cache
    bool addMapTarget(amd::Memory* memory) const;

private:
    amd::Monitor* mapCacheOps_; //!< Lock to serialise cache for the map resources
    std::vector* mapCache_; //!< Map cache info structure

    //! Fill in the OpenCL device info from HSA queries
    bool populateOCLDeviceConstants();
    static bool isHsaInitialized_;
    static hsa_agent_t cpu_agent_; //!< CPU agent, shared by all devices
    static std::vector gpu_agents_; //!< GPU agents, shared by all devices
    MesaInterop mesa_; //!< Mesa GL interop binding
    hsa_agent_t _bkendDevice; //!< HSA agent backing this device
    hsa_profile_t agent_profile_; //!< HSA profile reported by the agent
    hsa_amd_memory_pool_t group_segment_; //!< Group segment pool of the GPU agent
    hsa_amd_memory_pool_t system_segment_; //!< Fine-grained global pool of the CPU agent
    hsa_amd_memory_pool_t system_coarse_segment_; //!< Other (non fine-grained) global pool of the CPU agent
    hsa_amd_memory_pool_t gpuvm_segment_; //!< Global pool of the GPU agent
    size_t gpuvm_segment_max_alloc_; //!< Largest single allocation from gpuvm_segment_
    size_t alloc_granularity_; //!< Allocation granule queried from the memory pool
    static const bool offlineDevice_;
    amd::Context *context_; //!< A dummy context for internal data transfer
    VirtualGPU *xferQueue_; //!< Transfer queue, created on demand

    //! Returns the transfer queue, creating it on first use
    VirtualGPU* xferQueue() const;
}; // class roc::Device
} // namespace roc

/**
 * @}
 */
#endif /*WITHOUT_HSA_BACKEND*/

diff --git a/projects/clr/rocclr/runtime/device/rocm/rocglinterop.cpp b/projects/clr/rocclr/runtime/device/rocm/rocglinterop.cpp
deleted file mode 100644
index 0db28ab973..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rocglinterop.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
//
// Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
-// - -#ifndef WITHOUT_HSA_BACKEND - -#include "os/os.hpp" -#include "utils/debug.hpp" -#include "utils/flags.hpp" -#include "device/rocm/rocglinterop.hpp" - -#if !defined(_WIN32) -#include -#endif - -namespace roc -{ - -#if !defined(_WIN32) -static PFNMESAGLINTEROPGLXQUERYDEVICEINFOPROC GlxInfo = nullptr; -static PFNMESAGLINTEROPGLXEXPORTOBJECTPROC GlxExport = nullptr; -static PFNMESAGLINTEROPEGLQUERYDEVICEINFOPROC EglInfo = nullptr; -static PFNMESAGLINTEROPEGLEXPORTOBJECTPROC EglExport = nullptr; -#endif - -std::atomic MesaInterop::refCount(0); - -bool MesaInterop::Supported() -{ -#ifdef _WIN32 - return false; -#else - return true; -#endif -} - -//Attempt to locate Mesa interop APIs. Return which of glx/egl are supported. -bool MesaInterop::Bind(MESA_INTEROP_KIND Kind, const DisplayHandle& Display, const ContextHandle& Context) -{ -#if defined(_WIN32) - return false; -#else - if(Kind==MESA_INTEROP_NONE) - return false; - - if(kind!=MESA_INTEROP_NONE) - { - LogError("Error - MesaInterop Bind while already bound."); - return false; - } - - void* glxinfo=dlsym(RTLD_DEFAULT, "MesaGLInteropGLXQueryDeviceInfo"); - void* eglinfo=dlsym(RTLD_DEFAULT, "MesaGLInteropEGLQueryDeviceInfo"); - - if(((glxinfo!=GlxInfo) || (eglinfo!=EglInfo)) && (refCount!=0)) - LogWarning("Warning - Mesa changed while holding interop contexts."); - - GlxInfo=(PFNMESAGLINTEROPGLXQUERYDEVICEINFOPROC)glxinfo; - EglInfo=(PFNMESAGLINTEROPEGLQUERYDEVICEINFOPROC)eglinfo; - - GlxExport=(PFNMESAGLINTEROPGLXEXPORTOBJECTPROC)dlsym(RTLD_DEFAULT, "MesaGLInteropGLXExportObject"); - EglExport=(PFNMESAGLINTEROPEGLEXPORTOBJECTPROC)dlsym(RTLD_DEFAULT, "MesaGLInteropEGLExportObject"); - - uint32_t ret=MESA_INTEROP_NONE; - if(GlxInfo && GlxExport) - ret|=MESA_INTEROP_GLX; - if(EglInfo && EglExport) - ret|=MESA_INTEROP_EGL; - - kind = MESA_INTEROP_KIND(ret & Kind); - display=Display; - context=Context; - - if(kind!=MESA_INTEROP_NONE) - { - refCount++; - return true; - } - return false; - -#endif -} - -bool 
MesaInterop::GetInfo(mesa_glinterop_device_info& info) const -{ -#ifdef _WIN32 - return false; -#else - switch(kind) - { - case MESA_INTEROP_GLX: - return GlxInfo(display.glxDisplay, context.glxContext, &info)==MESA_GLINTEROP_SUCCESS; - case MESA_INTEROP_EGL: - return EglInfo(display.eglDisplay, context.eglContext, &info)==MESA_GLINTEROP_SUCCESS; - default: - return false; - } -#endif -} - -bool MesaInterop::Export (mesa_glinterop_export_in& in, mesa_glinterop_export_out& out) const -{ -#ifdef _WIN32 - return false; -#else - switch(kind) - { - case MESA_INTEROP_GLX: - return GlxExport(display.glxDisplay, context.glxContext, &in, &out)==MESA_GLINTEROP_SUCCESS; - case MESA_INTEROP_EGL: - return EglExport(display.eglDisplay, context.eglContext, &in, &out)==MESA_GLINTEROP_SUCCESS; - default: - return false; - } -#endif -} - -} - -#endif // WITHOUT_HSA_BACKEND diff --git a/projects/clr/rocclr/runtime/device/rocm/rocglinterop.hpp b/projects/clr/rocclr/runtime/device/rocm/rocglinterop.hpp deleted file mode 100644 index 219baa8cdd..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/rocglinterop.hpp +++ /dev/null @@ -1,158 +0,0 @@ -// -// Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. 
-// -#pragma once - -#ifndef WITHOUT_HSA_BACKEND - -#ifdef _WIN32 -//GLX header cannot be included in Windows due to X11 header dependency -#define MESA_GLINTEROP_NO_GLX -#include "device/rocm/mesa_glinterop.h" -//Give GLX parameters void* size -typedef void Display; -typedef void* GLXContext; -#undef MESA_GLINTEROP_NO_GLX -#else -#include "device/rocm/mesa_glinterop.h" -#endif - -#include "device/rocm/rocregisters.hpp" -#include "hsa_ext_amd.h" - -#include - -namespace roc -{ - - //Specific typed container for version 1 - typedef struct metadata_amd_ci_vi_s { - uint32_t version; // Must be 1 - uint32_t vendorID; // AMD | CZ - SQ_IMG_RSRC_WORD0 word0; - SQ_IMG_RSRC_WORD1 word1; - SQ_IMG_RSRC_WORD2 word2; - SQ_IMG_RSRC_WORD3 word3; - SQ_IMG_RSRC_WORD4 word4; - SQ_IMG_RSRC_WORD5 word5; - SQ_IMG_RSRC_WORD6 word6; - SQ_IMG_RSRC_WORD7 word7; - uint32_t mip_offsets[0]; //Mip level offset bits [39:8] for each level (if any) - } metadata_amd_ci_vi_t; - - class image_metadata - { - private: - metadata_amd_ci_vi_t* data; - - image_metadata(const image_metadata&)=delete; - image_metadata& operator=(const image_metadata&)=delete; - - public: - image_metadata() : data(nullptr) {} - ~image_metadata() { data=nullptr; } - - bool create(hsa_amd_image_descriptor_t* image_desc) - { - if((image_desc->version!=1) || ((image_desc->deviceID>>16)!=0x1002)) return false; - data=reinterpret_cast(image_desc); - return true; - } - - bool setMipLevel(uint32_t level) - { - if(level>data->word3.bits.last_level) - return false; - data->word3.bits.base_level=level; - data->word3.bits.last_level=level; - return true; - } - - bool setLayer(uint32_t layer) - { - data->word3.bits.type=SQ_RSRC_IMG_2D_ARRAY; - data->word5.bits.last_array=layer; - data->word5.bits.base_array=layer; - return true; - } - - bool setFace(GLenum face) - { - int index=face-GL_TEXTURE_CUBE_MAP_POSITIVE_X; - if(index<0 || index>5) - return false; - if(data->word3.bits.type!=SQ_RSRC_IMG_CUBE) - return false; - return 
setLayer(index); - } - }; - - class MesaInterop - { - public: - - enum MESA_INTEROP_KIND { MESA_INTEROP_NONE=0, MESA_INTEROP_GLX=1, MESA_INTEROP_EGL=2 }; - - union DisplayHandle - { - Display* glxDisplay; - EGLDisplay eglDisplay; - }; - - union ContextHandle - { - GLXContext glxContext; - EGLContext eglContext; - }; - - //True if the configuration supports the indicated interop ability. - static bool Supported(); - - MesaInterop() { kind=MESA_INTEROP_NONE; } - MesaInterop(const MesaInterop& rhs) { *this=rhs; } - ~MesaInterop() { Unbind(); } - - const MesaInterop& operator=(const MesaInterop& rhs) - { - display=rhs.display; - context=rhs.context; - kind=rhs.kind; - if(kind!=MESA_INTEROP_NONE) - refCount++; - return *this; - } - - /* - Loads Mesa interop APIs and sets this interface object to use the indicated - subsystem (GLX/EGL). Returns true if the required subsystem is found. - */ - bool Bind(MESA_INTEROP_KIND Kind, const DisplayHandle& Display, const ContextHandle& Context); - - /* - Releases use of Mesa interop APIs. - Used to check for bad load/unload sequences. - */ - void Unbind() - { - if(kind==MESA_INTEROP_NONE) return; - assert(refCount>0 && "Invalid refCount in MesaInterop."); - refCount--; - kind=MESA_INTEROP_NONE; - } - - bool GetInfo(mesa_glinterop_device_info& info) const; - - bool Export (mesa_glinterop_export_in& in, mesa_glinterop_export_out& out) const; - - private: - static std::atomic refCount; - - DisplayHandle display; - ContextHandle context; - MESA_INTEROP_KIND kind; - }; - -} - -#endif /*WITHOUT_HSA_BACKEND*/ - diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp deleted file mode 100644 index 1c08135897..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp +++ /dev/null @@ -1,680 +0,0 @@ -// -// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved. 
-// - -#include "rockernel.hpp" -#include "SCHSAInterface.h" -#include "amd_hsa_kernel_code.h" - -#include - -#ifndef WITHOUT_HSA_BACKEND - -namespace roc { - -inline static HSAIL_ARG_TYPE -GetHSAILArgType(const aclArgData* argInfo) -{ - switch (argInfo->type) { - case ARG_TYPE_POINTER: - return HSAIL_ARGTYPE_POINTER; - case ARG_TYPE_VALUE: - return HSAIL_ARGTYPE_VALUE; - case ARG_TYPE_IMAGE: - return HSAIL_ARGTYPE_IMAGE; - case ARG_TYPE_SAMPLER: - return HSAIL_ARGTYPE_SAMPLER; - case ARG_TYPE_ERROR: - default: - return HSAIL_ARGTYPE_ERROR; - } -} - -inline static size_t -GetHSAILArgAlignment(const aclArgData* argInfo) -{ - switch (argInfo->type) { - case ARG_TYPE_POINTER: - return argInfo->arg.pointer.align; - default: - return 1; - } -} - -inline static HSAIL_ACCESS_TYPE -GetHSAILArgAccessType(const aclArgData* argInfo) -{ - if (argInfo->type == ARG_TYPE_POINTER) { - switch (argInfo->arg.pointer.type) { - case ACCESS_TYPE_RO: - return HSAIL_ACCESS_TYPE_RO; - case ACCESS_TYPE_WO: - return HSAIL_ACCESS_TYPE_WO; - case ACCESS_TYPE_RW: - default: - return HSAIL_ACCESS_TYPE_RW; - } - } - return HSAIL_ACCESS_TYPE_NONE; -} - -inline static HSAIL_ADDRESS_QUALIFIER -GetHSAILAddrQual(const aclArgData* argInfo) -{ - if (argInfo->type == ARG_TYPE_POINTER) { - switch (argInfo->arg.pointer.memory) { - case PTR_MT_CONSTANT_EMU: - case PTR_MT_CONSTANT: - case PTR_MT_UAV: - case PTR_MT_GLOBAL: - return HSAIL_ADDRESS_GLOBAL; - case PTR_MT_LDS_EMU: - case PTR_MT_LDS: - return HSAIL_ADDRESS_LOCAL; - case PTR_MT_ERROR: - default: - LogError("Unsupported address type"); - return HSAIL_ADDRESS_ERROR; - } - } - else if ((argInfo->type == ARG_TYPE_IMAGE) || - (argInfo->type == ARG_TYPE_SAMPLER)) { - return HSAIL_ADDRESS_GLOBAL; - } - return HSAIL_ADDRESS_ERROR; -} - -/* f16 returns f32 - workaround due to comp lib */ -inline static HSAIL_DATA_TYPE -GetHSAILDataType(const aclArgData* argInfo) -{ - aclArgDataType dataType; - - if (argInfo->type == ARG_TYPE_POINTER) { - dataType = 
argInfo->arg.pointer.data; - } - else if (argInfo->type == ARG_TYPE_VALUE) { - dataType = argInfo->arg.value.data; - } - else { - return HSAIL_DATATYPE_ERROR; - } - switch (dataType) { - case DATATYPE_i1: - return HSAIL_DATATYPE_B1; - case DATATYPE_i8: - return HSAIL_DATATYPE_S8; - case DATATYPE_i16: - return HSAIL_DATATYPE_S16; - case DATATYPE_i32: - return HSAIL_DATATYPE_S32; - case DATATYPE_i64: - return HSAIL_DATATYPE_S64; - case DATATYPE_u8: - return HSAIL_DATATYPE_U8; - case DATATYPE_u16: - return HSAIL_DATATYPE_U16; - case DATATYPE_u32: - return HSAIL_DATATYPE_U32; - case DATATYPE_u64: - return HSAIL_DATATYPE_U64; - case DATATYPE_f16: - return HSAIL_DATATYPE_F32; - case DATATYPE_f32: - return HSAIL_DATATYPE_F32; - case DATATYPE_f64: - return HSAIL_DATATYPE_F64; - case DATATYPE_struct: - return HSAIL_DATATYPE_STRUCT; - case DATATYPE_opaque: - return HSAIL_DATATYPE_OPAQUE; - case DATATYPE_ERROR: - default: - return HSAIL_DATATYPE_ERROR; - } -} - -// returns size in number of bytes -inline static int -GetHSAILArgSize(const aclArgData *argInfo) -{ - switch (argInfo->type) { - case ARG_TYPE_VALUE: - switch (GetHSAILDataType(argInfo)) { - case HSAIL_DATATYPE_B1: - return 1; - case HSAIL_DATATYPE_B8: - case HSAIL_DATATYPE_S8: - case HSAIL_DATATYPE_U8: - return 1; - case HSAIL_DATATYPE_B16: - case HSAIL_DATATYPE_U16: - case HSAIL_DATATYPE_S16: - case HSAIL_DATATYPE_F16: - return 2; - case HSAIL_DATATYPE_B32: - case HSAIL_DATATYPE_U32: - case HSAIL_DATATYPE_S32: - case HSAIL_DATATYPE_F32: - return 4; - case HSAIL_DATATYPE_B64: - case HSAIL_DATATYPE_U64: - case HSAIL_DATATYPE_S64: - case HSAIL_DATATYPE_F64: - return 8; - case HSAIL_DATATYPE_STRUCT: - return argInfo->arg.value.numElements; - default: - return -1; - } - case ARG_TYPE_POINTER: - case ARG_TYPE_IMAGE: - case ARG_TYPE_SAMPLER: - return sizeof(void*); - default: - return -1; - } -} - -inline static clk_value_type_t -GetOclType(const aclArgData* argInfo) -{ - static const clk_value_type_t 
ClkValueMapType[6][6] = { - { T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 }, - { T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 }, - { T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 }, - { T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 }, - { T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 }, - { T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 }, - }; - - uint sizeType; - if ((argInfo->type == ARG_TYPE_POINTER) || (argInfo->type == ARG_TYPE_IMAGE)) { - return T_POINTER; - } - else if (argInfo->type == ARG_TYPE_VALUE) { - switch (argInfo->arg.value.data) { - case DATATYPE_i8: - case DATATYPE_u8: - sizeType = 0; - break; - case DATATYPE_i16: - case DATATYPE_u16: - sizeType = 1; - break; - case DATATYPE_i32: - case DATATYPE_u32: - sizeType = 2; - break; - case DATATYPE_i64: - case DATATYPE_u64: - sizeType = 3; - break; - case DATATYPE_f16: - case DATATYPE_f32: - sizeType = 4; - break; - case DATATYPE_f64: - sizeType = 5; - break; - default: - return T_VOID; - } - switch (argInfo->arg.value.numElements) { - case 1: return ClkValueMapType[sizeType][0]; - case 2: return ClkValueMapType[sizeType][1]; - case 3: return ClkValueMapType[sizeType][2]; - case 4: return ClkValueMapType[sizeType][3]; - case 8: return ClkValueMapType[sizeType][4]; - case 16: return ClkValueMapType[sizeType][5]; - default: return T_VOID; - } - } - else if (argInfo->type == ARG_TYPE_SAMPLER) { - return T_SAMPLER; - } - else { - return T_VOID; - } -} - -inline static cl_kernel_arg_address_qualifier -GetOclAddrQual(const aclArgData* argInfo) -{ - if (argInfo->type == ARG_TYPE_POINTER) { - switch (argInfo->arg.pointer.memory) { - case PTR_MT_UAV: - case PTR_MT_GLOBAL: - return CL_KERNEL_ARG_ADDRESS_GLOBAL; - case PTR_MT_CONSTANT: - case PTR_MT_UAV_CONSTANT: - case PTR_MT_CONSTANT_EMU: - return CL_KERNEL_ARG_ADDRESS_CONSTANT; - case PTR_MT_LDS_EMU: - case PTR_MT_LDS: - return CL_KERNEL_ARG_ADDRESS_LOCAL; - default: - return 
CL_KERNEL_ARG_ADDRESS_PRIVATE; - } - } - else if (argInfo->type == ARG_TYPE_IMAGE) { - return CL_KERNEL_ARG_ADDRESS_GLOBAL; - } - //default for all other cases - return CL_KERNEL_ARG_ADDRESS_PRIVATE; -} - -inline static cl_kernel_arg_access_qualifier -GetOclAccessQual(const aclArgData* argInfo) -{ - if (argInfo->type == ARG_TYPE_IMAGE) { - switch (argInfo->arg.image.type) { - case ACCESS_TYPE_RO: - return CL_KERNEL_ARG_ACCESS_READ_ONLY; - case ACCESS_TYPE_WO: - return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; - case ACCESS_TYPE_RW: - return CL_KERNEL_ARG_ACCESS_READ_WRITE; - default: - return CL_KERNEL_ARG_ACCESS_NONE; - } - } - return CL_KERNEL_ARG_ACCESS_NONE; -} - -inline static cl_kernel_arg_type_qualifier -GetOclTypeQual(const aclArgData* argInfo) -{ - cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; - if (argInfo->type == ARG_TYPE_POINTER) { - if (argInfo->arg.pointer.isVolatile) { - rv |= CL_KERNEL_ARG_TYPE_VOLATILE; - } - if (argInfo->arg.pointer.isRestrict) { - rv |= CL_KERNEL_ARG_TYPE_RESTRICT; - } - if (argInfo->isConst) { - rv |= CL_KERNEL_ARG_TYPE_CONST; - } - switch (argInfo->arg.pointer.memory) { - case PTR_MT_CONSTANT: - case PTR_MT_UAV_CONSTANT: - case PTR_MT_CONSTANT_EMU: - rv |= CL_KERNEL_ARG_TYPE_CONST; - break; - default: - break; - } - } - return rv; -} - -static int -GetOclSize(const aclArgData* argInfo) -{ - switch (argInfo->type) { - case ARG_TYPE_POINTER: return sizeof(void *); - case ARG_TYPE_VALUE: - switch (argInfo->arg.value.data) { - case DATATYPE_i8: - case DATATYPE_u8: - case DATATYPE_struct: - return 1 * argInfo->arg.value.numElements; - case DATATYPE_u16: - case DATATYPE_i16: - case DATATYPE_f16: - return 2 * argInfo->arg.value.numElements; - case DATATYPE_u32: - case DATATYPE_i32: - case DATATYPE_f32: - return 4 * argInfo->arg.value.numElements; - case DATATYPE_i64: - case DATATYPE_u64: - case DATATYPE_f64: - return 8 * argInfo->arg.value.numElements; - case DATATYPE_ERROR: - default: return -1; - } - case ARG_TYPE_IMAGE: return 
sizeof(cl_mem); - case ARG_TYPE_SAMPLER: return sizeof(cl_sampler); - default: return -1; - } -} - -KernelArg::KernelArg(aclArgData *argInfo) { - argInfo_ = argInfo; - name_ = argInfo_->argStr; - typeName_ = argInfo->typeStr; -} - -int KernelArg::size() { - switch (argInfo_->type) { - case ARG_TYPE_POINTER: { - return sizeof(void *); - } - case ARG_TYPE_VALUE: { - switch (argInfo_->arg.value.data) { - case DATATYPE_ERROR: { - return -1; - } - case DATATYPE_i8: - case DATATYPE_u8: - case DATATYPE_struct: { - return 1 * argInfo_->arg.value.numElements; - } - case DATATYPE_u16: - case DATATYPE_i16: - case DATATYPE_f16: { - return 2 * argInfo_->arg.value.numElements; - } - case DATATYPE_u32: - case DATATYPE_i32: - case DATATYPE_f32: { - return 4 * argInfo_->arg.value.numElements; - } - case DATATYPE_i64: - case DATATYPE_u64: - case DATATYPE_f64: { - return 8 * argInfo_->arg.value.numElements; - } - default: - return -1; - } - } - case ARG_TYPE_IMAGE: { - return sizeof(cl_mem); - } - case ARG_TYPE_SAMPLER: { - return sizeof(cl_sampler); - } - default: - return -1; - } -} - -std::string& KernelArg::name() { - return name_; -} - -std::string& KernelArg::typeName() -{ - return typeName_; -} - -void -Kernel::initArgList(const aclArgData* aclArg) -{ - // Initialize the hsail argument list too - initHsailArgs(aclArg); - - // Iterate through the arguments and insert into parameterList - device::Kernel::parameters_t params; - amd::KernelParameterDescriptor desc; - size_t offset = 0; - - // Reserved arguments for HSAIL launch - aclArg += MaxExtraArgumentsNum; - for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) { - desc.name_ = hsailArgList_[i]->name_.c_str(); - desc.type_ = GetOclType(aclArg); - desc.addressQualifier_ = GetOclAddrQual(aclArg); - desc.accessQualifier_ = GetOclAccessQual(aclArg); - desc.typeQualifier_ = GetOclTypeQual(aclArg); - desc.typeName_ = hsailArgList_[i]->typeName_.c_str(); - - // Make a check if it is local or global - if (desc.addressQualifier_ 
== CL_KERNEL_ARG_ADDRESS_LOCAL) { - desc.size_ = 0; - } - else { - desc.size_ = GetOclSize(aclArg); - } - - // Make offset alignment to match CPU metadata, since - // in multidevice config abstraction layer has a single signature - // and CPU sends the paramaters as they are allocated in memory - size_t size = desc.size_; - if (size == 0) { - // Local memory for CPU - size = sizeof(cl_mem); - } - offset = amd::alignUp(offset, std::min(size, size_t(16))); - desc.offset_ = offset; - offset += amd::alignUp(size, sizeof(uint32_t)); - params.push_back(desc); - } - createSignature(params); -} - -void -Kernel::initHsailArgs(const aclArgData* aclArg) -{ - int offset = 0; - - // Reserved arguments for HSAIL launch - aclArg += MaxExtraArgumentsNum; - - // Iterate through the each kernel argument - for (; aclArg->struct_size != 0; aclArg++) { - HsailKernelArg* arg = new HsailKernelArg; - // Initialize HSAIL kernel argument - arg->name_ = aclArg->argStr; - arg->typeName_ = aclArg->typeStr; - arg->size_ = GetHSAILArgSize(aclArg); - arg->offset_ = offset; - arg->type_ = GetHSAILArgType(aclArg); - arg->addrQual_ = GetHSAILAddrQual(aclArg); - arg->dataType_ = GetHSAILDataType(aclArg); - // If vector of args we add additional arguments to flatten it out - arg->numElem_ = ((aclArg->type == ARG_TYPE_VALUE) && - (aclArg->arg.value.data != DATATYPE_struct)) ? 
- aclArg->arg.value.numElements : 1; - arg->alignment_ = GetHSAILArgAlignment(aclArg); - arg->access_ = GetHSAILArgAccessType(aclArg); - offset += GetHSAILArgSize(aclArg); - hsailArgList_.push_back(arg); - } -} - -Kernel::Kernel(std::string name, HSAILProgram* prog, - const uint64_t& kernelCodeHandle, - const uint32_t workgroupGroupSegmentByteSize, - const uint32_t workitemPrivateSegmentByteSize, - const uint32_t kernargSegmentByteSize, - const uint32_t kernargSegmentAlignment, - uint extraArgsNum) - : device::Kernel(name), - program_(prog), - kernelCodeHandle_(kernelCodeHandle), - workgroupGroupSegmentByteSize_(workgroupGroupSegmentByteSize), - workitemPrivateSegmentByteSize_(workitemPrivateSegmentByteSize), - kernargSegmentByteSize_(kernargSegmentByteSize), - kernargSegmentAlignment_(kernargSegmentAlignment), - extraArgumentsNum_(extraArgsNum) {} - -bool Kernel::init(){ - acl_error errorCode; - //compile kernel down to ISA - hsa_agent_t hsaDevice = program_->hsaDevice(); - // Pull out metadata from the ELF - size_t sizeOfArgList; - aclCompiler* compileHandle = program_->dev().compiler(); - std::string openClKernelName("&__OpenCL_" + name() + "_kernel"); - errorCode = g_complibApi._aclQueryInfo(compileHandle, - program_->binaryElf(), - RT_ARGUMENT_ARRAY, - openClKernelName.c_str(), - NULL, - &sizeOfArgList); - if (errorCode != ACL_SUCCESS) { - return false; - } - std::unique_ptr argList(new char[sizeOfArgList]); - errorCode = g_complibApi._aclQueryInfo(compileHandle, - program_->binaryElf(), - RT_ARGUMENT_ARRAY, - openClKernelName.c_str(), - argList.get(), - &sizeOfArgList); - if (errorCode != ACL_SUCCESS) { - return false; - } - //Set the argList - initArgList((const aclArgData *) argList.get()); - //Set the workgroup information for the kernel - memset(&workGroupInfo_, 0, sizeof(workGroupInfo_)); - workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_; - assert(workGroupInfo_.availableLDSSize_ > 0); - workGroupInfo_.availableSGPRs_ = 0; - 
workGroupInfo_.availableVGPRs_ = 0; - size_t sizeOfWorkGroupSize; - errorCode = g_complibApi._aclQueryInfo(compileHandle, - program_->binaryElf(), - RT_WORK_GROUP_SIZE, - openClKernelName.c_str(), - NULL, - &sizeOfWorkGroupSize); - if (errorCode != ACL_SUCCESS) { - return false; - } - errorCode = g_complibApi._aclQueryInfo(compileHandle, - program_->binaryElf(), - RT_WORK_GROUP_SIZE, - openClKernelName.c_str(), - workGroupInfo_.compileSize_, - &sizeOfWorkGroupSize); - if (errorCode != ACL_SUCCESS) { - return false; - } - - uint32_t wavefront_size = 0; - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE, - &wavefront_size)) { - return false; - } - assert(wavefront_size > 0); - - // Setting it the same as used LDS. - workGroupInfo_.localMemSize_ = workgroupGroupSegmentByteSize_; - workGroupInfo_.privateMemSize_ = workitemPrivateSegmentByteSize_; - workGroupInfo_.usedLDSSize_ = workgroupGroupSegmentByteSize_; - workGroupInfo_.preferredSizeMultiple_ = wavefront_size; - workGroupInfo_.usedSGPRs_ = 0; - workGroupInfo_.usedStackSize_ = 0; - workGroupInfo_.usedVGPRs_ = 0; - workGroupInfo_.wavefrontPerSIMD_ = - program_->dev().info().maxWorkItemSizes_[0] / wavefront_size; - workGroupInfo_.wavefrontSize_ = wavefront_size; - if (workGroupInfo_.compileSize_[0] != 0) { - workGroupInfo_.size_ = - workGroupInfo_.compileSize_[0] * - workGroupInfo_.compileSize_[1] * - workGroupInfo_.compileSize_[2]; - } - else { - workGroupInfo_.size_ = program_->dev().info().maxWorkGroupSize_; - } - - // Pull out printf metadata from the ELF - size_t sizeOfPrintfList; - errorCode = g_complibApi._aclQueryInfo(compileHandle, program_->binaryElf(), RT_GPU_PRINTF_ARRAY, - openClKernelName.c_str(), NULL, &sizeOfPrintfList); - if (errorCode != ACL_SUCCESS){ - return false; - } - - // Make sure kernel has any printf info - if (0 != sizeOfPrintfList) { - std::unique_ptr aclPrintfList(new char[sizeOfPrintfList]); - if (!aclPrintfList) { - return false; - 
} - errorCode = g_complibApi._aclQueryInfo( - compileHandle, program_->binaryElf(), RT_GPU_PRINTF_ARRAY, - openClKernelName.c_str(), aclPrintfList.get(), &sizeOfPrintfList); - if (errorCode != ACL_SUCCESS) { - return false; - } - - // Set the Printf List - initPrintf(reinterpret_cast(aclPrintfList.get())); - } - return true; -} - -void Kernel::initPrintf(const aclPrintfFmt* aclPrintf) { - PrintfInfo info; - uint index = 0; - for (; aclPrintf->struct_size != 0; aclPrintf++) { - index = aclPrintf->ID; - if (printf_.size() <= index) { - printf_.resize(index + 1); - } - std::string pfmt = aclPrintf->fmtStr; - size_t pos = 0; - for (size_t i = 0; i < pfmt.size(); ++i) { - char symbol = pfmt[pos++]; - if (symbol == '\\') { - // Rest of the C escape sequences (e.g. \') are handled correctly - // by the MDParser, we are not sure exactly how! - switch (pfmt[pos]) { - case 'a': - pos++; - symbol = '\a'; - break; - case 'b': - pos++; - symbol = '\b'; - break; - case 'f': - pos++; - symbol = '\f'; - break; - case 'n': - pos++; - symbol = '\n'; - break; - case 'r': - pos++; - symbol = '\r'; - break; - case 'v': - pos++; - symbol = '\v'; - break; - case '7': - if (pfmt[++pos] == '2') { - pos++; - i++; - symbol = '\72'; - } - break; - default: - break; - } - } - info.fmtString_.push_back(symbol); - } - info.fmtString_ += "\n"; - uint32_t* tmp_ptr = const_cast(aclPrintf->argSizes); - for (uint i = 0; i < aclPrintf->numSizes; i++, tmp_ptr++) { - info.arguments_.push_back(*tmp_ptr); - } - printf_[index] = info; - info.arguments_.clear(); - } -} - - -Kernel::~Kernel() { - while (!hsailArgList_.empty()) { - HsailKernelArg* kernelArgPointer = hsailArgList_.back(); - delete kernelArgPointer; - hsailArgList_.pop_back(); - } -} - -} // namespace roc -#endif // WITHOUT_HSA_BACKEND diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp deleted file mode 100644 index 6a68e67aa5..0000000000 --- 
a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp +++ /dev/null @@ -1,195 +0,0 @@ -// -// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved. -// -#pragma once - -#include -#include "acl.h" -#include "rocprogram.hpp" -#include "top.hpp" -#include "rocprintf.hpp" - -#ifndef WITHOUT_HSA_BACKEND - -namespace roc { - -#define MAX_INFO_STRING_LEN 0x40 -enum HSAIL_ADDRESS_QUALIFIER{ -HSAIL_ADDRESS_ERROR=0, -HSAIL_ADDRESS_GLOBAL, -HSAIL_ADDRESS_LOCAL, -HSAIL_MAX_ADDRESS_QUALIFIERS -} ; - -enum HSAIL_ARG_TYPE{ -HSAIL_ARGTYPE_ERROR=0, -HSAIL_ARGTYPE_POINTER, -HSAIL_ARGTYPE_VALUE, -HSAIL_ARGTYPE_IMAGE, -HSAIL_ARGTYPE_SAMPLER, -HSAIL_ARGMAX_ARG_TYPES -}; - -enum HSAIL_DATA_TYPE{ -HSAIL_DATATYPE_ERROR=0, -HSAIL_DATATYPE_B1, -HSAIL_DATATYPE_B8, -HSAIL_DATATYPE_B16, -HSAIL_DATATYPE_B32, -HSAIL_DATATYPE_B64, -HSAIL_DATATYPE_S8, -HSAIL_DATATYPE_S16, -HSAIL_DATATYPE_S32, -HSAIL_DATATYPE_S64, -HSAIL_DATATYPE_U8, -HSAIL_DATATYPE_U16, -HSAIL_DATATYPE_U32, -HSAIL_DATATYPE_U64, -HSAIL_DATATYPE_F16, -HSAIL_DATATYPE_F32, -HSAIL_DATATYPE_F64, -HSAIL_DATATYPE_STRUCT, -HSAIL_DATATYPE_OPAQUE, -HSAIL_DATATYPE_MAX_TYPES -}; - -enum HSAIL_ACCESS_TYPE { - HSAIL_ACCESS_TYPE_NONE = 0, - HSAIL_ACCESS_TYPE_RO, - HSAIL_ACCESS_TYPE_WO, - HSAIL_ACCESS_TYPE_RW -}; - -struct HsailKernelArg -{ - std::string name_; //!< Argument's name - std::string typeName_; //!< Argument's type name - uint size_; //!< Size in bytes - uint offset_; //!< Argument's offset - uint alignment_; //!< Argument's alignment - HSAIL_ARG_TYPE type_; //!< Type of the argument - HSAIL_ADDRESS_QUALIFIER addrQual_; //!< Address qualifier of the argument - HSAIL_DATA_TYPE dataType_; //!< The type of data - uint numElem_; //!< Number of elements - HSAIL_ACCESS_TYPE access_; //!< Access type for the argument -}; - -class KernelArg -{ -public: - KernelArg(aclArgData* argInfo); - //! Return type of the argument - clk_value_type_t amdoclType(); - //! 
Global, local etc - returns amdocl types - clk_address_space_t amdoclAddrQual(); - //! Global,localetc - returns opencl type - cl_kernel_arg_address_qualifier oclAddrQual(); - //! read , write etc - returns amdocl type - clk_arg_qualifier_t amdoclAccessQual(); - //! read , write etc - returns opencl type type - cl_kernel_arg_access_qualifier oclAccessQual(); - //! const,volatile,restrict etc - returns opencl type type - cl_kernel_arg_type_qualifier oclTypeQual(); - - //! Name of the argument - std::string& name(); - //! Name of the argument - std::string& typeName(); - //! reflection - std::string reflection(){ return name(); }; - //! Returns the size of the argument - int size(); - //! returns the offset - int offset(); - - void setOffset(); - -private: - aclArgData* argInfo_; - int offset_; - std::string name_; - std::string typeName_; -}; - -class Kernel : public device::Kernel -{ -public: - Kernel(std::string name, - HSAILProgram* prog, - const uint64_t &kernelCodeHandle, - const uint32_t workgroupGroupSegmentByteSize, - const uint32_t workitemPrivateSegmentByteSize, - const uint32_t kernargSegmentByteSize, - const uint32_t kernargSegmentAlignment, - uint extraArgsNum); - - const uint64_t& KernelCodeHandle() { - return kernelCodeHandle_; - } - - const uint32_t WorkgroupGroupSegmentByteSize() const { - return workgroupGroupSegmentByteSize_; - } - - const uint32_t workitemPrivateSegmentByteSize() const { - return workitemPrivateSegmentByteSize_; - } - - const uint64_t KernargSegmentByteSize() const { - return kernargSegmentByteSize_; - } - - const uint8_t KernargSegmentAlignment() const { - return kernargSegmentAlignment_; - } - - ~Kernel(); - - //! Initializes the metadata required for this kernel - bool init(); - - const HSAILProgram* program() { - return static_cast(program_); - } - - //! Returns a pointer to the hsail argument at the specified index - HsailKernelArg* hsailArgAt(size_t index) const { - return hsailArgList_[index]; - } - - //! 
Max number of possible extra (hidden) kernel arguments - static const uint MaxExtraArgumentsNum = 6; - - uint extraArgumentsNum() const { return extraArgumentsNum_; } - - //! Return printf info array - const std::vector& printfInfo() const {return printf_;} - -private: - //! Populates hsailArgList_ - void initArgList(const aclArgData* aclArg); - - //! Initializes Hsail Argument metadata and info ; - void initHsailArgs(const aclArgData* aclArg); - - //! Initializes HSAIL Printf metadata and info - void initPrintf(const aclPrintfFmt* aclPrintf); - - HSAILProgram *program_; //!< The roc::HSAILProgram context - std::vector hsailArgList_; //!< Vector list of HSAIL Arguments - std::string compileOptions_; //!< compile used for finalizing this kernel - uint64_t kernelCodeHandle_; //!< Kernel code handle (aka amd_kernel_code_t) - const uint32_t workgroupGroupSegmentByteSize_; - const uint32_t workitemPrivateSegmentByteSize_; - const uint32_t kernargSegmentByteSize_; - const uint32_t kernargSegmentAlignment_; - size_t kernelDirectiveOffset_; - const uint extraArgumentsNum_; // Number of arguments in Kernenv - std::vector printf_; -}; - -} // namespace roc - -#endif // WITHOUT_HSA_BACKEND - - diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp deleted file mode 100644 index e1eb87602e..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp +++ /dev/null @@ -1,783 +0,0 @@ -// -// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. 
-// - -#ifndef WITHOUT_HSA_BACKEND - -#if !defined(_WIN32) -#include -#endif - -#include "CL/cl_ext.h" - -#include "utils/util.hpp" -#include "device/device.hpp" -#include "device/rocm/rocmemory.hpp" -#include "device/rocm/rocdevice.hpp" -#include "device/rocm/rocblit.hpp" -#include "device/rocm/rocglinterop.hpp" -#include "thread/monitor.hpp" -#include "platform/memory.hpp" -#include "platform/sampler.hpp" -#include "api/opencl/amdocl/cl_gl_amd.hpp" - -namespace roc { - -/////////////////////////////////roc::Memory////////////////////////////// -Memory::Memory(const roc::Device &dev, amd::Memory &owner) - : device::Memory(owner), - dev_(dev), - deviceMemory_(NULL), - kind_(MEMORY_KIND_NORMAL) -{ -} - -Memory::~Memory() -{ - dev_.removeVACache(this); -} - -bool -Memory::allocateMapMemory(size_t allocationSize) -{ - assert(mapMemory_ == NULL); - - void *mapData = NULL; - - amd::Memory* mapMemory = dev_.findMapTarget(owner()->getSize()); - - if (mapMemory == nullptr) { - // Create buffer object to contain the map target. - mapMemory = - new(owner()->getContext()) amd::Buffer( - owner()->getContext(), CL_MEM_ALLOC_HOST_PTR, owner()->getSize()); - - if ((mapMemory == NULL) || (!mapMemory->create())) { - LogError("[OCL] Fail to allocate map target object"); - dev_.hostFree(mapData); - if (mapMemory) { - mapMemory->release(); - } - return false; - } - - roc::Memory* hsaMapMemory = reinterpret_cast( - mapMemory->getDeviceMemory(dev_)); - if (hsaMapMemory == nullptr) { - mapMemory->release(); - return false; - } - } - - mapMemory_ = mapMemory; - - return true; -} - -void* -Memory::allocMapTarget( - const amd::Coord3D &origin, - const amd::Coord3D ®ion, - uint mapFlags, - size_t *rowPitch, - size_t *slicePitch) -{ - // Map/Unmap must be serialized. - amd::ScopedLock lock(owner()->lockMemoryOps()); - - incIndMapCount(); - - // If the device backing storage is direct accessible, use it. 
- if (isHostMemDirectAccess()) { - if (owner()->getHostMem() != nullptr) { - return (static_cast(owner()->getHostMem()) + origin[0]); - } - - return (static_cast(deviceMemory_) + origin[0]); - } - - // Otherwise, check for host memory. - void *hostMem = owner()->getHostMem(); - if (hostMem != NULL) { - return (static_cast(hostMem) + origin[0]); - } - - // Allocate one if needed. - if (indirectMapCount_ == 1) { - if (!allocateMapMemory(owner()->getSize())) { - decIndMapCount(); - return NULL; - } - } - else { - // Did the map resource allocation fail? - if (mapMemory_ == NULL) { - LogError("Could not map target resource"); - return NULL; - } - } - - roc::Memory* hsaMapMemory = reinterpret_cast( - mapMemory_->getDeviceMemory(dev_)); - return reinterpret_cast
(hsaMapMemory->getDeviceMemory()) + origin[0]; -} - -void -Memory::decIndMapCount() -{ - // Map/Unmap must be serialized. - amd::ScopedLock lock(owner()->lockMemoryOps()); - - if (indirectMapCount_ == 0) { - LogError("decIndMapCount() called when indirectMapCount_ already zero"); - return; - } - - // Decrement the counter and release indirect map if it's the last op - if (--indirectMapCount_ == 0 && - mapMemory_ != NULL) { - if (!dev_.addMapTarget(mapMemory_)) { - // Release the buffer object containing the map data. - mapMemory_->release(); - } - mapMemory_ = nullptr; - } -} - -void * -Memory::cpuMap( - device::VirtualDevice& vDev, - uint flags, - uint startLayer, - uint numLayers, - size_t* rowPitch, - size_t* slicePitch) -{ - // Create the map target. - void * mapTarget = - allocMapTarget(amd::Coord3D(0), amd::Coord3D(0), 0, rowPitch, slicePitch); - - assert(mapTarget != NULL); - - if (!isHostMemDirectAccess()) { - if (!vDev.blitMgr().readBuffer( - *this, mapTarget, amd::Coord3D(0), amd::Coord3D(size()), true)) { - decIndMapCount(); - return NULL; - } - } - - return mapTarget; -} - -void -Memory::cpuUnmap(device::VirtualDevice& vDev) -{ - if (!isHostMemDirectAccess()) { - if (!vDev.blitMgr().writeBuffer( - mapMemory_->getHostMem(), *this, amd::Coord3D(0), - amd::Coord3D(size()), true)) { - LogError("[OCL] Fail sync the device memory on cpuUnmap"); - } - } - - decIndMapCount(); -} - -// Setup an interop buffer (dmabuf handle) as an OpenCL buffer -bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metadata_size, const hsa_amd_image_descriptor_t** metadata) -{ -#if defined(_WIN32) - return false; -#else - assert(owner()->isInterop() && "Object is not an interop object."); - - mesa_glinterop_export_in in; - mesa_glinterop_export_out out; - - in.size=sizeof(mesa_glinterop_export_in); - out.size=sizeof(mesa_glinterop_export_out); - - if(owner()->getMemFlags() & CL_MEM_READ_ONLY) - in.access=MESA_GLINTEROP_ACCESS_READ_ONLY; - else 
if(owner()->getMemFlags() & CL_MEM_WRITE_ONLY) - in.access=MESA_GLINTEROP_ACCESS_WRITE_ONLY; - else - in.access=MESA_GLINTEROP_ACCESS_READ_WRITE; - - in.target = targetType; - in.obj=owner()->getInteropObj()->asGLObject()->getGLName(); - in.miplevel=miplevel; - in.out_driver_data_size=0; - in.out_driver_data=NULL; - - if(!dev_.mesa().Export(in, out)) - return false; - - size_t size; - hsa_agent_t agent=dev_.getBackendDevice(); - hsa_status_t status=hsa_amd_interop_map_buffer(1, &agent, out.dmabuf_fd, 0, &size, &deviceMemory_, metadata_size, (const void**)metadata); - close(out.dmabuf_fd); - - if(status!=HSA_STATUS_SUCCESS) - return false; - - kind_=MEMORY_KIND_INTEROP; - assert(deviceMemory_!=NULL && "Interop map failed to produce a pointer!"); - - return true; -#endif -} - -void Memory::destroyInteropBuffer() -{ - assert(kind_==MEMORY_KIND_INTEROP && "Memory must be interop type."); - hsa_amd_interop_unmap_buffer(deviceMemory_); - deviceMemory_=NULL; -} - -/////////////////////////////////roc::Buffer////////////////////////////// - -Buffer::Buffer(const roc::Device &dev, amd::Memory &owner) - : roc::Memory(dev, owner) -{} - -Buffer::~Buffer() -{ - destroy(); -} - -void -Buffer::destroy() -{ - if (owner()->parent() != NULL) { - return; - } - - if(kind_==MEMORY_KIND_INTEROP) - { - destroyInteropBuffer(); - return; - } - - const cl_mem_flags memFlags = owner()->getMemFlags(); - - if ((deviceMemory_ != nullptr) && - (deviceMemory_ != owner()->getHostMem())) { - // if they are identical, the host pointer will be - // deallocated later on => avoid double deallocation - if (isHostMemDirectAccess()) { - if (memFlags & CL_MEM_USE_HOST_PTR) { - if (dev_.agent_profile() != HSA_PROFILE_FULL) { - hsa_amd_memory_unlock(owner()->getHostMem()); - } - } - } - else { - dev_.deviceLocalFree(deviceMemory_, size()); - } - } - - if (memFlags & CL_MEM_USE_HOST_PTR) { - if (dev_.agent_profile() == HSA_PROFILE_FULL) { - hsa_memory_deregister(owner()->getHostMem(), size()); - } - } -} - 
-bool -Buffer::create() -{ - //Interop buffer - if(owner()->isInterop()) - return createInteropBuffer(GL_ARRAY_BUFFER, 0, NULL, NULL); - - if (owner()->parent()) { - // Sub-Buffer creation. - roc::Memory *parentBuffer = - static_cast(owner()->parent()->getDeviceMemory(dev_)); - - if (parentBuffer == NULL) { - LogError("[OCL] Fail to allocate parent buffer"); - return false; - } - - const size_t offset = owner()->getOrigin(); - deviceMemory_ = - static_cast(parentBuffer->getDeviceMemory()) + offset; - - flags_ |= SubMemoryObject; - flags_ |= - parentBuffer->isHostMemDirectAccess() ? HostMemoryDirectAccess : 0; - return true; - } - - // Allocate backing storage in device local memory unless UHP or AHP are set - const cl_mem_flags memFlags = owner()->getMemFlags(); - if (!(memFlags & (CL_MEM_USE_HOST_PTR | - CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD))) { - deviceMemory_ = dev_.deviceLocalAlloc(size()); - - if (deviceMemory_ == NULL) { - // TODO: device memory is not enabled yet. - // Fallback to system memory if exist. - - flags_ |= HostMemoryDirectAccess; - if (dev_.agent_profile() == HSA_PROFILE_FULL && - owner()->getHostMem() != NULL) { - deviceMemory_ = owner()->getHostMem(); - assert( - amd::isMultipleOf( - deviceMemory_, - static_cast(dev_.info().memBaseAddrAlign_))); - return true; - } - - deviceMemory_ = dev_.hostAlloc(size(), 1, false); - } - - assert( - amd::isMultipleOf( - deviceMemory_, - static_cast(dev_.info().memBaseAddrAlign_))); - - if (deviceMemory_ && (memFlags & CL_MEM_COPY_HOST_PTR)) { - // To avoid recurssive call to Device::createMemory, we perform - // data transfer to the view of the buffer. 
- amd::Buffer *bufferView = new (owner()->getContext()) amd::Buffer( - *owner(), 0, owner()->getOrigin(), owner()->getSize()); - bufferView->create(); - - roc::Buffer *devBufferView = - new roc::Buffer(dev_, *bufferView); - devBufferView->deviceMemory_ = deviceMemory_; - - bufferView->replaceDeviceMemory(&dev_, devBufferView); - - bool ret = dev_.xferMgr().writeBuffer( - owner()->getHostMem(), *devBufferView, amd::Coord3D(0), - amd::Coord3D(size()), true); - - if (!ret) { - dev_.deviceLocalFree(deviceMemory_, size()); - deviceMemory_ = NULL; - } - - bufferView->release(); - return ret; - } - - return deviceMemory_ != NULL; - } - else if (memFlags & CL_MEM_USE_PERSISTENT_MEM_AMD) { - deviceMemory_ = dev_.hostAlloc(size(), 1, false); - if (deviceMemory_ != nullptr) { - if (owner()->getHostMem() != nullptr) { - memcpy(deviceMemory_, owner()->getHostMem(), size()); - } - flags_ |= HostMemoryDirectAccess; - } - return deviceMemory_ != nullptr; - } - - assert(owner()->getHostMem() != NULL); - - flags_ |= HostMemoryDirectAccess; - - if (dev_.agent_profile() == HSA_PROFILE_FULL) { - deviceMemory_ = owner()->getHostMem(); - - if (memFlags & CL_MEM_USE_HOST_PTR) { - hsa_memory_register(deviceMemory_, size()); - } - - return deviceMemory_ != NULL; - } - - if (owner()->getSvmPtr() != owner()->getHostMem()) { - if (memFlags & CL_MEM_USE_HOST_PTR) { - hsa_agent_t agent = dev_.getBackendDevice(); - hsa_status_t status = hsa_amd_memory_lock( - owner()->getHostMem(), owner()->getSize(), &agent, 1, &deviceMemory_); - if (status != HSA_STATUS_SUCCESS) { - deviceMemory_ = nullptr; - } - } - else { - deviceMemory_ = owner()->getHostMem(); - } - } - else { - deviceMemory_ = owner()->getHostMem(); - } - - return deviceMemory_ != NULL; -} - -/////////////////////////////////roc::Image////////////////////////////// -typedef struct ChannelOrderMap { - uint32_t cl_channel_order; - hsa_ext_image_channel_order_t hsa_channel_order; -} ChannelOrderMap; - -typedef struct ChannelTypeMap { - 
uint32_t cl_channel_type; - hsa_ext_image_channel_type_t hsa_channel_type; -} ChannelTypeMap; - -static const ChannelOrderMap kChannelOrderMapping[] = { - { CL_R, HSA_EXT_IMAGE_CHANNEL_ORDER_R }, - { CL_A, HSA_EXT_IMAGE_CHANNEL_ORDER_A }, - { CL_RG, HSA_EXT_IMAGE_CHANNEL_ORDER_RG }, - { CL_RA, HSA_EXT_IMAGE_CHANNEL_ORDER_RA }, - { CL_RGB, HSA_EXT_IMAGE_CHANNEL_ORDER_RGB }, - { CL_RGBA, HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA }, - { CL_BGRA, HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA }, - { CL_ARGB, HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB }, - { CL_INTENSITY, HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY }, - { CL_LUMINANCE, HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE }, - { CL_Rx, HSA_EXT_IMAGE_CHANNEL_ORDER_RX }, - { CL_RGx, HSA_EXT_IMAGE_CHANNEL_ORDER_RGX }, - { CL_RGBx, HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX }, - { CL_DEPTH, HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH }, - { CL_DEPTH_STENCIL, HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL }, - { CL_sRGB, HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB }, - { CL_sRGBx, HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX }, - { CL_sRGBA, HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA }, - { CL_sBGRA, HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA }, - { CL_ABGR, HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR }, -}; - -static const ChannelTypeMap kChannelTypeMapping[] = { - {CL_SNORM_INT8, HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8}, - {CL_SNORM_INT16, HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16}, - {CL_UNORM_INT8, HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8}, - {CL_UNORM_INT16, HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16}, - {CL_UNORM_SHORT_565, HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565}, - {CL_UNORM_SHORT_555, HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555}, - {CL_UNORM_INT_101010, HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010}, - {CL_SIGNED_INT8, HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8}, - {CL_SIGNED_INT16, HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16}, - {CL_SIGNED_INT32, HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32}, - {CL_UNSIGNED_INT8, HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8}, - {CL_UNSIGNED_INT16, HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16}, - {CL_UNSIGNED_INT32, 
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32}, - {CL_HALF_FLOAT, HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT}, - {CL_FLOAT, HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT}, - {CL_UNORM_INT24, HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24}, -}; - - -static hsa_access_permission_t -GetHsaAccessPermission(const cl_mem_flags flags) { - if(flags & CL_MEM_READ_ONLY) - return HSA_ACCESS_PERMISSION_RO; - else if(flags & CL_MEM_WRITE_ONLY) - return HSA_ACCESS_PERMISSION_WO; - else - return HSA_ACCESS_PERMISSION_RW; -} - -Image::Image(const roc::Device& dev, amd::Memory& owner) : - roc::Memory(dev, owner) -{ - flags_ &= (~HostMemoryDirectAccess & ~HostMemoryRegistered); - populateImageDescriptor(); - hsaImageObject_.handle = 0; - hsaImageData_ = NULL; -} - -void -Image::populateImageDescriptor() -{ - amd::Image* image = owner()->asImage(); - - // build HSA runtime image descriptor - imageDescriptor_.width = image->getWidth(); - imageDescriptor_.height = image->getHeight(); - imageDescriptor_.depth = image->getDepth(); - imageDescriptor_.array_size = 0; - - switch (image->getType()) - { - case CL_MEM_OBJECT_IMAGE1D: - imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_1D; - imageDescriptor_.height = 1; - imageDescriptor_.depth = 1; - break; - case CL_MEM_OBJECT_IMAGE1D_BUFFER: - imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_1DB; - imageDescriptor_.height = 1; - imageDescriptor_.depth = 1; - break; - case CL_MEM_OBJECT_IMAGE1D_ARRAY: - //@todo - arraySize = height ?! - imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_1DA; - imageDescriptor_.height = 1; - imageDescriptor_.array_size = image->getHeight(); - break; - case CL_MEM_OBJECT_IMAGE2D: - imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_2D; - imageDescriptor_.depth = 1; - break; - case CL_MEM_OBJECT_IMAGE2D_ARRAY: - //@todo - arraySize = depth ?! 
- imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_2DA; - imageDescriptor_.depth = 1; - imageDescriptor_.array_size = image->getDepth(); - break; - case CL_MEM_OBJECT_IMAGE3D: - imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_3D; - break; - } - - const int kChannelOrderCount = - sizeof(kChannelOrderMapping) / sizeof(ChannelOrderMap); - for (int i = 0; i < kChannelOrderCount; i++) { - if (image->getImageFormat().image_channel_order == - kChannelOrderMapping[i].cl_channel_order) { - imageDescriptor_.format.channel_order = - kChannelOrderMapping[i].hsa_channel_order; - break; - } - } - - const int kChannelTypeCount = - sizeof(kChannelTypeMapping) / sizeof(ChannelTypeMap); - for (int i = 0; i < kChannelTypeCount; i++) { - if (image->getImageFormat().image_channel_data_type == - kChannelTypeMapping[i].cl_channel_type) { - imageDescriptor_.format.channel_type = - kChannelTypeMapping[i].hsa_channel_type; - break; - } - } - - permission_ = - GetHsaAccessPermission(owner()->getMemFlags()); -} - -bool -Image::createInteropImage() -{ - auto obj=owner()->getInteropObj()->asGLObject(); - assert(obj->getCLGLObjectType()!=CL_GL_OBJECT_BUFFER && "Non-image OpenGL object used with interop image API."); - - const hsa_amd_image_descriptor_t* meta; - size_t size=0; - - GLenum glTarget = obj->getGLTarget(); - if (glTarget == GL_TEXTURE_CUBE_MAP) { - glTarget = obj->getCubemapFace(); - } - if(!createInteropBuffer(glTarget, obj->getGLMipLevel(), &size, &meta)) - { - assert(false && "Failed to map image buffer."); - return false; - } - MAKE_SCOPE_GUARD(BufferGuard, [&](){ destroyInteropBuffer(); }); - - amdImageDesc_=(hsa_amd_image_descriptor_t*)malloc(size); - if(amdImageDesc_==NULL) - return false; - MAKE_SCOPE_GUARD(DescGuard, [&](){ free(amdImageDesc_); amdImageDesc_=NULL; }); - - memcpy(amdImageDesc_, meta, size); - - image_metadata desc; - if(!desc.create(amdImageDesc_)) - return false; - - if(!desc.setMipLevel(obj->getGLMipLevel())) - return false; - - if 
(obj->getGLTarget()==GL_TEXTURE_CUBE_MAP) - desc.setFace(obj->getCubemapFace()); - - hsaImageData_=deviceMemory_; - - hsa_status_t err=hsa_amd_image_create(dev_.getBackendDevice(), &imageDescriptor_, amdImageDesc_, hsaImageData_, permission_, &hsaImageObject_); - if(err!=HSA_STATUS_SUCCESS) - return false; - - BufferGuard.Dismiss(); - DescGuard.Dismiss(); - return true; -} - -bool -Image::create() -{ - if (owner()->parent()) { - // Image view creation - roc::Memory *parent = - static_cast(owner()->parent()->getDeviceMemory(dev_)); - - if (parent == NULL) { - LogError("[OCL] Fail to allocate parent image"); - return false; - } - - return createView(*parent); - } - - //Interop image - if(owner()->isInterop()) - return createInteropImage(); - - // Get memory size requirement for device specific image. - hsa_status_t status = hsa_ext_image_data_get_info( - dev_.getBackendDevice(), &imageDescriptor_, - permission_, &deviceImageInfo_); - - if (status != HSA_STATUS_SUCCESS) { - LogError("[OCL] Fail to allocate image memory"); - return false; - } - - // roc::Device::hostAlloc and deviceLocalAlloc implementation does not - // support alignment larger than HSA memory region allocation granularity. - // In this case, the user manages the alignment. - const size_t alloc_size = - (deviceImageInfo_.alignment <= dev_.alloc_granularity()) - ? 
deviceImageInfo_.size - : deviceImageInfo_.size + deviceImageInfo_.alignment; - - if (!(owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR)) { - deviceMemory_ = dev_.deviceLocalAlloc(alloc_size); - } - - if (deviceMemory_ == NULL) { - deviceMemory_ = - dev_.hostAlloc(alloc_size, 1, false); - } - - hsaImageData_ = reinterpret_cast( - amd::alignUp(reinterpret_cast(deviceMemory_), - deviceImageInfo_.alignment)); - - assert(amd::isMultipleOf( - hsaImageData_, static_cast(deviceImageInfo_.alignment))); - - status = hsa_ext_image_create( - dev_.getBackendDevice(), &imageDescriptor_, hsaImageData_, - permission_, &hsaImageObject_); - - if (status != HSA_STATUS_SUCCESS) { - LogError("[OCL] Fail to allocate image memory"); - return false; - } - - return true; -} - -bool -Image::createView(Memory &parent) -{ - deviceMemory_ = parent.getDeviceMemory(); - - hsaImageData_ = (parent.owner()->asBuffer() != NULL) - ? deviceMemory_ - : static_cast(parent).hsaImageData_; - - kind_=parent.getKind(); - - hsa_status_t status; - if(kind_==MEMORY_KIND_INTEROP) - status = hsa_amd_image_create(dev_.getBackendDevice(), &imageDescriptor_, amdImageDesc_, hsaImageData_, permission_, &hsaImageObject_); - else - status= hsa_ext_image_create(dev_.getBackendDevice(), &imageDescriptor_, hsaImageData_, permission_, &hsaImageObject_); - - if (status != HSA_STATUS_SUCCESS) { - LogError("[OCL] Fail to allocate image memory"); - return false; - } - - return true; -} - -void* -Image::allocMapTarget( - const amd::Coord3D& origin, - const amd::Coord3D& region, - uint mapFlags, - size_t* rowPitch, - size_t* slicePitch) -{ - amd::ScopedLock lock(owner()->lockMemoryOps()); - - incIndMapCount(); - - void* pHostMem = owner()->getHostMem(); - - if (pHostMem == NULL) { - if (indirectMapCount_ == 1) { - if (!allocateMapMemory(owner()->getSize())) { - decIndMapCount(); - return NULL; - } - } - else { - // Did the map resource allocation fail? 
- if (mapMemory_ == NULL) { - LogError("Could not map target resource"); - return NULL; - } - } - - pHostMem = mapMemory_->getHostMem(); - } - - amd::Image* image = owner()->asImage(); - - size_t elementSize = image->getImageFormat().getElementSize(); - - size_t offset = origin[0] * elementSize; - - // Adjust offset with Y dimension - offset += image->getRowPitch() * origin[1]; - - // Adjust offset with Z dimension - offset += image->getSlicePitch() * origin[2]; - - *rowPitch = image->getRowPitch(); - if (slicePitch != NULL) { - *slicePitch = image->getSlicePitch(); - } - - return (static_cast(pHostMem)+offset); -} - -Image::~Image() -{ - destroy(); -} - -void -Image::destroy() -{ - if (owner()->parent() != NULL) { - return; - } - - if(kind_==MEMORY_KIND_INTEROP) - { - hsa_ext_image_destroy(dev_.getBackendDevice(), hsaImageObject_); - free(amdImageDesc_); - amdImageDesc_=NULL; - destroyInteropBuffer(); - return; - } - - if (deviceMemory_ != NULL) { - dev_.hostFree(deviceMemory_, deviceImageInfo_.size); - } - - if (hsaImageObject_.handle != 0) { - hsa_status_t status = - hsa_ext_image_destroy(dev_.getBackendDevice(), hsaImageObject_); - assert(status == HSA_STATUS_SUCCESS); - } -} -} -#endif // WITHOUT_HSA_BACKEND diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp deleted file mode 100644 index a561ad3f86..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp +++ /dev/null @@ -1,188 +0,0 @@ -// -// Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. 
-// -#pragma once - -#ifndef WITHOUT_HSA_BACKEND - -#include "top.hpp" -#include "platform/memory.hpp" -#include "utils/debug.hpp" -#include "device/rocm/rocdevice.hpp" -#include "device/rocm/rocglinterop.hpp" - -namespace roc { -class Memory : public device::Memory { - public: - enum MEMORY_KIND { MEMORY_KIND_NORMAL=0, MEMORY_KIND_LOCK, MEMORY_KIND_GART, MEMORY_KIND_INTEROP }; - - Memory(const roc::Device &dev, amd::Memory &owner); - - virtual ~Memory(); - - // Getter for deviceMemory_. - void *getDeviceMemory() const { return deviceMemory_; } - - // Gets a pointer to a region of host-visible memory for use as the target - // of an indirect map for a given memory object - virtual void *allocMapTarget(const amd::Coord3D &origin, - const amd::Coord3D ®ion, - uint mapFlags, - size_t *rowPitch, - size_t *slicePitch); - - // Create device memory according to OpenCL memory flag. - virtual bool create() = 0; - - // Pins system memory associated with this memory object. - virtual bool pinSystemMemory(void *hostPtr, // System memory address - size_t size // Size of allocated system memory - ) { - Unimplemented(); - return true; - } - - // Immediate blocking write from device cache to owners's backing store. - // Marks owner as "current" by resetting the last writer to NULL. - virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags()) - { - // Need to revisit this when multi-devices is supported. - } - - // Releases indirect map surface - void releaseIndirectMap() { decIndMapCount(); } - - //! Map the device memory to CPU visible - virtual void* cpuMap( - device::VirtualDevice& vDev, //!< Virtual device for map operaiton - uint flags = 0, //!< flags for the map operation - // Optimization for multilayer map/unmap - uint startLayer = 0, //!< Start layer for multilayer map - uint numLayers = 0, //!< End layer for multilayer map - size_t* rowPitch = NULL,//!< Row pitch for the device memory - size_t* slicePitch = NULL //!< Slice pitch for the device memory - ); - - //! 
Unmap the device memory - virtual void cpuUnmap( - device::VirtualDevice& vDev //!< Virtual device for unmap operaiton - ); - - //Mesa has already decomressed if needed and also does acquire at the start of every command batch. - virtual bool processGLResource(GLResourceOP operation) { return true; } - - // Accessors for indirect map memory object - amd::Memory *mapMemory() const { return mapMemory_; } - - MEMORY_KIND getKind() const { return kind_; } - - protected: - - bool allocateMapMemory(size_t allocationSize); - - // Decrement map count - virtual void decIndMapCount(); - - // Free / deregister device memory. - virtual void destroy() = 0; - - // Place interop object into HSA's flat address space - bool createInteropBuffer(GLenum targetType, int miplevel, size_t* metadata_size, const hsa_amd_image_descriptor_t** metadata); - - void destroyInteropBuffer(); - - // Pointer to the device associated with this memory object. - const roc::Device &dev_; - - // Pointer to the device memory. This could be in system or device local mem. - void* deviceMemory_; - - // Track if this memory is interop, lock, gart, or normal. - MEMORY_KIND kind_; - - private: - // Disable copy constructor - Memory(const Memory &); - - // Disable operator= - Memory &operator=(const Memory &); - -}; - -class Buffer : public roc::Memory { - public: - Buffer(const roc::Device &dev, amd::Memory &owner); - - virtual ~Buffer(); - - // Create device memory according to OpenCL memory flag. - virtual bool create(); - - // Recreate the device memory using new size and alignment. - bool recreate(size_t newSize, size_t newAlignment, bool forceSystem); - - private: - // Disable copy constructor - Buffer(const Buffer &); - - // Disable operator= - Buffer &operator=(const Buffer &); - - // Free device memory. - void destroy(); -}; - -class Image : public roc::Memory -{ -public: - Image(const roc::Device& dev, amd::Memory& owner); - - virtual ~Image(); - - //! 
Create device memory according to OpenCL memory flag. - virtual bool create(); - - //! Create an image view - bool createView(Memory &parent); - - //! Gets a pointer to a region of host-visible memory for use as the target - //! of an indirect map for a given memory object - virtual void* allocMapTarget( - const amd::Coord3D& origin, - const amd::Coord3D& region, - uint mapFlags, - size_t* rowPitch, - size_t* slicePitch); - - size_t getDeviceDataSize() { return deviceImageInfo_.size; } - size_t getDeviceDataAlignment() { return deviceImageInfo_.alignment; } - - hsa_ext_image_t getHsaImageObject() { return hsaImageObject_; } - const hsa_ext_image_descriptor_t& getHsaImageDescriptor() const { return imageDescriptor_; } -private: - //! Disable copy constructor - Image(const Buffer&); - - //! Disable operator= - Image& operator=(const Buffer&); - - // Setup an interop image - bool createInteropImage(); - - // Free / deregister device memory. - void destroy(); - - void populateImageDescriptor(); - - hsa_ext_image_descriptor_t imageDescriptor_; - hsa_access_permission_t permission_; - hsa_ext_image_data_info_t deviceImageInfo_; - hsa_ext_image_t hsaImageObject_; - hsa_amd_image_descriptor_t* amdImageDesc_; - - const void* hsaImageData_; -}; - -} -#endif - diff --git a/projects/clr/rocclr/runtime/device/rocm/rocprintf.cpp b/projects/clr/rocclr/runtime/device/rocm/rocprintf.cpp deleted file mode 100644 index 8efbe57ba8..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/rocprintf.cpp +++ /dev/null @@ -1,467 +0,0 @@ -// -// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved. 
-// - -#include "top.hpp" -#include "os/os.hpp" -#include "device/device.hpp" -#include "device/rocm/rocdefs.hpp" -#include "device/rocm/rocmemory.hpp" -#include "device/rocm/rockernel.hpp" -#include "device/rocm/rocprogram.hpp" -#include "device/rocm/rocdevice.hpp" -#include "device/rocm/rocprintf.hpp" -#include -#include -#include - -namespace roc { - -PrintfDbg::PrintfDbg(Device& device, FILE* file) - : dbgBuffer_(NULL), - dbgBuffer_size_(0), - dbgFile_(file), - gpuDevice_(device) {} - -PrintfDbg::~PrintfDbg() { dev().hostFree(dbgBuffer_, dbgBuffer_size_); } - -bool PrintfDbg::allocate(bool realloc) { - if (NULL == dbgBuffer_) { - dbgBuffer_size_ = dev().info().printfBufferSize_; - dbgBuffer_ = reinterpret_cast
( - dev().hostAlloc(dbgBuffer_size_, sizeof(void*))); - } else if (realloc) { - LogWarning("Debug buffer reallocation!"); - // Double the buffer size if it's not big enough - dev().hostFree(dbgBuffer_, dbgBuffer_size_); - dbgBuffer_size_ = dbgBuffer_size_ << 1; - dbgBuffer_ = reinterpret_cast
(dbgBuffer_size_, sizeof(void*)); - } - - return (NULL != dbgBuffer_) ? true : false; -} - -bool PrintfDbg::checkFloat(const std::string& fmt) const { - switch (fmt[fmt.size() - 1]) { - case 'e': - case 'E': - case 'f': - case 'g': - case 'G': - case 'a': - return true; - break; - default: - break; - } - return false; -} - -bool PrintfDbg::checkString(const std::string& fmt) const { - if (fmt[fmt.size() - 1] == 's') return true; - return false; -} - -int PrintfDbg::checkVectorSpecifier(const std::string& fmt, size_t startPos, - size_t& curPos) const { - int vectorSize = 0; - size_t pos = curPos; - size_t size = curPos - startPos; - - if (size >= 3) { - size = 0; - // no modifiers - if (fmt[curPos - 3] == 'v') { - size = 2; - } - // the modifiers are "h" or "l" - else if (fmt[curPos - 4] == 'v') { - size = 3; - } - // the modifier is "hh" - else if ((curPos >= 5) && (fmt[curPos - 5] == 'v')) { - size = 4; - } - if (size > 0) { - curPos = size; - pos -= curPos; - - // Get vector size - vectorSize = fmt[pos++] - '0'; - // PrintfDbg supports only 2, 3, 4, 8 and 16 wide vectors - switch (vectorSize) { - case 1: - if ((fmt[pos++] - '0') == 6) { - vectorSize = 16; - } else { - vectorSize = 0; - } - break; - case 2: - case 3: - case 4: - case 8: - break; - default: - vectorSize = 0; - break; - } - } - } - - return vectorSize; -} - -static const size_t ConstStr = 0xffffffff; -static const char Separator[] = ",\0"; - -size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, - size_t size, const uint32_t* argument) const { - // Serialize the output to the screen - // amd::ScopedLock k(dev().lockAsyncOps()); - - size_t copiedBytes = size; - // Print the string argument, using standard PrintfDbg() - if (checkString(fmt.c_str())) { - // copiedBytes should be as number of printed chars - copiedBytes = 0; - //(null) should be printed - if (*argument == 0) { - amd::Os::printf(fmt.data(), 0); - // copiedBytes = strlen("(null)") - copiedBytes = 6; - } else { - const 
unsigned char* argumentStr = - reinterpret_cast(argument); - amd::Os::printf(fmt.data(), argumentStr); - // copiedBytes = strlen(argumentStr) - while (argumentStr[copiedBytes++] != 0) - ; - } - } - - // Print the argument(except for string ), using standard PrintfDbg() - else { - bool hlModifier = (strstr(fmt.c_str(), "hl") != NULL); - std::string hlFmt; - if (hlModifier) { - hlFmt = fmt; - hlFmt.erase(hlFmt.find_first_of("hl"), 2); - } - switch (size) { - case 0: { - const char* str = reinterpret_cast(argument); - amd::Os::printf(fmt.data(), str); - // Find the string length - while (str[copiedBytes++] != 0) - ; - } break; - case 1: - amd::Os::printf(fmt.data(), - *(reinterpret_cast(argument))); - break; - case 2: - case 4: - if (printFloat) { - static const char* fSpecifiers = "eEfgGa"; - std::string fmtF = fmt; - size_t posS = fmtF.find_first_of("%"); - size_t posE = fmtF.find_first_of(fSpecifiers); - if (posS != std::string::npos && posE != std::string::npos) { - fmtF.replace(posS + 1, posE - posS, "s"); - } - float fArg = *(reinterpret_cast(argument)); - float fSign = copysign(1.0, fArg); - if (isinf(fArg) && !isnan(fArg)) { - if (fSign < 0) { - amd::Os::printf(fmtF.data(), "-infinity"); - } else { - amd::Os::printf(fmtF.data(), "infinity"); - } - } else if (isnan(fArg)) { - if (fSign < 0) { - amd::Os::printf(fmtF.data(), "-nan"); - } else { - amd::Os::printf(fmtF.data(), "nan"); - } - } else if (hlModifier) { - amd::Os::printf(hlFmt.data(), fArg); - } else { - amd::Os::printf(fmt.data(), fArg); - } - } else { - bool hhModifier = (strstr(fmt.c_str(), "hh") != NULL); - if (hhModifier) { - // current implementation of printf in gcc 4.5.2 runtime libraries, - // doesn`t recognize "hh" modifier ==> - // argument should be explicitly converted to unsigned char (uchar) - // before printing and - // fmt should be updated not to contain "hh" modifier - std::string hhFmt = fmt; - hhFmt.erase(hhFmt.find_first_of("h"), 2); - amd::Os::printf( - hhFmt.data(), - 
*(reinterpret_cast(argument))); - } else if (hlModifier) { - amd::Os::printf(hlFmt.data(), *argument); - } else { - amd::Os::printf(fmt.data(), *argument); - } - } - break; - case 8: - if (printFloat) { - if (hlModifier) { - amd::Os::printf(hlFmt.data(), - *(reinterpret_cast(argument))); - } else { - amd::Os::printf(fmt.data(), - *(reinterpret_cast(argument))); - } - } else { - std::string out = fmt; - // Use 'll' for 64 bit printf - out.insert((out.size() - 1), 1, 'l'); - amd::Os::printf(out.data(), - *(reinterpret_cast(argument))); - } - break; - case ConstStr: { - const char* str = reinterpret_cast(argument); - amd::Os::printf(fmt.data(), str); - } break; - default: - amd::Os::printf("Error: Unsupported data size for PrintfDbg. %d bytes", - static_cast(size)); - return 0; - } - } - fflush(stdout); - return copiedBytes; -} - -void PrintfDbg::outputDbgBuffer(const PrintfInfo& info, - const uint32_t* workitemData, size_t& i) const { - static const char* specifiers = "cdieEfgGaosuxXp"; - static const char* modifiers = "hl"; - static const char* special = "%n"; - static const std::string sepStr = "%s"; - const uint32_t* s = workitemData; - size_t pos = 0; - - // Find the format string - std::string str = info.fmtString_; - std::string fmt; - size_t posStart, posEnd; - - // Print all arguments - // Note: the following code walks through all arguments, provided by the - // kernel and - // finds the corresponding specifier in the format string. - // Then it splits the original string into substrings with a single specifier - // and - // uses standard PrintfDbg() to print each argument - for (uint j = 0; j < info.arguments_.size(); ++j) { - do { - posStart = str.find_first_of("%", pos); - if (posStart != std::string::npos) { - posStart++; - // Erase all spaces after % - while (str[posStart] == ' ') { - str.erase(posStart, 1); - } - size_t tmp = str.find_first_of(special, posStart); - size_t tmp2 = str.find_first_of(specifiers, posStart); - // Special cases. 
Special symbol is located before any specifier - if (tmp < tmp2) { - posEnd = posStart + 1; - fmt = str.substr(pos, posEnd - pos); - fmt.erase(posStart - pos - 1, 1); - pos = posStart = posEnd; - outputArgument(sepStr, false, ConstStr, - reinterpret_cast(fmt.data())); - continue; - } - break; - } else if (pos < str.length()) { - outputArgument( - sepStr, false, ConstStr, - reinterpret_cast((str.substr(pos)).data())); - } - } while (posStart != std::string::npos); - - if (posStart != std::string::npos) { - bool printFloat = false; - int vectorSize = 0; - size_t length; - size_t idPos = 0; - - // Search for PrintfDbg specifier in the format string. - // It will be a split point for the output - posEnd = str.find_first_of(specifiers, posStart); - if (posEnd == std::string::npos) { - pos = posStart = posEnd; - break; - } - posEnd++; - - size_t curPos = posEnd; - vectorSize = checkVectorSpecifier(str, posStart, curPos); - - // Get substring from the last position to the current specifier - fmt = str.substr(pos, posEnd - pos); - - // Readjust the string pointer if PrintfDbg outputs a vector - if (vectorSize != 0) { - size_t posVecSpec = fmt.length() - (curPos + 1); - size_t posVecMod = fmt.find_first_of(modifiers, posVecSpec + 1); - size_t posMod = str.find_first_of(modifiers, posStart); - if (posMod < posEnd) { - fmt = fmt.erase(posVecSpec, posVecMod - posVecSpec); - } else { - fmt = fmt.erase(posVecSpec, curPos); - } - idPos = posStart - pos - 1; - } - pos = posStart = posEnd; - - // Find out if the argument is a float - printFloat = checkFloat(fmt); - - // Is it a scalar value? - if (vectorSize == 0) { - length = outputArgument(fmt, printFloat, info.arguments_[j], &s[i]); - if (0 == length) { - return; - } - i += amd::alignUp(length, sizeof(uint32_t)) / sizeof(uint32_t); - } else { - // 3-component vector's size is defined as 4 * size of each scalar - // component - size_t elemSize = - info.arguments_[j] / (vectorSize == 3 ? 
4 : vectorSize); - size_t k = i * sizeof(uint32_t); - std::string elementStr = fmt.substr(idPos, fmt.size()); - - // Print first element with full string - if (0 == outputArgument(fmt, printFloat, elemSize, &s[i])) { - return; - } - - // Print other elemnts with separator if available - for (int e = 1; e < vectorSize; ++e) { - const char* t = reinterpret_cast(s); - // Output the vector separator - outputArgument(sepStr, false, ConstStr, - reinterpret_cast(Separator)); - - // Output the next element - outputArgument( - elementStr, printFloat, elemSize, - reinterpret_cast(&t[k + e * elemSize])); - } - i += (amd::alignUp(info.arguments_[j], sizeof(uint32_t))) / - sizeof(uint32_t); - } - } else { - amd::Os::printf( - "Error: The arguments don't match the printf format string. " - "printf(%s)", - info.fmtString_.data()); - return; - } - } - - if (pos != std::string::npos) { - fmt = str.substr(pos, str.size() - pos); - outputArgument(sepStr, false, ConstStr, - reinterpret_cast(fmt.data())); - } -} - -bool PrintfDbg::init(bool printfEnabled) { - // Set up debug output buffer (if printf active) - if (printfEnabled) { - if (!allocate()) { - return false; - } - - // The first two DWORDs in the printf buffer are as follows: - // First DWORD = Offset to where next information is to - // be written, initialized to 0 - // Second DWORD = Number of bytes available for printf data - // = buffer size \96 2*sizeof(uint32_t) - const uint8_t initSize = 2 * sizeof(uint32_t); - uint8_t sysMem[initSize]; - memset(sysMem, 0, initSize); - uint32_t dbgBufferSize = dbgBuffer_size_ - initSize; - memcpy(&sysMem[4], &dbgBufferSize, sizeof(dbgBufferSize)); - - // Copy offset and number of bytes available for printf data - // into the corresponding location in the debug buffer - hsa_status_t err = - hsa_memory_copy(dbgBuffer_, sysMem, 2 * sizeof(uint32_t)); - if (err != HSA_STATUS_SUCCESS) { - LogError("\n Can't copy offset and bytes available data to dgbBuffer_!"); - return false; - } - } - 
return true; -} - -bool PrintfDbg::output(VirtualGPU& gpu, bool printfEnabled, - const std::vector& printfInfo) { - if (printfEnabled) { - uint32_t offsetSize = 0; - - // Wait until outstanding kernels finish - gpu.releaseGpuMemoryFence(); - - // Get memory pointer to the staged buffer - uint32_t* dbgBufferPtr = reinterpret_cast(dbgBuffer_); - if (NULL == dbgBufferPtr) { - return false; - } - - offsetSize = *dbgBufferPtr; - - if (offsetSize == 0) { - LogError("\n The printf buffer is empty!"); - return false; - } - - // Get a pointer to the buffer data - dbgBufferPtr = - reinterpret_cast(dbgBuffer_ + 2 * sizeof(uint32_t)); - if (NULL == dbgBufferPtr) { - return false; - } - - std::vector::const_iterator ita; - uint sb = 0; - uint sbt = 0; - size_t idx = 1; - - // parse the debug buffer - while (sbt < offsetSize) { - assert(((*dbgBufferPtr) < printfInfo.size()) && - "Cound't find the reported PrintfID!"); - const PrintfInfo& info = printfInfo[(*dbgBufferPtr)]; - sb += sizeof(uint32_t); - for (ita = info.arguments_.begin(); ita != info.arguments_.end(); ++ita) { - sb += *ita; - } - - // There's something in the debug buffer - outputDbgBuffer(info, dbgBufferPtr, idx); - - sbt += sb; - dbgBufferPtr += sb / sizeof(uint32_t); - sb = 0; - } - } - - return true; -} - -} // namespace gpu diff --git a/projects/clr/rocclr/runtime/device/rocm/rocprintf.hpp b/projects/clr/rocclr/runtime/device/rocm/rocprintf.hpp deleted file mode 100644 index d0c8e10ca8..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/rocprintf.hpp +++ /dev/null @@ -1,115 +0,0 @@ -// -// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved. -// -#pragma once - -/*! 
\addtogroup GPU GPU Device Implementation - * @{ - */ -#ifndef isinf -#ifdef _MSC_VER -#define isinf(X) (!_finite(X) && !_isnan(X)) -#endif //_MSC_VER -#endif // isinf - -#ifndef isnan -#ifdef _MSC_VER -#define isnan(X) (_isnan(X)) -#endif //_MSC_VER -#endif // isnan - -#ifndef copysign -#ifdef _MSC_VER -#define copysign(X, Y) (_copysign(X, Y)) -#endif //_MSC_VER -#endif // copysign - -//! GPU Device Implementation -namespace roc { - -//! Printf info structure -struct PrintfInfo { - std::string fmtString_; //!< formated string for printf - std::vector arguments_; //!< passed arguments to the printf() call -}; - -class Kernel; -class VirtualGPU; -class Device; - -class PrintfDbg : public amd::HeapObject { - public: - //! Debug buffer size per workitem - static const uint WorkitemDebugSize = 4096; - - //! constructor - PrintfDbg(Device& device, FILE* file = NULL); - - //! Destructor - ~PrintfDbg(); - - //! Initializes the debug buffer before kernel's execution - bool init(bool printfEnabled //!< checks for printf - ); - - //! Prints the kernel's debug informaiton from the buffer - bool output(VirtualGPU& gpu, - bool printfEnabled, //!< checks for printf - const std::vector& printfInfo //!< printf info - ); - - //! Returns debug buffer object - address dbgBuffer() const { return dbgBuffer_; } - - protected: - address dbgBuffer_; //!< Buffer to hold debug output - size_t dbgBuffer_size_; //!< Size of the debugger buffer - FILE* dbgFile_; //!< Debug file - Device& gpuDevice_; //!< GPU device object - - //! Gets GPU device object - Device& dev() const { return gpuDevice_; } - - //! Allocates the debug buffer - bool allocate( - bool realloc = false //!< If TRUE then reallocate the debug memory - ); - - //! Returns TRUE if a float value has to be printed - bool checkFloat(const std::string& fmt //!< Format string - ) const; - - //! Returns TRUE if a string value has to be printed - bool checkString(const std::string& fmt //!< Format string - ) const; - - //! 
Finds the specifier in the format string - int checkVectorSpecifier(const std::string& fmt, //!< Format string - size_t startPos, //!< Start position for processing - size_t& curPos //!< End position for processing - ) const; - - //! Outputs an argument - size_t outputArgument(const std::string& fmt, //!< Format strint - bool printFloat, //!< Argument is a float value - size_t size, //!< Argument's size - const uint32_t* argument //!< Argument's location - ) const; - - //! Displays the PrintfDbg - void outputDbgBuffer( - const PrintfInfo& info, //!< printf info - const uint32_t* workitemData, //!< The PrintfDbg dump buffer - size_t& i //!< index to the data in the buffer - ) const; - - private: - //! Disable copy constructor - PrintfDbg(const PrintfDbg&); - - //! Disable assignment - PrintfDbg& operator=(const PrintfDbg&); -}; - -/*@}*/} // namespace roc - diff --git a/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp b/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp deleted file mode 100644 index a442da513e..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp +++ /dev/null @@ -1,845 +0,0 @@ -// -// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. 
-// - - -#ifndef WITHOUT_HSA_BACKEND - -#include "rocprogram.hpp" - -#include "compiler/lib/loaders/elf/elf.hpp" -#include "compiler/lib/utils/options.hpp" -#include "rockernel.hpp" -#include "roccompilerlib.hpp" -#include "utils/bif_section_labels.hpp" - - -#include -#include -#include -#include -#include -#include -#include - - -#endif // WITHOUT_HSA_BACKEND - -namespace roc { -#ifndef WITHOUT_HSA_BACKEND - /* Temporary log function for the compiler library */ - static void logFunction(const char *msg, size_t size) { - std::cout << "Compiler Library log :" << msg << std::endl; - } - - HSAILProgram::~HSAILProgram() { - acl_error error; - // Free the elf binary - if (binaryElf_ != NULL) { - error = g_complibApi._aclBinaryFini(binaryElf_); - if (error != ACL_SUCCESS) { - LogWarning( "Error while destroying the acl binary \n" ); - } - } - // Destroy the executable. - if (hsaExecutable_.handle != 0) { - hsa_executable_destroy(hsaExecutable_); - } - // Destroy the code object. - if (hsaProgramCodeObject_.handle != 0) { - hsa_code_object_destroy(hsaProgramCodeObject_); - } - // Destroy the program handle. 
- if (hsaProgramHandle_.handle != 0) { - hsa_ext_program_destroy(hsaProgramHandle_); - } - destroyBrigModule(); - destroyBrigContainer(); - releaseClBinary(); - } - - HSAILProgram::HSAILProgram(roc::NullDevice& device): device::Program(device), - llvmBinary_(), - binaryElf_(NULL), - device_(device), - brigModule_(NULL), - hsaBrigContainer_(NULL) - { - memset(&binOpts_, 0, sizeof(binOpts_)); - binOpts_.struct_size = sizeof(binOpts_); - //binOpts_.elfclass = LP64_SWITCH( ELFCLASS32, ELFCLASS64 ); - //Setting as 32 bit because hsail64 returns an invalid aclTargetInfo - //when aclGetTargetInfo is called - EPR# 377910 - binOpts_.elfclass = ELFCLASS32; - binOpts_.bitness = ELFDATA2LSB; - binOpts_.alloc = &::malloc; - binOpts_.dealloc = &::free; - hsaProgramHandle_.handle = 0; - hsaProgramCodeObject_.handle = 0; - hsaExecutable_.handle = 0; - } - - bool HSAILProgram::initClBinary(char *binaryIn, size_t size) { // Save the - // original - // binary that - // isn't owned - // by ClBinary - clBinary()->saveOrigBinary(binaryIn, size); - - char *bin = binaryIn; - size_t sz = size; - - int encryptCode; - - char *decryptedBin; - size_t decryptedSize; - if (!clBinary()->decryptElf(binaryIn, size, - &decryptedBin, &decryptedSize, &encryptCode)) { - return false; - } - if (decryptedBin != NULL) { - // It is decrypted binary. - bin = decryptedBin; - sz = decryptedSize; - } - - // Both 32-bit and 64-bit are allowed! - if (!amd::isElfMagic(bin)) { - // Invalid binary. 
- if (decryptedBin != NULL) { - delete[]decryptedBin; - } - return false; - } - - clBinary()->setFlags(encryptCode); - - return clBinary()->setBinary(bin, sz, (decryptedBin != NULL)); - } - - - bool HSAILProgram::initBuild(amd::option::Options *options) { - compileOptions_ = options->origOptionStr; - - if (!device::Program::initBuild(options)) { - return false; - } - // Elf Binary setup - std::string outFileName; - - // true means hsail required - clBinary()->init(options, true); - if (options->isDumpFlagSet(amd::option::DUMP_BIF)) { - outFileName = options->getDumpFileName(".bin"); - } - - bool useELF64 = getCompilerOptions()->oVariables->EnableGpuElf64; - if (!clBinary()->setElfOut(useELF64 ? ELFCLASS64 : ELFCLASS32, - (outFileName.size() > - 0) ? outFileName.c_str() : NULL)) { - LogError("Setup elf out for gpu failed"); - return false; - } - return true; - } - - // ! post-compile setup for GPU - bool HSAILProgram::finiBuild(bool isBuildGood) { - clBinary()->resetElfOut(); - clBinary()->resetElfIn(); - - if (!isBuildGood) { - // Prevent the encrypted binary form leaking out - clBinary()->setBinary(NULL, 0); - - } - - return device::Program::finiBuild(isBuildGood); - } - - static char *readFile(std::string source_filename, size_t &size) { - FILE *fp = ::fopen(source_filename.c_str(), "rb"); - unsigned int length; - size_t offset = 0; - char *ptr; - - if (!fp) { - return NULL; - } - - // obtain file size. 
- ::fseek(fp, 0, SEEK_END); - length = ::ftell(fp); - ::rewind(fp); - - ptr = reinterpret_cast(malloc(offset + length + 1)); - if (length != fread(&ptr[offset], 1, length, fp)) { - free(ptr); - return NULL; - } - - ptr[offset + length] = '\0'; - size = offset + length; - ::fclose(fp); - return ptr; - } - - aclType HSAILProgram::getCompilationStagesFromBinary(std::vector& completeStages, bool& needOptionsCheck) - { - acl_error errorCode; - size_t secSize = 0; - completeStages.clear(); - aclType from = ACL_TYPE_DEFAULT; - needOptionsCheck = true; - size_t boolSize = sizeof(bool); - //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT? - // Checking llvmir in .llvmir section - bool containsLlvmirText = true; - errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_LLVMIR, NULL, &containsLlvmirText, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsLlvmirText = false; - } - // Checking compile & link options in .comment section - bool containsOpts = true; - errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_OPTIONS, NULL, &containsOpts, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsOpts = false; - } - if (containsLlvmirText && containsOpts) { - completeStages.push_back(from); - from = ACL_TYPE_LLVMIR_BINARY; - } - // Checking HSAIL in .cg section - bool containsHsailText = true; - errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_HSAIL, NULL, &containsHsailText, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsHsailText = false; - } - // Checking BRIG sections - bool containsBrig = true; - errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_BRIG, NULL, &containsBrig, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsBrig = false; - } - if (containsBrig) { - completeStages.push_back(from); - from = ACL_TYPE_HSAIL_BINARY; - // Here we should check that CG stage was done. 
- // Right now there are 2 criterions to check it (besides BRIG itself): - // 1. matadata symbols symOpenclKernel for every kernel. - // 2. HSAIL text in aclCODEGEN section. - // Unfortunately there is no appropriate way in Compiler Lib to check 1. - // because kernel names are unknown here, therefore only 2. - if (containsHsailText) { - completeStages.push_back(from); - from = ACL_TYPE_CG; - } - } - else if (containsHsailText) { - completeStages.push_back(from); - from = ACL_TYPE_HSAIL_TEXT; - } - // Checking ISA in .text section - bool containsShaderIsa = true; - errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_ISA, NULL, &containsShaderIsa, &boolSize); - if (errorCode != ACL_SUCCESS) { - containsShaderIsa = false; - } - if (containsShaderIsa) { - completeStages.push_back(from); - from = ACL_TYPE_ISA; - } - std::string sCurOptions = compileOptions_ + linkOptions_; - amd::option::Options curOptions; - if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { - buildLog_ += curOptions.optionsLog(); - LogError("Parsing compile options failed."); - return ACL_TYPE_DEFAULT; - } - switch (from) { - // compile from HSAIL text, no matter prev. stages and options - case ACL_TYPE_HSAIL_TEXT: - needOptionsCheck = false; - break; - case ACL_TYPE_HSAIL_BINARY: - case ACL_TYPE_CG: - // do not check options, if LLVMIR is absent or might be absent or options are absent - if (curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { - needOptionsCheck = false; - } - break; - case ACL_TYPE_ISA: - // do not check options, if LLVMIR is absent or might be absent or options are absent - if (curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) { - needOptionsCheck = false; - } - if (containsBrig && containsHsailText && curOptions.oVariables->BinHSAIL) { - needOptionsCheck = false; - // recompile from prev. 
stage, if BRIG || HSAIL are absent - } else { - from = completeStages.back(); - completeStages.pop_back(); - needOptionsCheck = true; - } - break; - // recompilation might be needed - case ACL_TYPE_LLVMIR_BINARY: - case ACL_TYPE_DEFAULT: - default: - break; - } - return from; - } - - aclType HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) { - aclType continueCompileFrom = ACL_TYPE_DEFAULT; - binary_t binary = this->binary(); - // If the binary already exists - if ((binary.first != NULL) && (binary.second > 0)) { - void *mem = const_cast(binary.first); - acl_error errorCode; - binaryElf_ = g_complibApi._aclReadFromMem(mem, binary.second, &errorCode); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error while BRIG Codegen phase: aclReadFromMem failure \n" ; - LogWarning("aclReadFromMem failed"); - return continueCompileFrom; - } - // Calculate the next stage to compile from, based on sections in binaryElf_; - // No any validity checks here - std::vector completeStages; - bool needOptionsCheck = true; - continueCompileFrom = getCompilationStagesFromBinary(completeStages, needOptionsCheck); - // Saving binary in the interface class, - // which also load compile & link options from binary - setBinary(static_cast(mem), binary.second); - if (!options || !needOptionsCheck) { - return continueCompileFrom; - } - bool recompile = false; - //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT? 
- switch (continueCompileFrom) { - case ACL_TYPE_HSAIL_BINARY: - case ACL_TYPE_CG: - case ACL_TYPE_ISA: { - // Compare options loaded from binary with current ones, recompile if differ; - // If compile options are absent in binary, do not compare and recompile - if (compileOptions_.empty()) - break; - const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions); - assert(symbol && "symbol not found"); - std::string symName = std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]); - size_t symSize = 0; - const void *opts = g_complibApi._aclExtractSymbol(device().compiler(), - binaryElf_, &symSize, aclCOMMENT, symName.c_str(), &errorCode); - if (errorCode != ACL_SUCCESS) { - recompile = true; - break; - } - std::string sBinOptions = std::string((char*)opts, symSize); - std::string sCurOptions = compileOptions_ + linkOptions_; - amd::option::Options curOptions, binOptions; - if (!amd::option::parseAllOptions(sBinOptions, binOptions)) { - buildLog_ += binOptions.optionsLog(); - LogError("Parsing compile options from binary failed."); - return ACL_TYPE_DEFAULT; - } - if (!amd::option::parseAllOptions(sCurOptions, curOptions)) { - buildLog_ += curOptions.optionsLog(); - LogError("Parsing compile options failed."); - return ACL_TYPE_DEFAULT; - } - if (!curOptions.equals(binOptions)) { - recompile = true; - } - break; - } - default: - break; - } - if (recompile) { - while (!completeStages.empty()) { - continueCompileFrom = completeStages.back(); - if (continueCompileFrom == ACL_TYPE_LLVMIR_BINARY || - continueCompileFrom == ACL_TYPE_DEFAULT) { - break; - } - completeStages.pop_back(); - } - } - } - return continueCompileFrom; - } - - bool HSAILProgram::saveBinaryAndSetType(type_t type) { - //Write binary to memory - void *rawBinary = NULL; - size_t size; - if (g_complibApi._aclWriteToMem(binaryElf_, &rawBinary, &size) - != ACL_SUCCESS) { - buildLog_ += "Failed to write binary to memory \n"; - return false; - } - 
clBinary()->saveBIFBinary((char*)rawBinary, size); - //Set the type of binary - setType(type); - //Free memory containing rawBinary - binaryElf_->binOpts.dealloc(rawBinary); - return true; - } - - bool HSAILProgram::linkImpl(const std::vector &inputPrograms, - amd::option::Options *options, - bool createLibrary) { - std::vector::const_iterator it - = inputPrograms.begin(); - std::vector::const_iterator itEnd - = inputPrograms.end(); - acl_error errorCode; - - // For each program we need to extract the LLVMIR and create - // aclBinary for each - std::vector binaries_to_link; - - for (size_t i = 0; it != itEnd; ++it, ++i) { - HSAILProgram *program = (HSAILProgram *)*it; - // Check if the program was created with clCreateProgramWIthBinary - binary_t binary = program->binary(); - if ((binary.first != NULL) && (binary.second > 0)) { - // Binary already exists -- we can also check if there is no - // opencl source code - // Need to check if LLVMIR exists in the binary - // If LLVMIR does not exist then is it valid - // We need to pull out all the compiled kernels - // We cannot do this at present because we need at least - // Hsail text to pull the kernels oout - void *mem = const_cast(binary.first); - binaryElf_ = g_complibApi._aclReadFromMem(mem, - binary.second, - &errorCode); - - if (errorCode != ACL_SUCCESS) { - LogWarning("Error while linking : Could not read from raw binary"); - return false; - } - } - // At this stage each HSAILProgram contains a valid binary_elf - // Check if LLVMIR is in the binary - size_t boolSize = sizeof(bool); - bool containsLLLVMIR = false; - errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, - RT_CONTAINS_LLVMIR, NULL, &containsLLLVMIR, &boolSize); - if (errorCode != ACL_SUCCESS || !containsLLLVMIR) { - buildLog_ +="Error while linking : Invalid binary (Missing LLVMIR section)"; - return false; - } - // Create a new aclBinary for each LLVMIR and save it in a list - aclBIFVersion ver = 
g_complibApi._aclBinaryVersion(binaryElf_); - aclBinary *bin = g_complibApi._aclCreateFromBinary(binaryElf_, ver); - binaries_to_link.push_back(bin); - } - - // At this stage each HSAILProgram in the list has an aclBinary initialized - // and contains LLVMIR - // We can now go ahead and link them. - if (binaries_to_link.size() > 1) { - errorCode = g_complibApi._aclLink(device().compiler(), - binaries_to_link[0], - binaries_to_link.size() - 1, - &binaries_to_link[1], - ACL_TYPE_LLVMIR_BINARY, - "-create-library", - NULL); - } - else { - errorCode = g_complibApi._aclLink(device().compiler(), - binaries_to_link[0], - 0, - NULL, - ACL_TYPE_LLVMIR_BINARY, - "-create-library", - NULL); - } - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Failed to link programs"; - return false; - } - // Store the newly linked aclBinary for this program. - binaryElf_ = binaries_to_link[0]; - // Free all the other aclBinaries - for (size_t i = 1; i < binaries_to_link.size(); i++) { - g_complibApi._aclBinaryFini(binaries_to_link[i]); - } - if (createLibrary) { - saveBinaryAndSetType(TYPE_LIBRARY); - return true; - } - - // Now call linkImpl with the new options - return linkImpl(options); - } - - bool HSAILProgram::initBrigModule() { - const char *symbol_name = "__BRIG__"; - BrigModuleHeader* brig; - acl_error error_code; - size_t size; - const void* symbol_data = g_complibApi._aclExtractSymbol( - device().compiler(), - binaryElf_, - &size, - aclBRIG, - symbol_name, - &error_code); - if (error_code != ACL_SUCCESS) { - std::string error = "Could not find Brig in BIF: "; - error += symbol_name; - LogError(error.c_str()); - buildLog_ += error; - return false; - } - brig = (BrigModuleHeader*)malloc(size); - memcpy(brig, symbol_data, size); - brigModule_ = brig; - return true; - } - void HSAILProgram::destroyBrigModule() { - if (brigModule_ != NULL) { - free(brigModule_); - } - } - bool HSAILProgram::initBrigContainer() { - assert(brigModule_ != NULL); - - //Create a BRIG container - 
hsaBrigContainer_ = new BrigContainer(brigModule_); - if (!hsaBrigContainer_) { - return false; - } - return true; - } - - void HSAILProgram::destroyBrigContainer() { - delete (hsaBrigContainer_); - } - - - void HSAILProgram::hsaError(const char *msg, hsa_status_t status) { - std::string fmsg; - fmsg += msg; - if (status != HSA_STATUS_SUCCESS) { - const char *hmsg = 0; - hsa_status_string(status, &hmsg); - if (hmsg) { - fmsg += ": "; - fmsg += hmsg; - } - } - LogError(fmsg.c_str()); - buildLog_ += fmsg; - } - - bool HSAILProgram::linkImpl(amd::option::Options *options) { - acl_error errorCode; - aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY; - bool finalize = true; - // If !binaryElf_ then program must have been created using clCreateProgramWithBinary - if (!binaryElf_) { - continueCompileFrom = getNextCompilationStageFromBinary(options); - } - switch (continueCompileFrom) { - // Compilation from ACL_TYPE_LLVMIR_BINARY to ACL_TYPE_CG in cases: - // 1. if the program is not created with binary; - // 2. if the program is created with binary and contains only .llvmir & .comment - // 3. if the program is created with binary, contains .llvmir, .comment, brig sections, - // but the binary's compile & link options differ from current ones (recompilation); - case ACL_TYPE_LLVMIR_BINARY: - // Compilation from ACL_TYPE_HSAIL_BINARY to ACL_TYPE_CG in cases: - // 1. if the program is created with binary and contains only brig sections - case ACL_TYPE_HSAIL_BINARY: - // Compilation from ACL_TYPE_HSAIL_TEXT to ACL_TYPE_CG in cases: - // 1. 
if the program is created with binary and contains only hsail text - case ACL_TYPE_HSAIL_TEXT: { - std::string curOptions = options->origOptionStr + hsailOptions(); - errorCode = g_complibApi._aclCompile(device().compiler(), binaryElf_, - curOptions.c_str(), continueCompileFrom, ACL_TYPE_CG, logFunction); - buildLog_ += g_complibApi._aclGetCompilerLog(device().compiler()); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error while BRIG Codegen phase: compilation error \n" ; - return false; - } - break; - } - case ACL_TYPE_CG: - break; - case ACL_TYPE_ISA: - finalize = false; - break; - default: - buildLog_ += "Error while BRIG Codegen phase: the binary is incomplete \n" ; - return false; - } - //Stop compilation if it is an offline device - HSA runtime does not - //support ISA compiled offline - if (!dev().isOnline()) { - return true; - } - - hsa_agent_t hsaDevice = dev().getBackendDevice(); - if (!initBrigModule()) { - hsaError("Failed to create Brig Module"); - return false; - } - - // Create a BrigContainer. - if (!initBrigContainer()) { - hsaError("Failed to create Brig Container"); - return false; - } - // Create a program. - hsa_status_t status = hsa_ext_program_create( - HSA_MACHINE_MODEL_LARGE, - HSA_PROFILE_FULL, - HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO, - NULL, - &hsaProgramHandle_ - ); - if (status != HSA_STATUS_SUCCESS) { - hsaError("Failed to create hsail program", status); - return false; - } - - // Add module to a program. - hsa_ext_module_t programModule = - reinterpret_cast(brigModule_); - status = hsa_ext_program_add_module( - hsaProgramHandle_, programModule - ); - if (status != HSA_STATUS_SUCCESS) { - hsaError("Failed to add a module to the program", status); - return false; - } - - // Obtain agent's Isa. 
- hsa_isa_t hsaDeviceIsa; - status = hsa_agent_get_info( - hsaDevice, HSA_AGENT_INFO_ISA, &hsaDeviceIsa - ); - if (status != HSA_STATUS_SUCCESS) { - hsaError("Failed to create hsail program", status); - return false; - } - - // Finalize a program. - hsa_ext_control_directives_t hsaControlDirectives; - memset(&hsaControlDirectives, 0, sizeof(hsa_ext_control_directives_t)); - status = hsa_ext_program_finalize( - hsaProgramHandle_, - hsaDeviceIsa, - 0, - hsaControlDirectives, - NULL, - HSA_CODE_OBJECT_TYPE_PROGRAM, - &hsaProgramCodeObject_ - ); - if (status != HSA_STATUS_SUCCESS) { - hsaError("Failed to finalize hsail program", status); - return false; - } - - // HLC always generates full profile - hsa_profile_t profile = HSA_PROFILE_FULL; - - // Create an executable. - status = hsa_executable_create( - profile, - HSA_EXECUTABLE_STATE_UNFROZEN, - "", - &hsaExecutable_ - ); - if (status != HSA_STATUS_SUCCESS) { - hsaError("Failed to create executable", status); - return false; - } - - // Load the code object. - status = hsa_executable_load_code_object( - hsaExecutable_, hsaDevice, hsaProgramCodeObject_, NULL - ); - if (status != HSA_STATUS_SUCCESS) { - hsaError("Failed to load code object", status); - return false; - } - - // Freeze the executable. - status = hsa_executable_freeze(hsaExecutable_, NULL); - if (status != HSA_STATUS_SUCCESS) { - hsaError("Failed to freeze executable", status); - return false; - } - - Code first_d = hsaBrigContainer_->code().begin(); - Code last_d = hsaBrigContainer_->code().end(); - //Iterate through the symbols using brig assembler - for (;first_d != last_d;first_d = first_d.next()) { - if (DirectiveExecutable de = first_d) { - // Disable function compilation unconditionally. - // TODO: May remove this after the finalizer supports function compilation. 
- if (DirectiveFunction df = first_d) { - continue; - } - - std::string kernelName = (SRef)de.name(); - if (de.linkage() != BRIG_LINKAGE_PROGRAM) { - kernelName.insert(0, "am::"); - } - // Query symbol handle for this symbol. - hsa_executable_symbol_t kernelSymbol; - status = hsa_executable_get_symbol( - hsaExecutable_, NULL, kernelName.c_str(), hsaDevice, 0, &kernelSymbol - ); - if (status != HSA_STATUS_SUCCESS) { - hsaError("Failed to get executable symbol", status); - return false; - } - - // Query code handle for this symbol. - uint64_t kernelCodeHandle; - status = hsa_executable_symbol_get_info( - kernelSymbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernelCodeHandle - ); - if (status != HSA_STATUS_SUCCESS) { - hsaError("Failed to get executable symbol info", status); - return false; - } - - std::string openclKernelName = kernelName; - // Strip the opencl and kernel name - kernelName = kernelName.substr(strlen("&__OpenCL_"), kernelName.size()); - kernelName = kernelName.substr(0,kernelName.size() - strlen("_kernel")); - aclMetadata md; - md.numHiddenKernelArgs = 0; - size_t sizeOfnumHiddenKernelArgs = sizeof(md.numHiddenKernelArgs); - errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_NUM_KERNEL_HIDDEN_ARGS, - openclKernelName.c_str(), &md.numHiddenKernelArgs, &sizeOfnumHiddenKernelArgs); - if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error while Finalization phase: Kernel extra arguments count querying from the ELF failed\n"; - return false; - } - - uint32_t workgroupGroupSegmentByteSize; - status = hsa_executable_symbol_get_info( - kernelSymbol, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, - &workgroupGroupSegmentByteSize); - if (status != HSA_STATUS_SUCCESS) { - hsaError("Failed to get group segment size info", status); - return false; - } - - uint32_t workitemPrivateSegmentByteSize; - status = hsa_executable_symbol_get_info( - kernelSymbol, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, - 
&workitemPrivateSegmentByteSize); - if (status != HSA_STATUS_SUCCESS) { - hsaError("Failed to get private segment size info", status); - return false; - } - - uint32_t kernargSegmentByteSize; - status = hsa_executable_symbol_get_info( - kernelSymbol, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, - &kernargSegmentByteSize); - if (status != HSA_STATUS_SUCCESS) { - hsaError("Failed to get kernarg segment size info", status); - return false; - } - - uint32_t kernargSegmentAlignment; - status = hsa_executable_symbol_get_info( - kernelSymbol, - HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, - &kernargSegmentAlignment); - if (status != HSA_STATUS_SUCCESS) { - hsaError("Failed to get kernarg segment alignment info", status); - return false; - } - - Kernel *aKernel = new roc::Kernel( - kernelName, - this, - kernelCodeHandle, - workgroupGroupSegmentByteSize, - workitemPrivateSegmentByteSize, - kernargSegmentByteSize, - kernargSegmentAlignment, - md.numHiddenKernelArgs - ); - if (!aKernel->init()) { - return false; - } - aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); - kernels()[kernelName] = aKernel; - } - } - saveBinaryAndSetType(TYPE_EXECUTABLE); - buildLog_ += g_complibApi._aclGetCompilerLog(device().compiler()); - return true; - } - - bool HSAILProgram::createBinary(amd::option::Options *options) { - return false; - } - - bool HSAILProgram::initClBinary() { - if (clBinary_ == NULL) { - clBinary_ = new ClBinary(static_cast(device())); - if (clBinary_ == NULL) { - return false; - } - } - return true; - } - - void HSAILProgram::releaseClBinary() { - if (clBinary_ != NULL) { - delete clBinary_; - clBinary_ = NULL; - } - } - - std::string HSAILProgram::hsailOptions() { - std::string hsailOptions; - //Set options for the standard device specific options - //This is just for legacy compiler code - // All our devices support these options now - hsailOptions.append(" -DFP_FAST_FMAF=1"); - hsailOptions.append(" 
-DFP_FAST_FMA=1"); - //TODO: this is a quick fix to restore original f32 denorm flushing - //Make this target/option dependent - hsailOptions.append(" -cl-denorms-are-zero"); - //TODO(sramalin) : Query the device for opencl version - // and only set if -cl-std wasn't specified in - // original build options (app) - //hsailOptions.append(" -cl-std=CL1.2"); - //check if the host is 64 bit or 32 bit - LP64_ONLY(hsailOptions.append(" -m64")); - //Now append each extension supported by the device - // one by one - std::string token; - std::istringstream iss(""); - iss.str(device().info().extensions_); - while (getline(iss, token, ' ')) { - if (!token.empty()) { - hsailOptions.append(" -D"); - hsailOptions.append(token); - hsailOptions.append("=1"); - } - } - return hsailOptions; - } - -#endif // WITHOUT_HSA_BACKEND -} // namespace hsa - diff --git a/projects/clr/rocclr/runtime/device/rocm/rocprogram.hpp b/projects/clr/rocclr/runtime/device/rocm/rocprogram.hpp deleted file mode 100644 index 877c64b240..0000000000 --- a/projects/clr/rocclr/runtime/device/rocm/rocprogram.hpp +++ /dev/null @@ -1,156 +0,0 @@ -// -// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. -// -#pragma once - -#ifndef WITHOUT_HSA_BACKEND - -#include "rocbinary.hpp" -#include "roccompilerlib.hpp" -#include "acl.h" -#include -#include -#include -#include -#include "rocdevice.hpp" -#include "HSAILItems.h" - -using namespace HSAIL_ASM; -//! \namespace roc HSA Device Implementation -namespace roc { - - //! \class empty program - class HSAILProgram : public device::Program - { - friend class ClBinary; - public: - //! Default constructor - HSAILProgram(roc::NullDevice& device); - //! Default destructor - ~HSAILProgram(); - - // Initialize Binary for GPU (used only for clCreateProgramWithBinary()). - virtual bool initClBinary(char *binaryIn, size_t size); - - //! 
Returns the aclBinary associated with the progrm - const aclBinary* binaryElf() const { - return static_cast(binaryElf_); } - - const std::string& HsailText() { - return hsailProgram_; - } - - const NullDevice& dev() const { return device_; } - //! Returns the hsaBinary associated with the progrm - hsa_agent_t hsaDevice() const { - return dev().getBackendDevice(); - } - - protected: - //! log and append to build log an error from runtime - void hsaError(const char *msg, hsa_status_t status = HSA_STATUS_SUCCESS); - - //! pre-compile setup for GPU - virtual bool initBuild(amd::option::Options* options); - - //! post-compile setup for GPU - virtual bool finiBuild(bool isBuildGood); - - /*! \brief Compiles GPU CL program to LLVM binary (compiler frontend) - * - * \return True if we successefully compiled a GPU program - */ - virtual bool compileImpl( - const std::string& sourceCode, //!< the program's source code - const std::vector& headers, - const char** headerIncludeNames, - amd::option::Options* options //!< compile options's object - ); - - /*! \brief Compiles LLVM binary to HSAIL code (compiler backend: link+opt+codegen) - * - * \return The build error code - */ - int compileBinaryToHSAIL( - amd::option::Options* options //!< options for compilation - ); - - - virtual bool linkImpl(amd::option::Options* options); - - //! Link the device programs. - virtual bool linkImpl (const std::vector& inputPrograms, - amd::option::Options* options, - bool createLibrary); - - virtual bool createBinary(amd::option::Options* options); - - //! Initialize Binary - virtual bool initClBinary(); - - //! Release the Binary - virtual void releaseClBinary(); - - virtual const aclTargetInfo & info(const char * str = ""){ - return info_; - } - - virtual bool isElf(const char* bin) const { - return amd::isElfMagic(bin); - //return false; - } - - //! 
Returns the binary - // This should ensure that the binary is updated with all the kernels - // ClBinary& clBinary() { return binary_; } - ClBinary* clBinary() { - return static_cast(device::Program::clBinary()); - } - const ClBinary* clBinary() const { - return static_cast(device::Program::clBinary()); - } - private: - /* \brief Returns the next stage to compile from, based on sections in binary, - * also returns completeStages in a vector, which contains at least ACL_TYPE_DEFAULT, - * sets needOptionsCheck to true if options check is needed to decide whether or not to recompile - */ - aclType getCompilationStagesFromBinary(std::vector& completeStages, bool& needOptionsCheck); - - /* \brief Returns the next stage to compile from, based on sections and options in binary - */ - aclType getNextCompilationStageFromBinary(amd::option::Options* options); - bool saveBinaryAndSetType(type_t type); - bool initBrigContainer(); - void destroyBrigContainer(); - //Initializes BRIG module - bool initBrigModule(); - void destroyBrigModule(); - //! Disable default copy constructor - HSAILProgram(const HSAILProgram&); - - //! Disable operator= - HSAILProgram& operator=(const HSAILProgram&); - - //! Returns all the options to be appended while passing to the - //compiler library - std::string hsailOptions(); - - std::string openCLSource_; //!< Original OpenCL source - std::string hsailProgram_; //!< HSAIL program after compilation. - std::string llvmBinary_; //!< LLVM IR binary code - //!< aclBinary and aclCompiler - for the compiler libray - aclBinary* binaryElf_; //! 
-#include -#include - -/** -* HSA image object size in bytes (see HSAIL spec) -*/ -#define HSA_IMAGE_OBJECT_SIZE 48 - -/** -* HSA image object alignment in bytes (see HSAIL spec) -*/ -#define HSA_IMAGE_OBJECT_ALIGNMENT 16 - -/** -* HSA sampler object size in bytes (see HSAIL spec) -*/ -#define HSA_SAMPLER_OBJECT_SIZE 32 - -/** -* HSA sampler object alignment in bytes (see HSAIL spec) -*/ -#define HSA_SAMPLER_OBJECT_ALIGNMENT 16 - -namespace roc { -// (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) invalidates I, K and L1 -// (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE) invalidates L1, L2 and flushes L2 - -static const uint16_t kDispatchPacketHeaderNoSync = - (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - -static const uint16_t kDispatchPacketHeader = - (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - -static const uint16_t kBarrierPacketHeader = - (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - -static const uint16_t kBarrierPacketAcquireHeader = - (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - -static const uint16_t kBarrierPacketReleaseHeader = - (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | - (1 << HSA_PACKET_HEADER_BARRIER) | - (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | - 
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - -static const hsa_barrier_and_packet_t kBarrierAcquirePacket = - {kBarrierPacketAcquireHeader,0,0,0,0,0,0,0,0,0}; - -static const hsa_barrier_and_packet_t kBarrierReleasePacket = - {kBarrierPacketReleaseHeader,0,0,0,0,0,0,0,0,0}; - -double Timestamp::ticksToTime_=0; - -/** -* Set the ocl correlation handle (essentially the cl_event handle) -* to correlate the cl kernel launch and HSA kernel dispatch -*/ -typedef hsa_status_t - (*hsa_ext_tools_set_correlation_handle)(const hsa_agent_t agent, - void *correlation_handle); -static void SetOclCorrelationHandle(void *tools_lib, const hsa_agent_t agent, void *handle) { - hsa_ext_tools_set_correlation_handle func = - (hsa_ext_tools_set_correlation_handle)Os::getSymbol(tools_lib, "hsa_ext_tools_set_correlation_handler"); - if (func) { - func(agent, handle); - } - - return; -} - -bool -VirtualGPU::MemoryDependency::create(size_t numMemObj) -{ - if (numMemObj > 0) { - // Allocate the array of memory objects for dependency tracking - memObjectsInQueue_ = new MemoryState[numMemObj]; - if (NULL == memObjectsInQueue_) { - return false; - } - memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj); - maxMemObjectsInQueue_ = numMemObj; - } - - return true; -} - -void -VirtualGPU::MemoryDependency::validate( - VirtualGPU& gpu, - const Memory* memory, - bool readOnly) -{ - bool flushL1Cache = false; - - if (maxMemObjectsInQueue_ == 0) { - // Sync AQL packets - gpu.setAqlHeader(kDispatchPacketHeader); - return; - } - - uint64_t curStart = reinterpret_cast(memory->getDeviceMemory()); - uint64_t curEnd = curStart + memory->size(); - - // Loop through all memory objects in the queue and find dependency - // @note don't include objects from the current kernel - for (size_t j = 0; j < endMemObjectsInQueue_; ++j) { - // Check if the queue already contains this mem object and - // GPU operations aren't readonly - uint64_t busyStart = memObjectsInQueue_[j].start_; - 
uint64_t busyEnd = memObjectsInQueue_[j].end_; - - // Check if the start inside the busy region - if ((((curStart >= busyStart) && (curStart < busyEnd)) || - // Check if the end inside the busy region - ((curEnd > busyStart) && (curEnd <= busyEnd)) || - // Check if the start/end cover the busy region - ((curStart <= busyStart) && (curEnd >= busyEnd))) && - // If the buys region was written or the current one is for write - (!memObjectsInQueue_[j].readOnly_ || !readOnly)) { - flushL1Cache = true; - break; - } - } - - // Did we reach the limit? - if (maxMemObjectsInQueue_ <= (numMemObjectsInQueue_ + 1)) { - flushL1Cache = true; - } - - if (flushL1Cache) { - // Sync AQL packets - gpu.setAqlHeader(kDispatchPacketHeader); - - // Clear memory dependency state - const static bool All = true; - clear(!All); - } - - // Insert current memory object into the queue always, - // since runtime calls flush before kernel execution and it has to keep - // current kernel in tracking - memObjectsInQueue_ - [numMemObjectsInQueue_].start_ = curStart; - memObjectsInQueue_ - [numMemObjectsInQueue_].end_ = curEnd; - memObjectsInQueue_ - [numMemObjectsInQueue_].readOnly_ = readOnly; - numMemObjectsInQueue_++; -} - -void -VirtualGPU::MemoryDependency::clear(bool all) -{ - if (numMemObjectsInQueue_ > 0) { - size_t i, j; - if (all) { - endMemObjectsInQueue_ = numMemObjectsInQueue_; - } - - // Preserve all objects from the current kernel - for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) { - memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_; - memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_; - memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_; - } - // Clear all objects except current kernel - memset(&memObjectsInQueue_[i], 0, sizeof(amd::Memory*) * numMemObjectsInQueue_); - numMemObjectsInQueue_ -= endMemObjectsInQueue_; - endMemObjectsInQueue_ = 0; - } -} - -bool -VirtualGPU::processMemObjects( - const amd::Kernel& kernel, - 
const_address params) -{ - static const bool NoAlias = true; - const Kernel& hsaKernel = static_cast - (*(kernel.getDeviceKernel(dev(), NoAlias))); - const amd::KernelSignature& signature = kernel.signature(); - const amd::KernelParameters& kernelParams = kernel.parameters(); - - // AQL packets - setAqlHeader(kDispatchPacketHeaderNoSync); - - // Mark the tracker with a new kernel, - // so we can avoid checks of the aliased objects - memoryDependency().newKernel(); - - bool deviceSupportFGS = 0 != dev().isFineGrainedSystem(true); - bool supportFineGrainedSystem = deviceSupportFGS; - FGSStatus status = kernelParams.getSvmSystemPointersSupport(); - switch (status) { - case FGS_YES: - if (!deviceSupportFGS) { - return false; - } - supportFineGrainedSystem = true; - break; - case FGS_NO: - supportFineGrainedSystem = false; - break; - case FGS_DEFAULT: - default: - break; - } - - size_t count = kernelParams.getNumberOfSvmPtr(); - size_t execInfoOffset = kernelParams.getExecInfoOffset(); - bool sync = true; - - amd::Memory* memory = NULL; - //get svm non arugment information - void* const* svmPtrArray = - reinterpret_cast(params + execInfoOffset); - for (size_t i = 0; i < count; i++) { - memory = amd::SvmManager::FindSvmBuffer(svmPtrArray[i]); - if (NULL == memory) { - if (!supportFineGrainedSystem) { - return false; - } - else if (sync) { - // Sync AQL packets - setAqlHeader(kDispatchPacketHeader); - // Clear memory dependency state - const static bool All = true; - memoryDependency().clear(!All); - continue; - } - } - else { - Memory* gpuMemory = static_cast(memory->getDeviceMemory(dev())); - if (NULL != gpuMemory) { - const static bool IsReadOnly = false; - // Validate SVM passed in the non argument list - memoryDependency().validate(*this, gpuMemory, IsReadOnly); - } - else { - return false; - } - } - } - - // Check all parameters for the current kernel - for (size_t i = 0; i < signature.numParameters(); ++i) { - const amd::KernelParameterDescriptor& desc = 
signature.at(i); - const HsailKernelArg* arg = hsaKernel.hsailArgAt(i); - Memory* memory = NULL; - bool readOnly = false; - amd::Memory* svmMem = NULL; - - // Find if current argument is a buffer - if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) { - if (kernelParams.boundToSvmPointer(dev(), params, i)) { - svmMem = amd::SvmManager::FindSvmBuffer( - *reinterpret_cast(params + desc.offset_)); - if (!svmMem) { - // Sync AQL packets - setAqlHeader(kDispatchPacketHeader); - // Clear memory dependency state - const static bool All = true; - memoryDependency().clear(!All); - continue; - } - } - - if (*reinterpret_cast - (params + desc.offset_) != NULL) { - if (NULL == svmMem) { - memory = static_cast((*reinterpret_cast - (params + desc.offset_))->getDeviceMemory(dev())); - } - else { - memory = static_cast(svmMem->getDeviceMemory(dev())); - } - } - - if (memory != NULL) { - // Check image - readOnly = (desc.accessQualifier_ == - CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false; - // Check buffer - readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false; - // Validate memory for a dependency in the queue - memoryDependency().validate(*this, memory, readOnly); - } - } - } - - return true; -} - -template -bool -VirtualGPU::dispatchGenericAqlPacket( - AqlPacket* packet, - bool blocking) -{ - const uint32_t queueSize = gpu_queue_->size; - const uint32_t queueMask = queueSize - 1; - - //Check for queue full and wait if needed. - uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_); - uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_); - hsa_signal_t signal; - - // TODO: placeholder to setup the kernel to populate start and end timestamp. 
- if (timestamp_ != nullptr) { - // Find signal slot - ProfilingSignal* profilingSignal = &signal_pool_[index & queueMask]; - // Make sure we save the old results in the TS structure - if (profilingSignal->ts_ != nullptr) { - profilingSignal->ts_->checkGpuTime(); - } - // Update the new TS with the signal info - timestamp_->setProfilingSignal(profilingSignal); - packet->completion_signal = profilingSignal->signal_; - profilingSignal->ts_ = timestamp_; - timestamp_->setAgent(gpu_device_); - } - - if ((index - read) == queueMask) { - if (packet->completion_signal.handle == 0) { - packet->completion_signal = barrier_signal_; - } - signal = packet->completion_signal; - // Initialize signal for a wait - hsa_signal_store_relaxed(signal, InitSignalValue); - blocking = true; - } - - //Insert packet - ((AqlPacket*)( - gpu_queue_->base_address))[index & queueMask] = *packet; - hsa_queue_store_write_index_release(gpu_queue_, index + 1); - hsa_signal_store_relaxed(gpu_queue_->doorbell_signal, index); - - //Wait on signal ? 
- if (blocking) { - if (hsa_signal_wait_acquire( - signal, HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1), - HSA_WAIT_STATE_BLOCKED) != 0) { - LogPrintfError("Failed signal [0x%lx] wait", signal.handle); - return false; - } - - // Release the pool, since runtime just drained the entire queue - resetKernArgPool(); - } - - return true; -} - -bool -VirtualGPU::dispatchAqlPacket( - hsa_kernel_dispatch_packet_t* packet, - bool blocking) -{ - return dispatchGenericAqlPacket(packet, blocking); -} - -bool -VirtualGPU::dispatchAqlPacket( - hsa_barrier_and_packet_t* packet, - bool blocking) -{ - return dispatchGenericAqlPacket(packet, blocking); -} - -void -VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) -{ - assert(packet->completion_signal.handle != 0); - const uint32_t queueSize = gpu_queue_->size; - const uint32_t queueMask = queueSize - 1; - - uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_); - ((hsa_barrier_and_packet_t*)( - gpu_queue_->base_address))[index&queueMask] = *packet; - - hsa_queue_store_write_index_relaxed(gpu_queue_, index + 1); - - hsa_signal_store_relaxed(gpu_queue_->doorbell_signal, index); -} - -/** - * @brief Waits on an outstanding kernel without regard to how - * it was dispatched - with or without a signal - * - * @return bool true if Wait returned successfully, false - * otherwise - */ -bool VirtualGPU::releaseGpuMemoryFence() { - // Return if there is no pending dispatch - if (!hasPendingDispatch_) { - return false; - } - - // Initialize signal for the barrier packet. - hsa_signal_store_relaxed(barrier_signal_, InitSignalValue); - - // Dispatch barrier packet into the queue and wait till it finishes. 
- dispatchBarrierPacket(&barrier_packet_); - if (hsa_signal_wait_acquire( - barrier_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1), - HSA_WAIT_STATE_BLOCKED) != 0) { - LogError("Barrier packet submission failed"); - return false; - } - - hasPendingDispatch_ = false; - - // Release all memory dependencies - memoryDependency().clear(); - - // Release the pool, since runtime just completed a barrier - resetKernArgPool(); - - return true; -} - -VirtualGPU::VirtualGPU(Device &device) - : device::VirtualDevice(device) - , roc_device_(device) -{ - gpu_device_ = device.getBackendDevice(); - // Initialize the last signal and dispatch flags - timestamp_ = NULL; - hasPendingDispatch_ = false; - tools_lib_ = NULL; - - kernarg_pool_base_ = NULL; - kernarg_pool_size_ = 0; - kernarg_pool_cur_offset_ = 0; - aqlHeader_ = kDispatchPacketHeaderNoSync; - barrier_signal_.handle = 0; -} - -VirtualGPU::~VirtualGPU() -{ - if (timestamp_ != NULL) { - delete timestamp_; - timestamp_ = NULL; - LogError("There was a timestamp that was not used; deleting."); - } - if (printfdbg_ != NULL){ - delete printfdbg_; - printfdbg_ = NULL; - } - - tools_lib_ = NULL; -} - -bool -VirtualGPU::create(bool profilingEna) -{ - // Set the event handle to the tools lib if the env var - // Load the library using its advertised "soname" - std::string lib_name = Os::getEnvironment("HSA_TOOLS_LIB"); - if (lib_name != "") { -#if defined(_WIN32) || defined(__CYGWIN__) - const char *tools_lib_name = "hsa-runtime-tools" LP64_SWITCH("", "64") ".dll"; -#else - const char *tools_lib_name = "libhsa-runtime-tools" LP64_SWITCH("", "64") ".so.1"; -#endif - tools_lib_ = Os::loadLibrary(tools_lib_name); - } - - uint32_t queue_max_packets = 0; - if (HSA_STATUS_SUCCESS != - hsa_agent_get_info( - gpu_device_, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_max_packets)) { - return false; - } - - //Pick a reasonable queue size - uint32_t queue_size = 1024; - queue_size = (queue_max_packets < queue_size) ? 
queue_max_packets : queue_size; - while (hsa_queue_create(gpu_device_, - queue_size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT_MAX, UINT_MAX, - &gpu_queue_) != HSA_STATUS_SUCCESS) { - queue_size >>= 1; - if (queue_size < 64) { - return false; - } - } - - if (!initPool(dev().settings().kernargPoolSize_, (profilingEna) ? queue_size : 0)) { - LogError("Couldn't allocate arguments/signals for the queue"); - return false; - } - - device::BlitManager::Setup blitSetup; - blitMgr_ = new KernelBlitManager(*this, blitSetup); - if ((NULL == blitMgr_) || !blitMgr_->create(roc_device_)) { - LogError("Could not create BlitManager!"); - return false; - } - - // Create signal for the barrier packet. - hsa_signal_t signal = { 0 }; - if (HSA_STATUS_SUCCESS != - hsa_signal_create(InitSignalValue, 0, NULL, &signal)) { - return false; - } - barrier_signal_ = signal; - - // Initialize barrier packet. - memset(&barrier_packet_, 0, sizeof(barrier_packet_)); - barrier_packet_.header = kBarrierPacketHeader; - barrier_packet_.completion_signal = barrier_signal_; - - // Create a object of PrintfDbg - printfdbg_ = new PrintfDbg(roc_device_); - if (NULL == printfdbg_) { - LogError("\nCould not create printfDbg Object!"); - return false; - } - - // Initialize timestamp conversion factor - if (Timestamp::getGpuTicksToTime() == 0) { - uint64_t frequency; - hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &frequency); - Timestamp::setGpuTicksToTime(1e9/double(frequency)); - } - - if (!memoryDependency().create(GPU_NUM_MEM_DEPENDENCY)) { - LogError("Could not create the array of memory objects!"); - return false; - } - - return true; -} - -bool -VirtualGPU::terminate() -{ - delete blitMgr_; - - // Release the resources of signal - releaseGpuMemoryFence(); - hsa_status_t err = hsa_queue_destroy(gpu_queue_); - if (err != HSA_STATUS_SUCCESS) { - return false; - } - - if (barrier_signal_.handle != 0) { - hsa_signal_destroy(barrier_signal_); - } - - if (tools_lib_) { - Os::unloadLibrary(tools_lib_); 
- tools_lib_ = NULL; - } - - destroyPool(); - - return true; -} - -bool -VirtualGPU::initPool(size_t kernarg_pool_size, uint signal_pool_count) -{ - kernarg_pool_size_ = kernarg_pool_size; - kernarg_pool_base_ = reinterpret_cast( - roc_device_.hostAlloc(kernarg_pool_size_, 1, true)); - if (kernarg_pool_base_ == nullptr) { - return false; - } - - if (signal_pool_count != 0) { - signal_pool_.resize(signal_pool_count); - for (uint i = 0; i < signal_pool_count; ++i) { - ProfilingSignal profilingSignal; - if (HSA_STATUS_SUCCESS != hsa_signal_create( - 0, 0, nullptr, &profilingSignal.signal_)) { - return false; - } - signal_pool_[i] = profilingSignal; - } - } - - return true; -} - -void -VirtualGPU::destroyPool() { - if (kernarg_pool_base_ != nullptr) { - roc_device_.hostFree(kernarg_pool_base_, kernarg_pool_size_); - } - - if (signal_pool_.size() > 0) { - for (uint i = 0; i < signal_pool_.size(); ++i) { - hsa_signal_destroy(signal_pool_[i].signal_); - } - } -} - -void* -VirtualGPU::allocKernArg(size_t size, size_t alignment) -{ - char* result = nullptr; - do { - result = alignUp(kernarg_pool_base_ + kernarg_pool_cur_offset_, alignment); - const size_t pool_new_usage = (result + size) - kernarg_pool_base_; - if (pool_new_usage <= kernarg_pool_size_) { - kernarg_pool_cur_offset_ = pool_new_usage; - return result; - } - else { - //! We run out of the arguments space! - //! That means the app didn't call clFlush/clFinish for very long time. - //! We can issue a barrier to avoid expensive extra memory allocations. - - // Initialize signal for the barrier packet. - hsa_signal_store_relaxed(barrier_signal_, InitSignalValue); - - // Dispatch barrier packet into the queue and wait till it finishes. 
- dispatchBarrierPacket(&barrier_packet_); - if (hsa_signal_wait_acquire( - barrier_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1), - HSA_WAIT_STATE_BLOCKED) != 0) { - LogError("Kernel arguments reset failed"); - } - - resetKernArgPool(); - } - } while (true); - - return result; -} - -/* profilingBegin, when profiling is enabled, creates a timestamp to save in -* virtualgpu's timestamp_, and calls start() to get the current host -* timestamp. -*/ -void VirtualGPU::profilingBegin(amd::Command &command, bool drmProfiling) -{ - if (command.profilingInfo().enabled_) { - if (timestamp_ != NULL) { - LogWarning("Trying to create a second timestamp in VirtualGPU. \ - This could have unintended consequences."); - return; - } - timestamp_ = new Timestamp; - timestamp_->start(); - } -} - -/* profilingEnd, when profiling is enabled, checks to see if a signal was -* created for whatever command we are running and calls end() to get the -* current host timestamp if no signal is available. It then saves the pointer -* timestamp_ to the command's data. -*/ -void VirtualGPU::profilingEnd(amd::Command &command) -{ - if (command.profilingInfo().enabled_) { - if (timestamp_->getProfilingSignal() == nullptr) { - timestamp_->end(); - } - command.setData(reinterpret_cast(timestamp_)); - timestamp_ = NULL; - } -} - -struct DestroySampler : public std::binary_function { - bool operator() (hsa_ext_sampler_t &sampler, - hsa_agent_t agent) const { - hsa_status_t status = hsa_ext_sampler_destroy(agent, sampler); - return status == HSA_STATUS_SUCCESS; - } -}; - -void VirtualGPU::updateCommandsState(amd::Command *list) -{ - Timestamp *ts = NULL; - - amd::Command* current = list; - amd::Command* next = NULL; - - if (current == NULL) { - return; - } - - uint64_t endTimeStamp = 0; - uint64_t startTimeStamp = endTimeStamp; - - if (current->profilingInfo().enabled_) { - // TODO: use GPU timestamp when available. 
- endTimeStamp = amd::Os::timeNanos(); - startTimeStamp = endTimeStamp; - - // This block gets the first valid timestamp from the first command - // that has one. This timestamp is used below to mark any command that - // came before it to start and end with this first valid start time. - current = list; - while (current != NULL) { - if (current->data() != NULL) { - ts = reinterpret_cast(current->data()); - startTimeStamp = ts->getStart(); - endTimeStamp = ts->getStart(); - break; - } - current = current->getNext(); - } - } - - // Iterate through the list of commands, and set timestamps as appropriate - // Note, if a command does not have a timestamp, it does one of two things: - // - if the command (without a timestamp), A, precedes another command, C, - // that _does_ contain a valid timestamp, command A will set RUNNING and - // COMPLETE with the RUNNING (start) timestamp from command C. This would - // also be true for command B, which is between A and C. These timestamps - // are actually retrieved in the block above (startTimeStamp, endTimeStamp). - // - if the command (without a timestamp), C, follows another command, A, - // that has a valid timestamp, command C will be set RUNNING and COMPLETE - // with the COMPLETE (end) timestamp of the previous command, A. This is - // also true for any command B, which falls between A and C. - current = list; - while (current != NULL) { - if (current->profilingInfo().enabled_) { - if (current->data() != NULL) { - // Since this is a valid command to get a timestamp, we use the - // timestamp provided by the runtime (saved in the data()) - ts = reinterpret_cast(current->data()); - startTimeStamp = ts->getStart(); - endTimeStamp = ts->getEnd(); - delete ts; - current->setData(NULL); - } - else { - // If we don't have a command that contains a valid timestamp, - // we simply use the end timestamp of the previous command. 
- // Note, if this is a command before the first valid timestamp, - // this will be equal to the start timestamp of the first valid - // timestamp at this point. - startTimeStamp = endTimeStamp; - } - } - - if (current->status() == CL_SUBMITTED) { - current->setStatus(CL_RUNNING, startTimeStamp); - current->setStatus(CL_COMPLETE, endTimeStamp); - } - else if (current->status() != CL_COMPLETE) { - LogPrintfError("Unexpected command status - %d.", current->status()); - } - - next = current->getNext(); - current->release(); - current = next; - } - - // Release the sampler handles allocated for the various - // on one or more kernel submissions - std::for_each(samplerList_.begin(), - samplerList_.end(), - std::bind2nd(DestroySampler(), gpu_device_)); - samplerList_.clear(); - - return; -} - -void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand &cmd) -{ - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - - profilingBegin(cmd); - - size_t offset = 0; - // Find if virtual address is a CL allocation - device::Memory* hostMemory = dev().findMemoryFromVA(cmd.destination(), &offset); - - device::Memory *devMem = cmd.source().getDeviceMemory(dev()); - void *dst = cmd.destination(); - amd::Coord3D size = cmd.size(); - - //! @todo: add multi-devices synchronization when supported. 
- - cl_command_type type = cmd.type(); - bool result = false; - bool imageBuffer = false; - - // Force buffer read for IMAGE1D_BUFFER - if ((type == CL_COMMAND_READ_IMAGE) && - (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - type = CL_COMMAND_READ_BUFFER; - imageBuffer = true; - } - - switch (type) { - case CL_COMMAND_READ_BUFFER: { - amd::Coord3D origin(cmd.origin()[0]); - if (imageBuffer) { - size_t elemSize = - cmd.source().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - if (hostMemory != nullptr) { - // Accelerated transfer without pinning - amd::Coord3D dstOrigin(offset); - result = blitMgr().copyBuffer(*devMem, *hostMemory, - origin, dstOrigin, size, cmd.isEntireMemory()); - } - else { - result = blitMgr().readBuffer( - *devMem, dst, origin, size, - cmd.isEntireMemory()); - } - break; - } - case CL_COMMAND_READ_BUFFER_RECT: { - result = blitMgr().readBufferRect( - *devMem, dst, cmd.bufRect(), cmd.hostRect(), size, - cmd.isEntireMemory()); - break; - } - case CL_COMMAND_READ_IMAGE: { - result = blitMgr().readImage( - *devMem, dst, cmd.origin(), size, cmd.rowPitch(), - cmd.slicePitch(), cmd.isEntireMemory()); - break; - } - default: - ShouldNotReachHere(); - break; - } - - if (!result) { - LogError("submitReadMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); - } - - profilingEnd(cmd); -} - -void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand &cmd) -{ - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - - profilingBegin(cmd); - - size_t offset = 0; - // Find if virtual address is a CL allocation - device::Memory* hostMemory = dev().findMemoryFromVA(cmd.source(), &offset); - - device::Memory *devMem = cmd.destination().getDeviceMemory(dev()); - const char *src = static_cast(cmd.source()); - amd::Coord3D size = cmd.size(); - - //! @todo add multi-devices synchronization when supported. 
- - cl_command_type type = cmd.type(); - bool result = false; - bool imageBuffer = false; - - // Force buffer write for IMAGE1D_BUFFER - if ((type == CL_COMMAND_WRITE_IMAGE) && - (cmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - type = CL_COMMAND_WRITE_BUFFER; - imageBuffer = true; - } - - switch (type) { - case CL_COMMAND_WRITE_BUFFER: { - amd::Coord3D origin(cmd.origin()[0]); - if (imageBuffer) { - size_t elemSize = - cmd.destination().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - if (hostMemory != nullptr) { - // Accelerated transfer without pinning - amd::Coord3D srcOrigin(offset); - result = blitMgr().copyBuffer(*hostMemory, *devMem, - srcOrigin, origin, size, cmd.isEntireMemory()); - } - else { - result = blitMgr().writeBuffer( - src, *devMem , origin, size, - cmd.isEntireMemory()); - } - break; - } - case CL_COMMAND_WRITE_BUFFER_RECT: { - result = blitMgr().writeBufferRect( - src, *devMem, cmd.hostRect(), cmd.bufRect(), size, - cmd.isEntireMemory()); - break; - } - case CL_COMMAND_WRITE_IMAGE: { - result = blitMgr().writeImage( - src, *devMem, cmd.origin(), size, cmd.rowPitch(), - cmd.slicePitch(), cmd.isEntireMemory()); - break; - } - default: - ShouldNotReachHere(); - break; - } - - if (!result) { - LogError("submitWriteMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); - } - else { - cmd.destination().signalWrite(&dev()); - } - - profilingEnd(cmd); -} - -void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) -{ - // in-order semantics: previous commands need to be done before we start - releaseGpuMemoryFence(); - - profilingBegin(cmd); - const std::vector& svmPointers = cmd.svmPointers(); - if (cmd.pfnFreeFunc() == NULL) { - // pointers allocated using clSVMAlloc - for (cl_uint i = 0; i < svmPointers.size(); i++) { - amd::SvmBuffer::free(cmd.context(), svmPointers[i]); - } - } - else { - cmd.pfnFreeFunc()(as_cl(cmd.queue()->asCommandQueue()), svmPointers.size(), 
- (void**) (&(svmPointers[0])), cmd.userData()); - } - profilingEnd(cmd); -} - -void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) -{ - // in-order semantics: previous commands need to be done before we start - releaseGpuMemoryFence(); - profilingBegin(cmd); - amd::SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1); - profilingEnd(cmd); -} - -void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) -{ - // in-order semantics: previous commands need to be done before we start - releaseGpuMemoryFence(); - profilingBegin(cmd); - SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times()); - profilingEnd(cmd); -} - -void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand &cmd) -{ - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - - profilingBegin(cmd); - - device::Memory *srcDevMem = cmd.source().getDeviceMemory(dev()); - device::Memory *destDevMem = cmd.destination().getDeviceMemory(dev()); - amd::Coord3D size = cmd.size(); - - //! @todo add multi-devices synchronization when supported. 
- - cl_command_type type = cmd.type(); - bool result = false; - bool srcImageBuffer = false; - bool dstImageBuffer = false; - - // Force buffer copy for IMAGE1D_BUFFER - if (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) { - srcImageBuffer = true; - type = CL_COMMAND_COPY_BUFFER; - } - if (cmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) { - dstImageBuffer = true; - type = CL_COMMAND_COPY_BUFFER; - } - - switch (cmd.type()) { - case CL_COMMAND_COPY_BUFFER: { - amd::Coord3D srcOrigin(cmd.srcOrigin()[0]); - amd::Coord3D dstOrigin(cmd.dstOrigin()[0]); - - if (srcImageBuffer) { - const size_t elemSize = - cmd.source().asImage()->getImageFormat().getElementSize(); - srcOrigin.c[0] *= elemSize; - if (dstImageBuffer) { - dstOrigin.c[0] *= elemSize; - } - size.c[0] *= elemSize; - } - else if (dstImageBuffer) { - const size_t elemSize = - cmd.destination().asImage()->getImageFormat().getElementSize(); - dstOrigin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - - result = blitMgr().copyBuffer( - *srcDevMem, *destDevMem, srcOrigin, - dstOrigin, size, cmd.isEntireMemory()); - break; - } - case CL_COMMAND_COPY_BUFFER_RECT: { - result = blitMgr().copyBufferRect( - *srcDevMem, *destDevMem, cmd.srcRect(), - cmd.dstRect(), size, cmd.isEntireMemory()); - break; - } - case CL_COMMAND_COPY_IMAGE: { - result = blitMgr().copyImage( - *srcDevMem, *destDevMem, cmd.srcOrigin(), - cmd.dstOrigin(), size, cmd.isEntireMemory()); - break; - } - case CL_COMMAND_COPY_IMAGE_TO_BUFFER: { - result = blitMgr().copyImageToBuffer( - *srcDevMem, *destDevMem, cmd.srcOrigin(), - cmd.dstOrigin(), size, cmd.isEntireMemory()); - break; - } - case CL_COMMAND_COPY_BUFFER_TO_IMAGE: { - result = blitMgr().copyBufferToImage( - *srcDevMem, *destDevMem, cmd.srcOrigin(), - cmd.dstOrigin(), size, cmd.isEntireMemory()); - break; - } - default: - ShouldNotReachHere(); - break; - } - - if (!result) { - LogError("submitCopyMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); - } - - 
cmd.destination().signalWrite(&dev()); - - profilingEnd(cmd); -} - -void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) -{ - // No fence is needed since this is a no-op: the - // command will be completed only after all the - // previous commands are complete - profilingBegin(cmd); - profilingEnd(cmd); -} - -void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) -{ - // No fence is needed since this is a no-op: the - // command will be completed only after all the - // previous commands are complete - profilingBegin(cmd); - profilingEnd(cmd); -} - -void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd) -{ - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - - profilingBegin(cmd); - - //! @todo add multi-devices synchronization when supported. - - roc::Memory *devMemory = reinterpret_cast( - cmd.memory().getDeviceMemory(dev(), false)); - - cl_command_type type = cmd.type(); - bool imageBuffer = false; - - // Save map requirement. - cl_map_flags mapFlag = cmd.mapFlags(); - - // Treat no map flag as read-write. - if (mapFlag == 0) { - mapFlag = CL_MAP_READ | CL_MAP_WRITE; - } - - devMemory->saveMapInfo(cmd.mapPtr(), cmd.origin(), cmd.size(), - mapFlag, cmd.isEntireMemory()); - - // Sync to the map target. - if (devMemory->isHostMemDirectAccess()) { - // Add memory to VA cache, so rutnime can detect direct access to VA - dev().addVACache(devMemory); - } - if ((!devMemory->isHostMemDirectAccess()) && - (mapFlag & (CL_MAP_READ | CL_MAP_WRITE))) { - bool result = false; - roc::Memory *hsaMemory = static_cast(devMemory); - - amd::Memory* mapMemory = hsaMemory->mapMemory(); - void *hostPtr = mapMemory == NULL ? 
- hsaMemory->owner()->getHostMem() : - mapMemory->getHostMem(); - - if (type == CL_COMMAND_MAP_BUFFER) { - amd::Coord3D origin(cmd.origin()[0]); - amd::Coord3D size(cmd.size()[0]); - amd::Coord3D dstOrigin(cmd.origin()[0], 0, 0); - if (imageBuffer) { - size_t elemSize = - cmd.memory().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - - if (mapMemory != nullptr) { - roc::Memory *hsaMapMemory = static_cast( - mapMemory->getDeviceMemory(dev(), false)); - result = blitMgr().copyBuffer(*hsaMemory, - *hsaMapMemory, origin, dstOrigin, - size, cmd.isEntireMemory()); - } - else { - result = blitMgr().readBuffer( - *hsaMemory, static_cast(hostPtr)+origin[0], - origin, size, cmd.isEntireMemory()); - } - - } - else if (type == CL_COMMAND_MAP_IMAGE) { - amd::Image* image = cmd.memory().asImage(); - result = blitMgr().readImage( - *hsaMemory, hostPtr, amd::Coord3D(0), - image->getRegion(), image->getRowPitch(), - image->getSlicePitch(), true); - } - else { - ShouldNotReachHere(); - } - - if (!result) { - LogError("submitMapMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); - } - } - - profilingEnd(cmd); -} - -void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand &cmd) -{ - roc::Memory* devMemory = static_cast( - cmd.memory().getDeviceMemory(dev(), false)); - - const device::Memory::WriteMapInfo* mapInfo = - devMemory->writeMapInfo(cmd.mapPtr()); - if (nullptr == mapInfo) { - LogError("Unmap without map call"); - return; - } - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - profilingBegin(cmd); - - // Force buffer write for IMAGE1D_BUFFER - bool imageBuffer = (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER); - - if (devMemory->isHostMemDirectAccess()) { - // Remove memory from VA cache - dev().removeVACache(devMemory); - } - if (mapInfo->isUnmapWrite()) { - // Commit the changes made by the user. 
- if (!devMemory->isHostMemDirectAccess()) { - bool result = false; - - if (cmd.memory().asImage() && !imageBuffer) { - amd::Image *image = cmd.memory().asImage(); - result = blitMgr().writeImage( - cmd.mapPtr(), *devMemory, - mapInfo->origin_, - mapInfo->region_, - image->getRowPitch(), image->getSlicePitch()); - } - else { - amd::Coord3D origin(mapInfo->origin_[0]); - amd::Coord3D size(mapInfo->region_[0]); - if (imageBuffer) { - size_t elemSize = - cmd.memory().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - } - if (devMemory->mapMemory() != nullptr) { - roc::Memory *mapMemory = static_cast( - devMemory->mapMemory()->getDeviceMemory(dev(), false)); - - result = blitMgr().copyBuffer( - *mapMemory, *devMemory, - mapInfo->origin_, - mapInfo->origin_, - mapInfo->region_, - mapInfo->isEntire()); - } - else { - result = blitMgr().writeBuffer( - cmd.mapPtr(), *devMemory, origin, size); - } - } - if (!result) { - LogError("submitMapMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); - } - } - - cmd.memory().signalWrite(&dev()); - } - - devMemory->clearUnmapInfo(cmd.mapPtr()); - - profilingEnd(cmd); -} - -void VirtualGPU::submitFillMemory(amd::FillMemoryCommand &cmd) -{ - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - - profilingBegin(cmd); - - device::Memory *devMemory = cmd.memory().getDeviceMemory(dev(), false); - - //! @todo add multi-devices synchronization when supported. 
- - cl_command_type type = cmd.type(); - bool result = false; - bool imageBuffer = false; - float fillValue[4]; - - // Force fill buffer for IMAGE1D_BUFFER - if ((type == CL_COMMAND_FILL_IMAGE) && - (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - type = CL_COMMAND_FILL_BUFFER; - imageBuffer = true; - } - - // Find the the right fill operation - switch (type) { - case CL_COMMAND_FILL_BUFFER: { - const void* pattern = cmd.pattern(); - size_t patternSize = cmd.patternSize(); - amd::Coord3D origin(cmd.origin()[0]); - amd::Coord3D size(cmd.size()[0]); - // Reprogram fill parameters if it's an IMAGE1D_BUFFER object - if (imageBuffer) { - size_t elemSize = - cmd.memory().asImage()->getImageFormat().getElementSize(); - origin.c[0] *= elemSize; - size.c[0] *= elemSize; - memset(fillValue, 0, sizeof(fillValue)); - cmd.memory().asImage()->getImageFormat().formatColor(pattern, - fillValue); - pattern = fillValue; - patternSize = elemSize; - } - result = blitMgr().fillBuffer( - *devMemory, pattern, patternSize, origin, size, - cmd.isEntireMemory()); - break; - } - case CL_COMMAND_FILL_IMAGE: { - result = blitMgr().fillImage( - *devMemory, cmd.pattern(), cmd.origin(), cmd.size(), - cmd.isEntireMemory()); - break; - } - default: - ShouldNotReachHere(); - break; - } - - if (!result) { - LogError("submitFillMemory failed!"); - cmd.setStatus(CL_OUT_OF_RESOURCES); - } - - cmd.memory().signalWrite(&dev()); - - profilingEnd(cmd); -} - -void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand &vcmd) -{ - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - - profilingBegin(vcmd); - - std::vector::const_iterator itr; - - for (itr = vcmd.memObjects().begin(); - itr != vcmd.memObjects().end(); - itr++) { - // Find device memory - device::Memory *m = (*itr)->getDeviceMemory(dev()); - roc::Memory *memory = static_cast(m); - - if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) { - //! @todo revisit this when multi devices is supported. 
- } else if (vcmd.migrationFlags() & - CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) { - //! @todo revisit this when multi devices is supported. - } else { - LogWarning("Unknown operation for memory migration!"); - } - } - - profilingEnd(vcmd); -} - -/*! \brief Writes to the buffer and incrememts the write pointer to the - * buffer. Also, ensures that the argument is written to an - * aligned memory as specified - * - * @param dst The write pointer to the buffer - * @param src The source pointer - * @param size The size in bytes to copy - * @param alignment The alignment to follow while writing to the buffer - */ -static void -addArg(unsigned char** dst, const void* src, - size_t size, uint32_t alignment) -{ - *dst = amd::alignUp(*dst, alignment); - memcpy(*dst, src, size); - *dst += size; -} - -static inline void -addArg(unsigned char** dst, const void* src, size_t size) -{ - assert(size < UINT32_MAX); - addArg(dst, src, size, size); -} - - //Over rides the workgroup size fields in the packet with runtime/compiler set sizes - void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket, - amd::NDRangeContainer sizes, - const size_t* compile_size, - const roc::Device &dev){ - //Todo (sramalin) need to check if compile_size is set to 0 if dimension is not valid - // else this error check is incorrect - if (compile_size[0] || compile_size[1] || compile_size[2]) { - dispatchPacket.workgroup_size_x = sizes.dimensions()>0 ? compile_size[0] : 1; - dispatchPacket.workgroup_size_y = sizes.dimensions()>1 ? compile_size[1] : 1; - dispatchPacket.workgroup_size_z = sizes.dimensions()>2 ? 
compile_size[2] : 1; - } - else { - //Runtime must set the group size - dispatchPacket.workgroup_size_x = 1; - dispatchPacket.workgroup_size_y = 1; - dispatchPacket.workgroup_size_z = 1; - - if (sizes.dimensions() == 1) { - dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize_; - } - else if (sizes.dimensions() == 2) { - dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize2DX_; - dispatchPacket.workgroup_size_y = dev.settings().maxWorkGroupSize2DY_; - } - else if (sizes.dimensions() == 3) { - dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize3DX_; - dispatchPacket.workgroup_size_y = dev.settings().maxWorkGroupSize3DY_; - dispatchPacket.workgroup_size_z = dev.settings().maxWorkGroupSize3DZ_; - } - } -} - - static void -fillSampleDescriptor( - hsa_ext_sampler_descriptor_t& samplerDescriptor, - const amd::Sampler& sampler) - { - samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST ? - HSA_EXT_SAMPLER_FILTER_MODE_NEAREST : HSA_EXT_SAMPLER_FILTER_MODE_LINEAR; - samplerDescriptor.coordinate_mode = sampler.normalizedCoords() ? 
- HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED : - HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED; - switch (sampler.addressingMode()) { - case CL_ADDRESS_CLAMP_TO_EDGE: - samplerDescriptor.address_mode = - HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; - break; - case CL_ADDRESS_REPEAT: - samplerDescriptor.address_mode = - HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT; - break; - case CL_ADDRESS_CLAMP: - samplerDescriptor.address_mode = - HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER; - break; - case CL_ADDRESS_MIRRORED_REPEAT: - samplerDescriptor.address_mode = - HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT; - break; - case CL_ADDRESS_NONE: - samplerDescriptor.address_mode = - HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED; - break; - default: - return; - } - } - -bool -VirtualGPU::submitKernelInternal( - const amd::NDRangeContainer& sizes, - const amd::Kernel& kernel, - const_address parameters, - void *eventHandle) -{ - if (tools_lib_) { - SetOclCorrelationHandle(tools_lib_, this->gpu_device_, eventHandle); - } - - device::Kernel *devKernel = const_cast - (kernel.getDeviceKernel(dev())); - Kernel &gpuKernel = static_cast(*devKernel); - const size_t compilerLdsUsage = gpuKernel.WorkgroupGroupSegmentByteSize(); - size_t ldsUsage = compilerLdsUsage; - - // Check memory dependency and SVM objects - if (!processMemObjects(kernel, parameters)) { - LogError("Wrong memory objects!"); - return false; - } - - // Init PrintfDbg object if printf is enabled. - bool printfEnabled = (gpuKernel.printfInfo().size() > 0) ? 
true : false; - if (!printfDbg()->init(printfEnabled)){ - LogError("\nPrintfDbg object initialization failed!"); - return false; - } - - // Allocate buffer to hold kernel arguments - address argBuffer = - (address)allocKernArg(gpuKernel.KernargSegmentByteSize(), - gpuKernel.KernargSegmentAlignment()); - - if (argBuffer == NULL) { - LogError("Out of memory"); - return false; - } - - address argPtr = argBuffer; - - // The HLC generates Kernenv arguments, first 3 are global offsets. - const uint extraAargs = ((roc::Kernel*)devKernel)->extraArgumentsNum(); - for (uint j = 0; j < extraAargs; ++j) { - // The 4th parameter is the pointer to print buffer - if (3 == j) { - address bufferPtr = printfDbg()->dbgBuffer(); - addArg(&argPtr, &bufferPtr, sizeof(void*)); - }else { - const size_t offset = j < sizes.dimensions() ? sizes.offset()[j] : 0; - addArg(&argPtr, &offset, sizeof(void*)); //Should be uint32_t for small model and uint64_t for large! - } - } - - const amd::KernelSignature& signature = kernel.signature(); - const amd::KernelParameters& kernelParams = kernel.parameters(); - - // Find all parameters for the current kernel - for (uint i = 0; i != signature.numParameters(); ++i) { - const HsailKernelArg* arg = gpuKernel.hsailArgAt(i); - const_address srcArgPtr = parameters + signature.at(i).offset_; - - if (arg->type_ == HSAIL_ARGTYPE_POINTER ) { - const size_t size = sizeof(void*); - if (arg->addrQual_ == HSAIL_ADDRESS_LOCAL) { - ldsUsage = amd::alignUp(ldsUsage, arg->alignment_); //!< do we need this? 
- addArg(&argPtr, &ldsUsage, size); - ldsUsage += *reinterpret_cast(srcArgPtr); - continue; - } - assert((arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) && - "Unsupported address qualifier"); - if (kernelParams.boundToSvmPointer(dev(), parameters, i)) { - addArg(&argPtr, srcArgPtr, size); - continue; - } - amd::Memory* mem = *reinterpret_cast(srcArgPtr); - if (mem == NULL) { - addArg(&argPtr, srcArgPtr, size); - continue; - } - - Memory *devMem = static_cast(mem->getDeviceMemory(dev())); - //! @todo add multi-devices synchronization when supported. - void* globalAddress = devMem->getDeviceMemory(); - addArg(&argPtr, &globalAddress, size); - - //! @todo Compiler has to return read/write attributes - const cl_mem_flags flags = mem->getMemFlags(); - if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) { - mem->signalWrite(&dev()); - } - } - else if (arg->type_ == HSAIL_ARGTYPE_VALUE) { - if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) { - void *mem = allocKernArg(arg->size_, arg->alignment_); - if (mem == NULL) { - LogError("Out of memory"); - return false; - } - memcpy(mem, srcArgPtr, arg->size_); - addArg(&argPtr, &mem, sizeof(void*)); - continue; - } - for (uint e = 0; e < arg->numElem_; ++e) { - addArg(&argPtr, srcArgPtr, arg->size_); - srcArgPtr += arg->size_; - } - } - else if (arg->type_ == HSAIL_ARGTYPE_IMAGE) { - amd::Memory* mem = *reinterpret_cast(srcArgPtr); - Image* image = static_cast(mem->getDeviceMemory(dev())); - if (image == NULL) { - LogError("Kernel image argument is not an image object"); - return false; - } - - if (dev().settings().enableImageHandle_) { - const uint64_t image_srd = image->getHsaImageObject().handle; - assert(amd::isMultipleOf(image_srd, sizeof(image_srd))); - addArg(&argPtr, &image_srd, sizeof(image_srd)); - } - else { - // Image arguments are of size 48 bytes and are aligned to 16 bytes - addArg(&argPtr, (void *)image->getHsaImageObject().handle, - HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT); - } - - //! 
@todo Compiler has to return read/write attributes - const cl_mem_flags flags = mem->getMemFlags(); - if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) { - mem->signalWrite(&dev()); - } - } - else if (arg->type_ == HSAIL_ARGTYPE_SAMPLER) { - amd::Sampler* sampler = *reinterpret_cast(srcArgPtr); - if (sampler == NULL) { - LogError("Kernel sampler argument is not an sampler object"); - return false; - } - - hsa_ext_sampler_descriptor_t samplerDescriptor; - fillSampleDescriptor(samplerDescriptor, *sampler); - - hsa_ext_sampler_t hsa_sampler; - hsa_status_t status = hsa_ext_sampler_create(dev().getBackendDevice(), - &samplerDescriptor, &hsa_sampler); - if (status != HSA_STATUS_SUCCESS) { - LogError("Error creating device sampler object!"); - return false; - } - - if (dev().settings().enableImageHandle_) { - uint64_t sampler_srd = hsa_sampler.handle; - addArg(&argPtr, &sampler_srd, sizeof(sampler_srd)); - samplerList_.push_back(hsa_sampler); - // TODO: destroy sampler. - } - else { - argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT); - - memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE); - argPtr += HSA_SAMPLER_OBJECT_SIZE; - hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler); - } - } - } - - // Check there is no arguments' buffer overflow - assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize()); - - // Check for group memory overflow - //! 
@todo Check should be in HSA - here we should have at most an assert - assert(roc_device_.info().localMemSizePerCU_ > 0); - if (ldsUsage > roc_device_.info().localMemSizePerCU_) { - LogError("No local memory available\n"); - return false; - } - - //Initialize the dispatch Packet - hsa_kernel_dispatch_packet_t dispatchPacket; - memset(&dispatchPacket, 0, sizeof(dispatchPacket)); - - dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle(); - - dispatchPacket.header = aqlHeader_; - dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - dispatchPacket.grid_size_x = sizes.dimensions()>0 ? sizes.global()[0] : 1; - dispatchPacket.grid_size_y = sizes.dimensions()>1 ? sizes.global()[1] : 1; - dispatchPacket.grid_size_z = sizes.dimensions()>2 ? sizes.global()[2] : 1; - - const size_t* compile_size = devKernel->workGroupInfo()->compileSize_; - if (sizes.local().product() != 0) { - dispatchPacket.workgroup_size_x = sizes.dimensions()>0 ? sizes.local()[0] : 1; - dispatchPacket.workgroup_size_y = sizes.dimensions()>1 ? sizes.local()[1] : 1; - dispatchPacket.workgroup_size_z = sizes.dimensions()>2 ? sizes.local()[2] : 1; - } else { - setRuntimeCompilerLocalSize(dispatchPacket, sizes, compile_size, dev()); - } - dispatchPacket.kernarg_address = argBuffer; - dispatchPacket.group_segment_size = ldsUsage; - dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_; - - //Dispatch the packet - if (!dispatchAqlPacket(&dispatchPacket, false)){ - return false; - } - - // Mark the flag indicating if a dispatch is outstanding. - // We are not waiting after every dispatch. - hasPendingDispatch_ = true; - - // Output printf buffer - if(!printfDbg()->output(*this, printfEnabled, gpuKernel.printfInfo())){ - LogError("\nCould not print data from the printf buffer!"); - return false; - } - return true; -} -/** - * @brief Api to dispatch a kernel for execution. 
The implementation - * parses the input object, an instance of virtual command to obtain - * the parameters of global size, work group size, offsets of work - * items, enable/disable profiling, etc. - * - * It also parses the kernel arguments buffer to inject into Hsa Runtime - * the list of kernel parameters. - */ -void VirtualGPU::submitKernel(amd::NDRangeKernelCommand &vcmd) { - profilingBegin(vcmd); - - // Submit kernel to HW - if (!submitKernelInternal( - vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), - static_cast(as_cl(&vcmd.event())))) { - LogError("AQL dispatch failed!"); - vcmd.setStatus(CL_INVALID_OPERATION); - } - - profilingEnd(vcmd); -} - -void VirtualGPU::submitNativeFn(amd::NativeFnCommand &cmd) { - // std::cout<<__FUNCTION__<<" not implemented"<<"*********"<signal_, &time); - start_ = time.start * ticksToTime_; - end_ = time.end * ticksToTime_; - profilingSignal_->ts_ = nullptr; - profilingSignal_ = nullptr; - } - } - - // Start a timestamp (get timestamp from OS) - void start() { - start_ = amd::Os::timeNanos(); - } - - // End a timestamp (get timestamp from OS) - void end() { - end_ = amd::Os::timeNanos(); - } - - static void setGpuTicksToTime(double ticksToTime) { ticksToTime_=ticksToTime; } - static double getGpuTicksToTime() { return ticksToTime_; } -}; - -class VirtualGPU : public device::VirtualDevice { -public: - //! Initial signal value - static const hsa_signal_value_t InitSignalValue = 1; - - class MemoryDependency : public amd::EmbeddedObject - { - public: - //! Default constructor - MemoryDependency() - : memObjectsInQueue_(NULL) - , numMemObjectsInQueue_(0) - , maxMemObjectsInQueue_(0) {} - - ~MemoryDependency() { delete [] memObjectsInQueue_; } - - //! Creates memory dependecy structure - bool create(size_t numMemObj); - - //! Notify the tracker about new kernel - void newKernel() { endMemObjectsInQueue_ = numMemObjectsInQueue_; } - - //! 
Validates memory object on dependency - void validate(VirtualGPU& gpu, const Memory* memory, bool readOnly); - - //! Clear memory dependency - void clear(bool all = true); - - private: - struct MemoryState { - uint64_t start_; //! Busy memory start address - uint64_t end_; //! Busy memory end address - bool readOnly_; //! Current GPU state in the queue - }; - - MemoryState* memObjectsInQueue_; //!< Memory object state in the queue - size_t endMemObjectsInQueue_; //!< End of mem objects in the queue - size_t numMemObjectsInQueue_; //!< Number of mem objects in the queue - size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue - }; - - VirtualGPU(Device &device); - ~VirtualGPU(); - - bool create(bool profilingEna); - bool terminate(); - const Device& dev() const { return roc_device_; } - - void profilingBegin(amd::Command &command, bool drmProfiling = false); - void profilingEnd(amd::Command &command); - - void updateCommandsState(amd::Command* list); - - void submitReadMemory(amd::ReadMemoryCommand& cmd); - void submitWriteMemory(amd::WriteMemoryCommand& cmd); - void submitCopyMemory(amd::CopyMemoryCommand& cmd); - void submitMapMemory(amd::MapMemoryCommand& cmd); - void submitUnmapMemory(amd::UnmapMemoryCommand& cmd); - void submitKernel(amd::NDRangeKernelCommand& cmd); - bool submitKernelInternal( - const amd::NDRangeContainer& sizes, //!< Workload sizes - const amd::Kernel& kernel, //!< Kernel for execution - const_address parameters, //!< Parameters for the kernel - void *event_handle //!< Handle to OCL event for debugging - ); - void submitNativeFn(amd::NativeFnCommand& cmd); - void submitMarker(amd::Marker& cmd); - - void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd); - void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd); - void submitPerfCounter(amd::PerfCounterCommand& cmd){}; - - void flush(amd::Command* list = NULL, bool wait = false); - void submitFillMemory(amd::FillMemoryCommand& cmd); - void 
submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd); - -// { roc OpenCL integration -// Added these stub (no-ops) implementation of pure virtual methods, -// when integrating HSA and OpenCL branches. -// TODO: After inegration, whoever is working on VirtualGPU should write -// actual implemention. - virtual void submitSignal(amd::SignalCommand &cmd) {} - virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand &cmd) {} - - virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd); - virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd); - virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd); - virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd); - virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd); - - void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand &cmd) {} - void submitThreadTrace(amd::ThreadTraceCommand &vcmd) {} - - /** - * @brief Waits on an outstanding kernel without regard to how - * it was dispatched - with or without a signal - * - * @return bool true if Wait returned successfully, false - * otherwise - */ - bool releaseGpuMemoryFence(); - - hsa_agent_t gpu_device() { return gpu_device_; } - hsa_queue_t* gpu_queue() { return gpu_queue_; } - - // Return pointer to PrintfDbg - PrintfDbg* printfDbg() const {return printfdbg_;} - - //! Returns memory dependency class - MemoryDependency& memoryDependency() { return memoryDependency_; } - - //! 
Detects memory dependency for HSAIL kernels and uses appropriate AQL header - bool processMemObjects( - const amd::Kernel& kernel, //!< AMD kernel object for execution - const_address params //!< Pointer to the param's store - ); - -// } roc OpenCL integration -private: - bool dispatchAqlPacket( - hsa_kernel_dispatch_packet_t* packet, bool blocking = true); - bool dispatchAqlPacket( - hsa_barrier_and_packet_t* packet, bool blocking = true); - template bool dispatchGenericAqlPacket( - AqlPacket* packet, bool blocking); - void dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet); - void initializeDispatchPacket(hsa_kernel_dispatch_packet_t* packet, - amd::NDRangeContainer& sizes); - - bool initPool(size_t kernarg_pool_size, uint signal_pool_count); - void destroyPool(); - - void* allocKernArg(size_t size, size_t alignment); - void resetKernArgPool() { kernarg_pool_cur_offset_ = 0; } - - //! Updates AQL header for the upcomming dispatch - void setAqlHeader(uint16_t header) { aqlHeader_ = header; } - - /** - * @brief Maintains the list of sampler allocated for one or more kernel - * submissions. - */ - std::vector samplerList_; - - /** - * @brief Indicates if a kernel dispatch is outstanding. This flag is - * used to synchronized on kernel outputs. - */ - bool hasPendingDispatch_; - Timestamp* timestamp_; - hsa_agent_t gpu_device_; //!< Physical device - hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu - hsa_barrier_and_packet_t barrier_packet_; - hsa_signal_t barrier_signal_; - uint32_t dispatch_id_; //!< This variable must be updated atomically. - Device& roc_device_; //!< roc device object - void * tools_lib_; - PrintfDbg* printfdbg_; - MemoryDependency memoryDependency_; //!< Memory dependency class - uint16_t aqlHeader_; //!< AQL header for dispatch - - char* kernarg_pool_base_; - size_t kernarg_pool_size_; - uint kernarg_pool_cur_offset_; - - std::vector signal_pool_; //!< Pool of signals for profiling - - friend class Timestamp; -}; -} -