From 8f923254c8d328aef39e06ee6c38ec2cccb8b1dc Mon Sep 17 00:00:00 2001
From: foreman
Date: Thu, 21 Jul 2016 12:34:56 -0400
Subject: [PATCH] P4 to Git Change 1294021 by lmoriche@lmoriche_opencl_dev on
2016/07/21 12:28:39
SWDEV-94640 - Back out CL#1293210:
[OCL-LC-ROCm] OpenCL Runtime Library Implements OpenCL runtime API. Add HSA virtual device to ORCA.
- Rename hsa_foundation to ROCm.
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/Makefile#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/build/Makefile#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/build/Makefile.oclrocm#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/build/wNow64a/Makefile#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/mesa_glinterop.h#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocappprofile.cpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocappprofile.hpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocbinary.hpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.cpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.hpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/roccompiler.cpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/roccompilerlib.cpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/roccompilerlib.hpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdefs.hpp#3 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#3 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocglinterop.cpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocglinterop.hpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.hpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprintf.cpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprintf.hpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.cpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.hpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocregisters.hpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.hpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#2 delete
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#2 delete
[ROCm/clr commit: 355257ed719fe6a3152a3688cbc4c4abca0d9605]
---
.../runtime/device/rocm/mesa_glinterop.h | 271 ---
.../runtime/device/rocm/rocappprofile.cpp | 61 -
.../runtime/device/rocm/rocappprofile.hpp | 23 -
.../rocclr/runtime/device/rocm/rocbinary.hpp | 51 -
.../rocclr/runtime/device/rocm/rocblit.cpp | 1532 ---------------
.../rocclr/runtime/device/rocm/rocblit.hpp | 412 ----
.../runtime/device/rocm/roccompiler.cpp | 160 --
.../runtime/device/rocm/roccompilerlib.cpp | 59 -
.../runtime/device/rocm/roccompilerlib.hpp | 77 -
.../rocclr/runtime/device/rocm/rocdefs.hpp | 49 -
.../rocclr/runtime/device/rocm/rocdevice.cpp | 1296 -------------
.../rocclr/runtime/device/rocm/rocdevice.hpp | 376 ----
.../runtime/device/rocm/rocglinterop.cpp | 120 --
.../runtime/device/rocm/rocglinterop.hpp | 158 --
.../rocclr/runtime/device/rocm/rockernel.cpp | 680 -------
.../rocclr/runtime/device/rocm/rockernel.hpp | 195 --
.../rocclr/runtime/device/rocm/rocmemory.cpp | 783 --------
.../rocclr/runtime/device/rocm/rocmemory.hpp | 188 --
.../rocclr/runtime/device/rocm/rocprintf.cpp | 467 -----
.../rocclr/runtime/device/rocm/rocprintf.hpp | 115 --
.../rocclr/runtime/device/rocm/rocprogram.cpp | 845 --------
.../rocclr/runtime/device/rocm/rocprogram.hpp | 156 --
.../runtime/device/rocm/rocregisters.hpp | 198 --
.../runtime/device/rocm/rocsettings.cpp | 100 -
.../runtime/device/rocm/rocsettings.hpp | 69 -
.../rocclr/runtime/device/rocm/rocvirtual.cpp | 1721 -----------------
.../rocclr/runtime/device/rocm/rocvirtual.hpp | 251 ---
27 files changed, 10413 deletions(-)
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/mesa_glinterop.h
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocappprofile.cpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocappprofile.hpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocbinary.hpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocblit.cpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocblit.hpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/roccompiler.cpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/roccompilerlib.cpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/roccompilerlib.hpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocglinterop.cpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocglinterop.hpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rockernel.hpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocprintf.cpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocprintf.hpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocprogram.hpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocregisters.hpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
delete mode 100644 projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
diff --git a/projects/clr/rocclr/runtime/device/rocm/mesa_glinterop.h b/projects/clr/rocclr/runtime/device/rocm/mesa_glinterop.h
deleted file mode 100644
index 0b9cb4e9f7..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/mesa_glinterop.h
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright 2016 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/* Mesa OpenGL inter-driver interoperability interface designed for but not
- * limited to OpenCL.
- *
- * This is a driver-agnostic, backward-compatible interface. The structures
- * are only allowed to grow. They can never shrink and their members can
- * never be removed, renamed, or redefined.
- *
- * The interface doesn't return a lot of static texture parameters like
- * width, height, etc. It mainly returns mutable buffer and texture view
- * parameters that can't be part of the texture allocation (because they are
- * mutable). If drivers want to return more data or want to return static
- * allocation parameters, they can do it in one of these two ways:
- * - attaching the data to the DMABUF handle in a driver-specific way
- * - passing the data via "out_driver_data" in the "in" structure.
- *
- * Mesa is expected to do a lot of error checking on behalf of OpenCL, such
- * as checking the target, miplevel, and texture completeness.
- *
- * OpenCL, on the other hand, needs to check if the display+context combo
- * is compatible with the OpenCL driver by querying the device information.
- * It also needs to check if the texture internal format and channel ordering
- * (returned in a driver-specific way) is supported by OpenCL, among other
- * things.
- */
-
-#ifndef MESA_GLINTEROP_H
-#define MESA_GLINTEROP_H
-
-#include
-
-#if !defined(MESA_GLINTEROP_NO_GLX)
-#include
-#include
-#else
-#include
-#include
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define MESA_GLINTEROP_VERSION 1
-
-/** Returned error codes. */
-enum {
- MESA_GLINTEROP_SUCCESS = 0,
- MESA_GLINTEROP_OUT_OF_RESOURCES,
- MESA_GLINTEROP_OUT_OF_HOST_MEMORY,
- MESA_GLINTEROP_INVALID_OPERATION,
- MESA_GLINTEROP_INVALID_VALUE,
- MESA_GLINTEROP_INVALID_DISPLAY,
- MESA_GLINTEROP_INVALID_CONTEXT,
- MESA_GLINTEROP_INVALID_TARGET,
- MESA_GLINTEROP_INVALID_OBJECT,
- MESA_GLINTEROP_INVALID_MIP_LEVEL,
- MESA_GLINTEROP_UNSUPPORTED
-};
-
-/** Access flags. */
-enum {
- MESA_GLINTEROP_ACCESS_READ_WRITE = 0,
- MESA_GLINTEROP_ACCESS_READ_ONLY,
- MESA_GLINTEROP_ACCESS_WRITE_ONLY
-};
-
-
-/**
- * Device information returned by Mesa.
- */
-typedef struct _mesa_glinterop_device_info {
- uint32_t size; /* size of this structure */
-
- /* PCI location */
- uint32_t pci_segment_group;
- uint32_t pci_bus;
- uint32_t pci_device;
- uint32_t pci_function;
-
- /* Device identification */
- uint32_t vendor_id;
- uint32_t device_id;
-} mesa_glinterop_device_info;
-
-
-/**
- * Input parameters to Mesa interop export functions.
- */
-typedef struct _mesa_glinterop_export_in {
- uint32_t size; /* size of this structure */
-
- /* One of the following:
- * - GL_TEXTURE_BUFFER
- * - GL_TEXTURE_1D
- * - GL_TEXTURE_2D
- * - GL_TEXTURE_3D
- * - GL_TEXTURE_RECTANGLE
- * - GL_TEXTURE_1D_ARRAY
- * - GL_TEXTURE_2D_ARRAY
- * - GL_TEXTURE_CUBE_MAP_ARRAY
- * - GL_TEXTURE_CUBE_MAP
- * - GL_TEXTURE_CUBE_MAP_POSITIVE_X
- * - GL_TEXTURE_CUBE_MAP_NEGATIVE_X
- * - GL_TEXTURE_CUBE_MAP_POSITIVE_Y
- * - GL_TEXTURE_CUBE_MAP_NEGATIVE_Y
- * - GL_TEXTURE_CUBE_MAP_POSITIVE_Z
- * - GL_TEXTURE_CUBE_MAP_NEGATIVE_Z
- * - GL_TEXTURE_2D_MULTISAMPLE
- * - GL_TEXTURE_2D_MULTISAMPLE_ARRAY
- * - GL_TEXTURE_EXTERNAL_OES
- * - GL_RENDERBUFFER
- * - GL_ARRAY_BUFFER
- */
- GLenum target;
-
- /* If target is GL_ARRAY_BUFFER, it's a buffer object.
- * If target is GL_RENDERBUFFER, it's a renderbuffer object.
- * If target is GL_TEXTURE_*, it's a texture object.
- */
- GLuint obj;
-
- /* Mipmap level. Ignored for non-texture objects. */
- GLuint miplevel;
-
- /* One of MESA_GLINTEROP_ACCESS_* flags. This describes how the exported
- * object is going to be used.
- */
- uint32_t access;
-
- /* Size of memory pointed to by out_driver_data. */
- uint32_t out_driver_data_size;
-
- /* If the caller wants to query driver-specific data about the OpenGL
- * object, this should point to the memory where that data will be stored.
- */
- void *out_driver_data;
-} mesa_glinterop_export_in;
-
-
-/**
- * Outputs of Mesa interop export functions.
- */
-typedef struct _mesa_glinterop_export_out {
- uint32_t size; /* size of this structure */
-
- /* The DMABUF handle. It must be closed by the caller using the POSIX
- * close() function when it's not needed anymore. Mesa is not responsible
- * for closing the handle.
- *
- * Not closing the handle by the caller will lead to a resource leak,
- * prevents releasing the GPU buffer, and may prevent creating new DMABUF
- * handles until the process termination.
- */
- int dmabuf_fd;
-
- /* The mutable OpenGL internal format specified by glTextureView or
- * glTexBuffer. If the object is not one of those, the original internal
- * format specified by glTexStorage, glTexImage, or glRenderbufferStorage
- * will be returned.
- */
- GLenum internalformat;
-
- /* Parameters specified by glTexBufferRange for GL_TEXTURE_BUFFER. */
- GLintptr buf_offset;
- GLsizeiptr buf_size;
-
- /* Parameters specified by glTextureView. If the object is not a texture
- * view, default parameters covering the whole texture will be returned.
- */
- GLuint view_minlevel;
- GLuint view_numlevels;
- GLuint view_minlayer;
- GLuint view_numlayers;
-} mesa_glinterop_export_out;
-
-#if !defined(MESA_GLINTEROP_NO_GLX)
-/**
- * Query device information.
- *
- * \param dpy GLX display
- * \param context GLX context
- * \param out where to return the information
- *
- * \return MESA_GLINTEROP_SUCCESS or MESA_GLINTEROP_* != 0 on error
- */
-GLAPI int GLAPIENTRY
-MesaGLInteropGLXQueryDeviceInfo(Display *dpy, GLXContext context,
- mesa_glinterop_device_info *out);
-#endif
-
-/**
- * Same as MesaGLInteropGLXQueryDeviceInfo except that it accepts EGLDisplay
- * and EGLContext.
- */
-GLAPI int GLAPIENTRY
-MesaGLInteropEGLQueryDeviceInfo(EGLDisplay dpy, EGLContext context,
- mesa_glinterop_device_info *out);
-
-
-#if !defined(MESA_GLINTEROP_NO_GLX)
-/**
- * Create and return a DMABUF handle corresponding to the given OpenGL
- * object, and return other parameters about the OpenGL object.
- *
- * \param dpy GLX display
- * \param context GLX context
- * \param in input parameters
- * \param out return values
- *
- * \return MESA_GLINTEROP_SUCCESS or MESA_GLINTEROP_* != 0 on error
- */
-GLAPI int GLAPIENTRY
-MesaGLInteropGLXExportObject(Display *dpy, GLXContext context,
- mesa_glinterop_export_in *in,
- mesa_glinterop_export_out *out);
-#endif
-
-/**
- * Same as MesaGLInteropGLXExportObject except that it accepts
- * EGLDisplay and EGLContext.
- */
-GLAPI int GLAPIENTRY
-MesaGLInteropEGLExportObject(EGLDisplay dpy, EGLContext context,
- mesa_glinterop_export_in *in,
- mesa_glinterop_export_out *out);
-
-
-#if !defined(MESA_GLINTEROP_NO_GLX)
-typedef int (APIENTRYP PFNMESAGLINTEROPGLXQUERYDEVICEINFOPROC)(Display *dpy, GLXContext context,
- mesa_glinterop_device_info *out);
-#endif
-typedef int (APIENTRYP PFNMESAGLINTEROPEGLQUERYDEVICEINFOPROC)(EGLDisplay dpy, EGLContext context,
- mesa_glinterop_device_info *out);
-#if !defined(MESA_GLINTEROP_NO_GLX)
-typedef int (APIENTRYP PFNMESAGLINTEROPGLXEXPORTOBJECTPROC)(Display *dpy, GLXContext context,
- mesa_glinterop_export_in *in,
- mesa_glinterop_export_out *out);
-#endif
-typedef int (APIENTRYP PFNMESAGLINTEROPEGLEXPORTOBJECTPROC)(EGLDisplay dpy, EGLContext context,
- mesa_glinterop_export_in *in,
- mesa_glinterop_export_out *out);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* MESA_GLINTEROP_H */
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocappprofile.cpp b/projects/clr/rocclr/runtime/device/rocm/rocappprofile.cpp
deleted file mode 100644
index 0e0ac15113..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rocappprofile.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-//
-// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved.
-//
-
-
-#ifndef WITHOUT_HSA_BACKEND
-
-#include "top.hpp"
-#include "device/device.hpp"
-#include "device/appprofile.hpp"
-#include "device/rocm/rocappprofile.hpp"
-
-#include
-
-amd::AppProfile* rocCreateAppProfile()
-{
- amd::AppProfile* appProfile = new roc::AppProfile;
-
- if ((appProfile == NULL) || !appProfile->init()) {
- return NULL;
- }
-
- return appProfile;
-}
-
-namespace roc {
-
-bool AppProfile::ParseApplicationProfile()
-{
- std::string appName("Explorer");
-
- std::transform(appName.begin(), appName.end(), appName.begin(), ::tolower);
- std::transform(appFileName_.begin(), appFileName_.end(), appFileName_.begin(), ::tolower);
-
- if (appFileName_.compare(appName) == 0 ) {
- hsaDeviceHint_ = CL_HSA_DISABLED_AMD;
- gpuvmHighAddr_ = false;
- noHsaInit_ = true;
- profileOverridesAllSettings_ = true;
- }
-
- // Setting both bits is invalid, make it niether.
- if (hsaDeviceHint_ & CL_HSA_ENABLED_AMD
- && hsaDeviceHint_ & CL_HSA_DISABLED_AMD) {
- hsaDeviceHint_ = 0;
- }
-
- if (noHsaInit_) {
- // If no HSA initialization, then force hint flag to non-HSA device.
- // Even if this is not forced, the device selection logic will endure it.
- // After all hint flags are treated as hint only - depending on
- // availibility.
- hsaDeviceHint_ = CL_HSA_DISABLED_AMD;
- }
-
- return true;
-}
-
-}
-
-#endif
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocappprofile.hpp b/projects/clr/rocclr/runtime/device/rocm/rocappprofile.hpp
deleted file mode 100644
index 00221dda2c..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rocappprofile.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-//
-// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved.
-//
-#pragma once
-
-#ifndef WITHOUT_HSA_BACKEND
-
-namespace roc {
-
-class AppProfile : public amd::AppProfile
-{
-public:
- AppProfile(): amd::AppProfile() {}
-
-protected:
- //! parse application profile based on application file name
- virtual bool ParseApplicationProfile();
-};
-
-}
-
-#endif
-
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocbinary.hpp b/projects/clr/rocclr/runtime/device/rocm/rocbinary.hpp
deleted file mode 100644
index cdb910f622..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rocbinary.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-//
-// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved.
-//
-#pragma once
-
-#include "top.hpp"
-#include "rocdevice.hpp"
-
-#ifndef WITHOUT_HSA_BACKEND
-
-namespace roc {
-
-typedef std::map NameKernelMap;
-
-class ClBinary : public device::ClBinary
-{
-public:
- ClBinary(const Device& dev, BinaryImageFormat bifVer = BIF_VERSION3)
- : device::ClBinary(dev, bifVer)
- {}
-
- //! Destructor
- ~ClBinary() {}
-
-
-protected:
- bool setElfTarget() {
- uint32_t target = static_cast(21);//dev().calTarget());
- assert (((0xFFFF8000 & target) == 0) && "ASIC target ID >= 2^15");
- uint16_t elf_target = (uint16_t)(0x7FFF & target);
- return elfOut()->setTarget(elf_target, amd::OclElf::CAL_PLATFORM);
- return true;
- }
-
-private:
- //! Disable default copy constructor
- ClBinary(const ClBinary&);
-
- //! Disable default operator=
- ClBinary& operator=(const ClBinary&);
-
- //! Returns the HSA device for this object
- const Device& dev() const { return static_cast(dev_); }
-
-};
-
-} // namespace roc
-
-#endif // WITHOUT_HSA_BACKEND
-
-
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp b/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp
deleted file mode 100644
index c4824d8bf7..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rocblit.cpp
+++ /dev/null
@@ -1,1532 +0,0 @@
-//
-// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved.
-//
-
-#include "platform/commandqueue.hpp"
-#include "device/rocm/rocdevice.hpp"
-#include "device/rocm/rocblit.hpp"
-#include "device/rocm/rocmemory.hpp"
-#include "device/rocm/rocvirtual.hpp"
-#include "utils/debug.hpp"
-
-namespace roc {
-
-
-void
-FindPinSize(
- size_t& pinSize, const amd::Coord3D& size,
- size_t& rowPitch, size_t& slicePitch, const Image& image)
-{
- size_t elementSize = image.owner()->asImage()->getImageFormat().getElementSize();
- pinSize = size[0] * elementSize;
- if ((rowPitch == 0) || (rowPitch == pinSize)) {
- rowPitch = 0;
- }
- else {
- pinSize = rowPitch;
- }
-
- // Calculate the pin size, which should be equal to the copy size
- for (uint i = 1; i < 3; ++i) {
- pinSize *= size[i];
- if (i == 1) {
- if ((slicePitch == 0) || (slicePitch == pinSize)) {
- slicePitch = 0;
- }
- else {
- if (image.getHsaImageDescriptor().geometry != HSA_EXT_IMAGE_GEOMETRY_1DA) {
- pinSize = slicePitch;
- }
- else {
- pinSize = slicePitch * size[i];
- }
- }
- }
- }
-}
-
-HsaBlitManager::HsaBlitManager(device::VirtualDevice& vDev, Setup setup)
- : HostBlitManager(vDev, setup),
- roc_device_(reinterpret_cast(dev_)) {
- completion_signal_.handle = 0;
-}
-
-bool HsaBlitManager::hsaCopy(const void *hostSrc, void *hostDst,
- uint32_t size, bool hostToDev) const {
-
- // No allocation is necessary for Full Profile
- hsa_status_t status;
- if (roc_device_.agent_profile() == HSA_PROFILE_FULL) {
- status = hsa_memory_copy(hostDst, hostSrc, size);
- if (status != HSA_STATUS_SUCCESS) {
- LogPrintfError("Hsa copy of data failed with code %d", status);
- }
- return (status == HSA_STATUS_SUCCESS);
- }
-
- // Allocate requested size of memory
- size_t align = 0x04;
- bool atomics = false;
- void *hsaBuffer = NULL;
- hsaBuffer = roc_device_.hostAlloc(size, align, false);
- if (hsaBuffer == NULL) {
- LogError("Hsa buffer allocation failed with code");
- return false;
- }
-
- const hsa_signal_value_t kInitVal = 1;
- hsa_signal_store_relaxed(completion_signal_, kInitVal);
-
- // Copy data from Host to Device
- if (hostToDev) {
- memcpy(hsaBuffer, hostSrc, size);
- status = hsa_amd_memory_async_copy(
- hostDst, roc_device_.getBackendDevice(), hsaBuffer,
- roc_device_.getCpuAgent(), size, 0, NULL, completion_signal_);
- if (status == HSA_STATUS_SUCCESS) {
- hsa_signal_value_t val =
- hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0,
- uint64_t(-1), HSA_WAIT_STATE_ACTIVE);
-
- if (val != (kInitVal - 1)) {
- LogError("Async copy failed");
- status = HSA_STATUS_ERROR;
- }
- }
- else {
- LogPrintfError("Hsa copy from host to device failed with code %d", status);
- }
-
- roc_device_.hostFree(hsaBuffer, size);
- return (status == HSA_STATUS_SUCCESS);
- }
-
- // Copy data from Device to Host
- status = hsa_amd_memory_async_copy(hsaBuffer, roc_device_.getCpuAgent(),
- hostSrc, roc_device_.getBackendDevice(),
- size, 0, NULL, completion_signal_);
- if (status == HSA_STATUS_SUCCESS) {
- hsa_signal_value_t val = hsa_signal_wait_acquire(
- completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1),
- HSA_WAIT_STATE_ACTIVE);
-
- if (val != (kInitVal - 1)) {
- LogError("Async copy failed");
- status = HSA_STATUS_ERROR;
- }
-
- if (status == HSA_STATUS_SUCCESS) {
- memcpy(hostDst, hsaBuffer, size);
- }
- } else {
- LogPrintfError("Hsa copy from device to host failed with code %d", status);
- }
-
- roc_device_.hostFree(hsaBuffer, size);
- return (status == HSA_STATUS_SUCCESS);
-}
-
-bool HsaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
- const amd::Coord3D& origin,
- const amd::Coord3D& size, bool entire) const {
- hsa_memory_register(dstHost, size[0]);
- void* src = static_cast(srcMemory).getDeviceMemory();
-
- // Copy data from device to host
- const void *srcDev = reinterpret_cast(src) + origin[0];
- bool retval = hsaCopy(srcDev, dstHost, size[0], false);
-
- hsa_memory_deregister(dstHost, size[0]);
- return retval;
-}
-
-bool HsaBlitManager::readBufferRect(device::Memory& srcMemory, void* dst,
- const amd::BufferRect& bufRect,
- const amd::BufferRect& hostRect,
- const amd::Coord3D& size,
- bool entire) const {
- void* src = static_cast(srcMemory).getDeviceMemory();
-
- size_t srcOffset;
- size_t dstOffset;
-
- for (size_t z = 0; z < size[2]; ++z) {
- for (size_t y = 0; y < size[1]; ++y) {
- srcOffset = bufRect.offset(0, y, z);
- dstOffset = hostRect.offset(0, y, z);
-
- // Copy data from device to host - line by line
- void *dstHost = reinterpret_cast(dst) + dstOffset;
- const void *srcDev = reinterpret_cast(src) + srcOffset;
- bool retval = hsaCopy(srcDev, dstHost, size[0], false);
- if (!retval) {
- return retval;
- }
- }
- }
-
- return true;
-}
-
-static bool hsaCopyImageToBuffer(hsa_agent_t agent,
- hsa_ext_image_t srcImage,
- void* dstBuffer, const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& size, bool entire,
- size_t rowPitch, size_t slicePitch) {
- hsa_ext_image_region_t image_region;
- image_region.offset.x = srcOrigin[0];
- image_region.offset.y = srcOrigin[1];
- image_region.offset.z = srcOrigin[2];
- image_region.range.x = size[0];
- image_region.range.y = size[1];
- image_region.range.z = size[2];
-
- char *dstHost = ((char*)dstBuffer) + dstOrigin[0];
-
- hsa_status_t status = hsa_ext_image_export(agent, srcImage, dstHost, rowPitch,
- slicePitch, &image_region);
- return (status == HSA_STATUS_SUCCESS);
-}
-
-bool HsaBlitManager::readImage(device::Memory& srcMemory, void* dstHost,
- const amd::Coord3D& origin,
- const amd::Coord3D& size, size_t rowPitch,
- size_t slicePitch, bool entire) const {
- roc::Image* srcImage = (roc::Image*)&srcMemory;
-
- void* svmDstHost = NULL;
- size_t pinSize = 0;
- FindPinSize(pinSize, size, rowPitch, slicePitch, *srcImage);
-
- hsa_agent_t agent = gpu().gpu_device();
-
- hsa_status_t status = hsa_amd_memory_lock(dstHost, pinSize,
- &agent, 1, &svmDstHost);
-
- if (status != HSA_STATUS_SUCCESS) {
- return false;
- }
-
- bool retval = hsaCopyImageToBuffer(agent, srcImage->getHsaImageObject(),
- svmDstHost, origin, amd::Coord3D(0), size, entire,
- rowPitch, slicePitch);
- hsa_amd_memory_unlock(dstHost);
- return retval;
-}
-
-bool HsaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
- const amd::Coord3D& origin,
- const amd::Coord3D& size, bool entire) const {
- hsa_memory_register(const_cast(srcHost), size[0]);
- void* dst = static_cast(dstMemory).getDeviceMemory();
-
- // Copy data from host to device
- void *dstDev = reinterpret_cast(dst) + origin[0];
- bool retval = hsaCopy(srcHost, dstDev, size[0], true);
-
- hsa_memory_deregister(const_cast(srcHost), size[0]);
- return retval;
-}
-
-bool HsaBlitManager::writeBufferRect(const void* src,
- device::Memory& dstMemory,
- const amd::BufferRect& hostRect,
- const amd::BufferRect& bufRect,
- const amd::Coord3D& size,
- bool entire) const {
- void* dst = static_cast(dstMemory).getDeviceMemory();
-
- size_t srcOffset;
- size_t dstOffset;
-
- for (size_t z = 0; z < size[2]; ++z) {
- for (size_t y = 0; y < size[1]; ++y) {
- srcOffset = hostRect.offset(0, y, z);
- dstOffset = bufRect.offset(0, y, z);
-
- // Copy data from host to device - line by line
- void *dstDev = reinterpret_cast(dst) + dstOffset;
- const void *srcHost = reinterpret_cast(src) + srcOffset;
- bool retval = hsaCopy(srcHost, dstDev, size[0], true);
- if (!retval) {
- return retval;
- }
- }
- }
-
- return true;
-}
-
-bool hsaCopyBufferToImage(hsa_agent_t agent, const void* srcBuffer,
- hsa_ext_image_t dstImage,
- const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& size, bool entire,
- size_t rowPitch, size_t slicePitch) {
- char* srcHost = ((char*)srcBuffer) + srcOrigin[0];
-
- hsa_ext_image_region_t image_region;
- image_region.offset.x = dstOrigin[0];
- image_region.offset.y = dstOrigin[1];
- image_region.offset.z = dstOrigin[2];
- image_region.range.x = size[0];
- image_region.range.y = size[1];
- image_region.range.z = size[2];
-
- hsa_status_t status = hsa_ext_image_import(
- agent, srcHost, rowPitch, slicePitch, dstImage, &image_region);
- return (status == HSA_STATUS_SUCCESS);
-}
-
-bool HsaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
- const amd::Coord3D& origin,
- const amd::Coord3D& size, size_t rowPitch,
- size_t slicePitch, bool entire) const {
- roc::Image* image = (roc::Image*)&dstMemory;
-
- void* svmSrcHost = NULL;
- size_t pinSize = 0;
- FindPinSize(pinSize, size, rowPitch, slicePitch, *image);
-
- hsa_agent_t agent = gpu().gpu_device();
-
- hsa_status_t status = hsa_amd_memory_lock(const_cast(srcHost), pinSize,
- &agent, 1, &svmSrcHost);
-
- if (status != HSA_STATUS_SUCCESS) {
- return false;
- }
-
- bool retval = hsaCopyBufferToImage(agent, svmSrcHost,
- image->getHsaImageObject(), amd::Coord3D(0),
- origin, size, entire, rowPitch, slicePitch);
-
- hsa_amd_memory_unlock(const_cast(srcHost));
-
- return retval;
-}
-
-bool HsaBlitManager::copyBuffer(device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& size, bool entire) const {
- void* src = static_cast(srcMemory).getDeviceMemory();
- void* dst = static_cast(dstMemory).getDeviceMemory();
-
- if (srcMemory.isHostMemDirectAccess() && dstMemory.isHostMemDirectAccess()) {
- if (srcMemory.owner()->getMemFlags() & CL_MEM_USE_HOST_PTR) {
- src = srcMemory.owner()->getHostMem();
- }
-
- if (dstMemory.owner()->getMemFlags() & CL_MEM_USE_HOST_PTR) {
- dst = dstMemory.owner()->getHostMem();
- }
- }
-
- const hsa_agent_t src_agent = (srcMemory.isHostMemDirectAccess())
- ? roc_device_.getCpuAgent()
- : roc_device_.getBackendDevice();
-
- const hsa_agent_t dst_agent = (dstMemory.isHostMemDirectAccess())
- ? roc_device_.getCpuAgent()
- : roc_device_.getBackendDevice();
-
- // Straight forward buffer copy
- const hsa_signal_value_t kInitVal = 1;
- hsa_signal_store_relaxed(completion_signal_, kInitVal);
- hsa_status_t status = hsa_amd_memory_async_copy(
- (reinterpret_cast(dst) + dstOrigin[0]), dst_agent,
- (reinterpret_cast(src) + srcOrigin[0]), src_agent, size[0],
- 0, NULL, completion_signal_);
- if (status != HSA_STATUS_SUCCESS) {
- LogPrintfError("DMA buffer failed with code %d", status);
- return false;
- }
-
- hsa_signal_value_t val =
- hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0,
- uint64_t(-1), HSA_WAIT_STATE_ACTIVE);
-
- if (val != (kInitVal - 1)) {
- LogError("Async copy failed");
- return false;
- }
-
- return true;
-}
-
-bool HsaBlitManager::copyBufferRect(device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::BufferRect& srcRect,
- const amd::BufferRect& dstRect,
- const amd::Coord3D& size,
- bool entire) const {
- void* src = static_cast(srcMemory).getDeviceMemory();
- void* dst = static_cast(dstMemory).getDeviceMemory();
-
- const hsa_signal_value_t kInitVal = size[2] * size[1];
- hsa_signal_store_relaxed(completion_signal_, kInitVal);
-
- for (size_t z = 0; z < size[2]; ++z) {
- for (size_t y = 0; y < size[1]; ++y) {
- size_t srcOffset = srcRect.offset(0, y, z);
- size_t dstOffset = dstRect.offset(0, y, z);
-
- // Copy memory line by line
- hsa_status_t status = hsa_amd_memory_async_copy(
- (reinterpret_cast(dst) + dstOffset),
- roc_device_.getBackendDevice(),
- (reinterpret_cast(src) + srcOffset),
- roc_device_.getBackendDevice(), size[0], 0, NULL,
- completion_signal_);
- if (status != HSA_STATUS_SUCCESS) {
- LogPrintfError("DMA buffer failed with code %d", status);
- return false;
- }
- }
- }
-
- hsa_signal_value_t val =
- hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ,
- 0, uint64_t(-1), HSA_WAIT_STATE_ACTIVE);
-
- if (val != 0) {
- LogError("Async copy failed");
- return false;
- }
-
- return true;
-}
-
-bool HsaBlitManager::copyImageToBuffer(device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& size, bool entire,
- size_t rowPitch,
- size_t slicePitch) const {
- roc::Image& srcImage = (roc::Image&)srcMemory;
- roc::Buffer& dstBuffer = (roc::Buffer&)dstMemory;
-
- return hsaCopyImageToBuffer(gpu().gpu_device(), srcImage.getHsaImageObject(),
- dstBuffer.getDeviceMemory(), srcOrigin, dstOrigin,
- size, entire, rowPitch, slicePitch);
-}
-
-bool HsaBlitManager::copyBufferToImage(device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& size, bool entire,
- size_t rowPitch,
- size_t slicePitch) const {
- roc::Buffer& srcBuffer = (roc::Buffer&)srcMemory;
- roc::Image& dstImage = (roc::Image&)dstMemory;
-
- return hsaCopyBufferToImage(gpu().gpu_device(), srcBuffer.getDeviceMemory(),
- dstImage.getHsaImageObject(), srcOrigin,
- dstOrigin, size, entire, rowPitch, slicePitch);
-}
-
-bool HsaBlitManager::copyImage(device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& size, bool entire) const {
- if (srcMemory.isHostMemDirectAccess() &&
- dstMemory.isHostMemDirectAccess()) {
- return device::HostBlitManager::copyImage(
- srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire);
- }
-
- roc::Image *srcImage = (roc::Image *)&srcMemory;
- roc::Image *dstImage = (roc::Image *)&dstMemory;
-
- hsa_dim3_t src_offset = { 0 };
- src_offset.x = srcOrigin[0];
- src_offset.y = srcOrigin[1];
- src_offset.z = srcOrigin[2];
-
- hsa_dim3_t dst_offset = { 0 };
- dst_offset.x = dstOrigin[0];
- dst_offset.y = dstOrigin[1];
- dst_offset.z = dstOrigin[2];
-
- hsa_dim3_t copy_size = { 0 };
- copy_size.x = size[0];
- copy_size.y = size[1];
- copy_size.z = size[2];
-
- hsa_status_t status = hsa_ext_image_copy(
- gpu().gpu_device(), srcImage->getHsaImageObject(), &src_offset,
- dstImage->getHsaImageObject(), &dst_offset, &copy_size);
- return (status == HSA_STATUS_SUCCESS);
-}
-
-bool HsaBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
- size_t patternSize, const amd::Coord3D& origin,
- const amd::Coord3D& size, bool entire) const {
- void* fillMem = static_cast(memory).getDeviceMemory();
-
- size_t offset = origin[0];
- size_t fillSize = size[0];
-
- if ((fillSize % patternSize) != 0) {
- LogError("Misaligned buffer size and pattern size!");
- }
-
- // Fill the buffer memory with a pattern
- for (size_t i = 0; i < (fillSize / patternSize); i++) {
- void *dstDev = reinterpret_cast(fillMem) + offset;
- bool retval = hsaCopy(pattern, dstDev, patternSize, true);
- if (!retval) {
- LogError("DMA buffer failed with code");
- return retval;
- }
-
- offset += patternSize;
- }
-
- return true;
-}
-
-bool HsaBlitManager::fillImage(device::Memory& memory, const void* pattern,
- const amd::Coord3D& origin,
- const amd::Coord3D& size, bool entire) const {
- if (memory.isHostMemDirectAccess()) {
- return device::HostBlitManager::fillImage(memory, pattern, origin, size, entire);
- }
-
- roc::Image *image = (roc::Image*)&memory;
- hsa_ext_image_region_t image_region;
- image_region.offset.x = origin[0];
- image_region.offset.y = origin[1];
- image_region.offset.z = origin[2];
- image_region.range.x = size[0];
- image_region.range.y = size[1];
- image_region.range.z = size[2];
-
- hsa_status_t status = hsa_ext_image_clear(
- gpu().gpu_device(), image->getHsaImageObject(),
- pattern, &image_region);
- return (status == HSA_STATUS_SUCCESS);
-}
-
-static void
-CalcRowSlicePitches(
- cl_ulong* pitch, const cl_int* copySize,
- size_t rowPitch, size_t slicePitch, const Memory& mem)
-{
- const roc::Image &hsaImage = static_cast< const roc::Image &>(mem);
- bool img1Darray =
- (mem.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? true : false;
- size_t memFmtSize = mem.owner()->asImage()->getImageFormat().getElementSize();
-
- if (rowPitch == 0) {
- pitch[0] = copySize[0];
- }
- else {
- pitch[0] = rowPitch / memFmtSize;
- }
- if (slicePitch == 0) {
- pitch[1] = pitch[0] * (img1Darray ? 1 : copySize[1]);
- }
- else {
- pitch[1] = slicePitch / memFmtSize;
- }
- assert((pitch[0] <= pitch[1]) && "rowPitch must be <= slicePitch");
-
- if (img1Darray) {
- // For 1D array rowRitch = slicePitch
- pitch[0] = pitch[1];
- }
-}
-
-KernelBlitManager::KernelBlitManager(device::VirtualDevice& vDev, Setup setup)
- : HsaBlitManager(vDev, setup),
- context_(NULL),
- program_(NULL)
-{
- for (uint i = 0; i < BlitTotal; ++i) {
- kernels_[i] = NULL;
- }
-}
-
-KernelBlitManager::~KernelBlitManager()
-{
- for (uint i = 0; i < BlitTotal; ++i) {
- if (NULL != kernels_[i]) {
- kernels_[i]->release();
- }
- }
-
- if (NULL != program_) {
- program_->release();
- }
-
- if (NULL != context_) {
- // Release a dummy context
- context_->release();
- }
-}
-
-bool
-KernelBlitManager::readBuffer(
- device::Memory& srcMemory,
- void* dstHost,
- const amd::Coord3D& origin,
- const amd::Coord3D& size,
- bool entire) const
-{
- //if (setup_.disableReadBuffer_ || srcMemory.isHostMemDirectAccess()) {
- // return device::HostBlitManager::readBuffer(srcMemory, dstHost, origin,
- // size, entire);
- //}
- // Exercise HSA path for now.
- return HsaBlitManager::readBuffer(srcMemory, dstHost, origin,
- size, entire);
-
- amd::Buffer *dstMemory = new (*context_) amd::Buffer(
- *context_, CL_MEM_USE_HOST_PTR, size[0]);
-
- if (!dstMemory->create(const_cast(dstHost))) {
- LogError("[OCL] Fail to create mem object for destination");
- return false;
- }
-
- device::Memory *devDstMemory = dstMemory->getDeviceMemory(dev_);
- if (devDstMemory== NULL) {
- LogError("[OCL] Fail to create device mem object for destination");
- return false;
- }
-
- bool result = copyBuffer(
- srcMemory, *devDstMemory, origin, amd::Coord3D(0), size, entire);
-
- // Wait for the transfer to finish so that we could safely release the
- // destination memory object.
- // TODO: we could remove this if issue on implicit memory registration is
- // fixed by KFD, so that we could pass the pattern as SVM.
- gpu().releaseGpuMemoryFence();
-
- dstMemory->release();
-
- return result;
-}
-
-bool
-KernelBlitManager::readBufferRect(
- device::Memory& srcMemory,
- void* dstHost,
- const amd::BufferRect& bufRect,
- const amd::BufferRect& hostRect,
- const amd::Coord3D& size,
- bool entire) const
-{
- // if (setup_.disableReadBufferRect_ || srcMemory.isHostMemDirectAccess()) {
- //return device::HostBlitManager::readBufferRect(
- // srcMemory, dstHost, bufRect, hostRect, size, entire);
- // }
-
- // Exercise HSA path for now.
- return HsaBlitManager::readBufferRect(
- srcMemory, dstHost, bufRect, hostRect, size, entire);
-
- size_t dstSize = hostRect.start_ + hostRect.end_;
- amd::Buffer *dstMemory =
- new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, dstSize);
-
- if (!dstMemory->create(const_cast(dstHost))) {
- LogError("[OCL] Fail to create mem object for destination");
- return false;
- }
-
- device::Memory *devDstMemory = dstMemory->getDeviceMemory(dev_);
- if (devDstMemory== NULL) {
- LogError("[OCL] Fail to create device mem object for destination");
- return false;
- }
-
- bool result = copyBufferRect(
- srcMemory, *devDstMemory, bufRect, hostRect, size, entire);
-
- // Wait for the transfer to finish so that we could safely release the
- // destination memory object.
- // TODO: we could remove this if issue on implicit memory registration is
- // fixed by KFD, so that we could pass the pattern as SVM.
- gpu().releaseGpuMemoryFence();
-
- dstMemory->release();
-
- return result;
-}
-
-void
-FindLinearSize(
- size_t& linearSize, const amd::Coord3D& size,
- size_t& rowPitch, size_t& slicePitch, const device::Memory& mem)
-{
- const roc::Image &image = static_cast(mem);
- size_t elementSize = mem.owner()->asImage()->getImageFormat().getElementSize();
-
- linearSize = size[0] * elementSize;
- if ((rowPitch == 0) || (rowPitch == linearSize)) {
- rowPitch = 0;
- }
- else {
- linearSize = rowPitch;
- }
-
- // Calculate the pin size, which should be equal to the copy size
- for (uint i = 1; i < mem.owner()->asImage()->getDims(); ++i) {
- linearSize *= size[i];
- if (i == 1) {
- if ((slicePitch == 0) || (slicePitch == linearSize)) {
- slicePitch = 0;
- }
- else {
- if (mem.owner()->getType() != CL_MEM_OBJECT_IMAGE1D_ARRAY) {
- linearSize = slicePitch;
- }
- else {
- linearSize = slicePitch * size[i];
- }
- }
- }
- }
-}
-
-// The following data structures will be used for the view creations.
-// Some formats has to be converted before a kernel blit operation
-struct FormatConvertion {
- cl_uint clOldType_;
- cl_uint clNewType_;
-};
-
-// The list of rejected data formats and corresponding conversion
-static const FormatConvertion RejectedData[] =
-{
- { CL_UNORM_INT8, CL_UNSIGNED_INT8 },
- { CL_UNORM_INT16, CL_UNSIGNED_INT16 },
- { CL_SNORM_INT8, CL_UNSIGNED_INT8 },
- { CL_SNORM_INT16, CL_UNSIGNED_INT16 },
- { CL_HALF_FLOAT, CL_UNSIGNED_INT16 },
- { CL_FLOAT, CL_UNSIGNED_INT32 },
- { CL_SIGNED_INT8, CL_UNSIGNED_INT8 },
- { CL_SIGNED_INT16, CL_UNSIGNED_INT16 },
- { CL_UNORM_INT_101010, CL_UNSIGNED_INT8 },
- { CL_SIGNED_INT32, CL_UNSIGNED_INT32 }
-};
-
-// The list of rejected channel's order and corresponding conversion
-static const FormatConvertion RejectedOrder[] =
-{
- { CL_A, CL_R },
- { CL_RA, CL_RG },
- { CL_LUMINANCE, CL_R },
- { CL_INTENSITY, CL_R },
- { CL_RGB, CL_RGBA },
- { CL_BGRA, CL_RGBA },
- { CL_ARGB, CL_RGBA },
- { CL_sRGB, CL_RGBA },
- { CL_sRGBx, CL_RGBA },
- { CL_sRGBA, CL_RGBA },
- { CL_sBGRA, CL_RGBA },
- { CL_DEPTH, CL_R}
-};
-
-const uint RejectedFormatDataTotal =
- sizeof(RejectedData) / sizeof(FormatConvertion);
-const uint RejectedFormatChannelTotal =
- sizeof(RejectedOrder) / sizeof(FormatConvertion);
-
-amd::Image::Format
-KernelBlitManager::filterFormat(amd::Image::Format oldFormat) const
-{
- cl_image_format newFormat;
- newFormat.image_channel_data_type = oldFormat.image_channel_data_type;
- newFormat.image_channel_order = oldFormat.image_channel_order;
-
- // Find unsupported formats
- for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
- if (RejectedData[i].clOldType_ == oldFormat.image_channel_data_type) {
- newFormat.image_channel_data_type = RejectedData[i].clNewType_;
- break;
- }
- }
-
- // Find unsupported channel's order
- for (uint i = 0; i < RejectedFormatChannelTotal; ++i) {
- if (RejectedOrder[i].clOldType_ == oldFormat.image_channel_order) {
- newFormat.image_channel_order = RejectedOrder[i].clNewType_;
- break;
- }
- }
-
- return amd::Image::Format(newFormat);
-}
-
-device::Memory *
-KernelBlitManager::createImageView(
- device::Memory &parent,
- amd::Image::Format newFormat) const
-{
- amd::Image *image =
- parent.owner()->asImage()->createView(parent.owner()->getContext(), newFormat, &gpu());
-
- if (image == NULL) {
- LogError("[OCL] Fail to allocate view of image object");
- return NULL;
- }
-
- Image* devImage = new roc::Image(static_cast(dev_), *image);
- if (devImage == NULL) {
- LogError("[OCL] Fail to allocate device mem object for the view");
- image->release();
- return NULL;
- }
-
- if (!devImage->createView(static_cast(parent))) {
- LogError("[OCL] Fail to create device mem object for the view");
- delete devImage;
- image->release();
- return NULL;
- }
-
- image->replaceDeviceMemory(&dev_, devImage);
-
- return devImage;
-}
-
-bool
-KernelBlitManager::readImage(
- device::Memory& srcMemory,
- void* dstHost,
- const amd::Coord3D& origin,
- const amd::Coord3D& size,
- size_t rowPitch,
- size_t slicePitch,
- bool entire) const
-{
- return HsaBlitManager::readImage(
- srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire);
-}
-
-bool
-KernelBlitManager::writeBuffer(
- const void* srcHost,
- device::Memory& dstMemory,
- const amd::Coord3D& origin,
- const amd::Coord3D& size,
- bool entire) const
-{
- // if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess()) {
- //return device::HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size,
- // entire);
- // }
-
- // Exercise HSA path for now.
- return HsaBlitManager::writeBuffer(srcHost, dstMemory, origin, size,
- entire);
-
- amd::Buffer *srcMemory =
- new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, size[0]);
-
- if (!srcMemory->create(const_cast(srcHost))) {
- LogError("[OCL] Fail to create mem object for destination");
- return false;
- }
-
- device::Memory *devSrcMemory = srcMemory->getDeviceMemory(dev_);
- if (devSrcMemory== NULL) {
- LogError("[OCL] Fail to create device mem object for destination");
- return false;
- }
-
- bool result =
- copyBuffer(*devSrcMemory, dstMemory, amd::Coord3D(0), origin, size, entire);
-
- // Wait for the transfer to finish so that we could safely release the
- // source memory object.
- // TODO: we could remove this if issue on implicit memory registration is
- // fixed by KFD, so that we could pass the pattern as SVM.
- gpu().releaseGpuMemoryFence();
-
- srcMemory->release();
-
- return result;
-}
-
-bool
-KernelBlitManager::writeBufferRect(
- const void* srcHost,
- device::Memory& dstMemory,
- const amd::BufferRect& hostRect,
- const amd::BufferRect& bufRect,
- const amd::Coord3D& size,
- bool entire) const
-{
- // if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess()) {
- //return device::HostBlitManager::writeBufferRect(
- // srcHost, dstMemory, hostRect, bufRect, size, entire);
- // }
-
- // Exercise HSA path for now.
- return HsaBlitManager::writeBufferRect(
- srcHost, dstMemory, hostRect, bufRect, size, entire);
-
- size_t srcSize = hostRect.start_ + hostRect.end_;
- amd::Buffer *srcMemory =
- new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, srcSize);
-
- if (!srcMemory->create(const_cast(srcHost))) {
- LogError("[OCL] Fail to create mem object for destination");
- return false;
- }
-
- device::Memory *devSrcMemory = srcMemory->getDeviceMemory(dev_);
- if (devSrcMemory== NULL) {
- LogError("[OCL] Fail to create device mem object for destination");
- return false;
- }
-
- bool result = copyBufferRect(
- *devSrcMemory, dstMemory, hostRect, bufRect, size, entire);
-
- // Wait for the transfer to finish so that we could safely release the
- // destination memory object.
- // TODO: we could remove this if issue on implicit memory registration is
- // fixed by KFD, so that we could pass the pattern as SVM.
- gpu().releaseGpuMemoryFence();
-
- srcMemory->release();
-
- return result;
-}
-
-bool
-KernelBlitManager::writeImage(
- const void* srcHost,
- device::Memory& dstMemory,
- const amd::Coord3D& origin,
- const amd::Coord3D& size,
- size_t rowPitch,
- size_t slicePitch,
- bool entire) const
-{
- return HsaBlitManager::writeImage(
- srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire);
-}
-
-bool
-KernelBlitManager::copyBuffer(
- device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& sizeIn,
- bool entire) const
-{
- // if (setup_.disableCopyBuffer_ ||
- // (srcMemory.isHostMemDirectAccess() &&
- // dstMemory.isHostMemDirectAccess())) {
- //return HsaBlitManager::copyBuffer(
- // srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
- // }
-
- // Exercise HSA path for now.
- return HsaBlitManager::copyBuffer(
- srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
-
- uint blitType = BlitCopyBuffer;
- size_t dim = 1;
- size_t globalWorkOffset[3] = { 0, 0, 0 };
- size_t globalWorkSize = 0;
- size_t localWorkSize = 0;
-
- const static uint CopyBuffAlignment[3] = { 16, 4, 1 };
- amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]);
-
- bool aligned;
- uint i;
- for (i = 0; i < 3; ++i) {
- // Check source alignments
- aligned = ((srcOrigin[0] % CopyBuffAlignment[i]) == 0);
- // Check destination alignments
- aligned &= ((dstOrigin[0] % CopyBuffAlignment[i]) == 0);
- // Check copy size alignment in the first dimension
- aligned &= ((sizeIn[0] % CopyBuffAlignment[i]) == 0);
-
- if (aligned) {
- if (CopyBuffAlignment[i] != 1) {
- blitType = BlitCopyBufferAligned;
- }
- break;
- }
- }
-
- cl_uint remain;
- if (blitType == BlitCopyBufferAligned) {
- size.c[0] /= CopyBuffAlignment[i];
- }
- else {
- remain = size[0] % 4;
- size.c[0] /= 4;
- size.c[0] += 1;
- }
-
- // Program the dispatch dimensions
- localWorkSize = 256;
- globalWorkSize = amd::alignUp(size[0] , 256);
-
- // Program kernels arguments for the blit operation
- cl_mem clmem = ((cl_mem) as_cl(srcMemory.owner()));
- kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem);
- clmem = ((cl_mem) as_cl(dstMemory.owner()));
- kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem);
- // Program source origin
- cl_ulong srcOffset = srcOrigin[0] / CopyBuffAlignment[i];
- kernels_[blitType]->parameters().set(2, sizeof(srcOffset), &srcOffset);
-
- // Program destination origin
- cl_ulong dstOffset = dstOrigin[0] / CopyBuffAlignment[i];
- kernels_[blitType]->parameters().set(3, sizeof(dstOffset), &dstOffset);
-
- cl_ulong copySize = size[0];
- kernels_[blitType]->parameters().set(4, sizeof(copySize), &copySize);
-
- if (blitType == BlitCopyBufferAligned) {
- cl_int alignment = CopyBuffAlignment[i];
- kernels_[blitType]->parameters().set(5, sizeof(alignment), &alignment);
- }
- else {
- kernels_[blitType]->parameters().set(5, sizeof(remain), &remain);
- }
-
- // Create ND range object for the kernel's execution
- amd::NDRangeContainer ndrange(
- 1, globalWorkOffset, &globalWorkSize, &localWorkSize);
-
- // Execute the blit
- address parameters = kernels_[blitType]->parameters().capture(dev_);
- bool result = gpu().submitKernelInternal(
- ndrange, *kernels_[blitType], parameters, NULL);
- kernels_[blitType]->parameters().release(const_cast(parameters), dev_);
- return result;
-}
-
-bool
-KernelBlitManager::copyBufferRect(
- device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::BufferRect& srcRectIn,
- const amd::BufferRect& dstRectIn,
- const amd::Coord3D& sizeIn,
- bool entire) const
-{
- // if (setup_.disableCopyBuffer_ ||
- // (srcMemory.isHostMemDirectAccess() && dstMemory.isHostMemDirectAccess())) {
- //return HsaBlitManager::copyBufferRect(
- // srcMemory, dstMemory, srcRectIn, dstRectIn, sizeIn, entire);
- // }
-
- // Exercise HSA path for now.
- return HsaBlitManager::copyBufferRect(
- srcMemory, dstMemory, srcRectIn, dstRectIn, sizeIn, entire);
-
- uint blitType = BlitCopyBufferRect;
- size_t dim = 3;
- size_t globalWorkOffset[3] = { 0, 0, 0 };
- size_t globalWorkSize[3];
- size_t localWorkSize[3];
-
- const static uint CopyRectAlignment[3] = { 16, 4, 1 };
-
- bool aligned;
- uint i;
- for (i = 0; i < sizeof(CopyRectAlignment) / sizeof(uint); i++) {
- // Check source alignments
- aligned = ((srcRectIn.rowPitch_ % CopyRectAlignment[i]) == 0);
- aligned &= ((srcRectIn.slicePitch_ % CopyRectAlignment[i]) == 0);
- aligned &= ((srcRectIn.start_ % CopyRectAlignment[i]) == 0);
-
- // Check destination alignments
- aligned &= ((dstRectIn.rowPitch_ % CopyRectAlignment[i]) == 0);
- aligned &= ((dstRectIn.slicePitch_ % CopyRectAlignment[i]) == 0);
- aligned &= ((dstRectIn.start_ % CopyRectAlignment[i]) == 0);
-
- // Check copy size alignment in the first dimension
- aligned &= ((sizeIn[0] % CopyRectAlignment[i]) == 0);
-
- if (aligned) {
- if (CopyRectAlignment[i] != 1) {
- blitType = BlitCopyBufferRectAligned;
- }
- break;
- }
- }
-
- amd::BufferRect srcRect;
- amd::BufferRect dstRect;
- amd::Coord3D size(sizeIn[0], sizeIn[1], sizeIn[2]);
-
- srcRect.rowPitch_ = srcRectIn.rowPitch_ / CopyRectAlignment[i];
- srcRect.slicePitch_ = srcRectIn.slicePitch_ / CopyRectAlignment[i];
- srcRect.start_ = srcRectIn.start_ / CopyRectAlignment[i];
- srcRect.end_ = srcRectIn.end_ / CopyRectAlignment[i];
-
- dstRect.rowPitch_ = dstRectIn.rowPitch_ / CopyRectAlignment[i];
- dstRect.slicePitch_ = dstRectIn.slicePitch_ / CopyRectAlignment[i];
- dstRect.start_ = dstRectIn.start_ / CopyRectAlignment[i];
- dstRect.end_ = dstRectIn.end_ / CopyRectAlignment[i];
-
- size.c[0] /= CopyRectAlignment[i];
-
- // Program the kernel's workload depending on the transfer dimensions
- if ((size[1] == 1) && (size[2] == 1)) {
- globalWorkSize[0] = amd::alignUp(size[0], 256);
- globalWorkSize[1] = 1;
- globalWorkSize[2] = 1;
- localWorkSize[0] = 256;
- localWorkSize[1] = 1;
- localWorkSize[2] = 1;
- }
- else if (size[2] == 1) {
- globalWorkSize[0] = amd::alignUp(size[0], 16);
- globalWorkSize[1] = amd::alignUp(size[1], 16);
- globalWorkSize[2] = 1;
- localWorkSize[0] = localWorkSize[1] = 16;
- localWorkSize[2] = 1;
- }
- else {
- globalWorkSize[0] = amd::alignUp(size[0], 8);
- globalWorkSize[1] = amd::alignUp(size[1], 8);
- globalWorkSize[2] = amd::alignUp(size[2], 4);
- localWorkSize[0] = localWorkSize[1] = 8;
- localWorkSize[2] = 4;
- }
-
-
- // Program kernels arguments for the blit operation
- cl_mem clmem = ((cl_mem) as_cl(srcMemory.owner()));
- kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem);
- clmem = ((cl_mem) as_cl(dstMemory.owner()));
- kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem);
- cl_ulong src[4] = {srcRect.rowPitch_,
- srcRect.slicePitch_,
- srcRect.start_, 0 };
- kernels_[blitType]->parameters().set(2, sizeof(src), src);
- cl_ulong dst[4] = {dstRect.rowPitch_,
- dstRect.slicePitch_,
- dstRect.start_, 0 };
- kernels_[blitType]->parameters().set(3, sizeof(dst), dst);
- cl_ulong copySize[4] = {size[0],
- size[1],
- size[2],
- CopyRectAlignment[i] };
- kernels_[blitType]->parameters().set(4, sizeof(copySize), copySize);
-
- // Create ND range object for the kernel's execution
- amd::NDRangeContainer ndrange(dim,
- globalWorkOffset, globalWorkSize, localWorkSize);
-
- // Execute the blit
- address parameters = kernels_[blitType]->parameters().capture(dev_);
- bool result = gpu().submitKernelInternal(
- ndrange, *kernels_[blitType], parameters, NULL);
- kernels_[blitType]->parameters().release(const_cast(parameters), dev_);
- return result;
-}
-
-bool
-KernelBlitManager::copyImageToBuffer(
- device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& size,
- bool entire,
- size_t rowPitch,
- size_t slicePitch) const
-{
- if (dstMemory.isHostMemDirectAccess()) {
- return HsaBlitManager::copyImageToBuffer(srcMemory, dstMemory, srcOrigin,
- dstOrigin, size, entire, rowPitch,
- slicePitch);
- }
-
- amd::Image::Format oldFormat = srcMemory.owner()->asImage()->getImageFormat();
- amd::Image::Format newFormat = filterFormat(oldFormat);
- bool useView = false;
-
- device::Memory* srcView = &srcMemory;
- if (oldFormat != newFormat) {
- srcView = createImageView(srcMemory, newFormat);
- useView = true;
- }
-
- roc::Image& srcImage = static_cast(*srcView);
-
- amd::Image* image = srcImage.owner()->asImage();
- uint blitType = 0;
- blitType = BlitCopyImageToBuffer;
- size_t dim = 0;
- size_t globalWorkOffset[3] = {0, 0, 0};
- size_t globalWorkSize[3];
- size_t localWorkSize[3];
-
- // Program the kernels workload depending on the blit dimensions
- const size_t imageDims = srcImage.owner()->asImage()->getDims();
- dim = 3;
- // Find the current blit type
- if (imageDims == 1) {
- globalWorkSize[0] = amd::alignUp(size[0], 256);
- globalWorkSize[1] = amd::alignUp(size[1], 1);
- globalWorkSize[2] = amd::alignUp(size[2], 1);
- localWorkSize[0] = 256;
- localWorkSize[1] = localWorkSize[2] = 1;
- } else if (imageDims == 2) {
- globalWorkSize[0] = amd::alignUp(size[0], 16);
- globalWorkSize[1] = amd::alignUp(size[1], 16);
- globalWorkSize[2] = amd::alignUp(size[2], 1);
- localWorkSize[0] = localWorkSize[1] = 16;
- localWorkSize[2] = 1;
- } else {
- globalWorkSize[0] = amd::alignUp(size[0], 8);
- globalWorkSize[1] = amd::alignUp(size[1], 8);
- globalWorkSize[2] = amd::alignUp(size[2], 4);
- localWorkSize[0] = localWorkSize[1] = 8;
- localWorkSize[2] = 4;
- }
-
- // Program kernels arguments for the blit operation
- cl_mem clmem = ((cl_mem)as_cl(srcImage.owner()));
- kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem);
- clmem = ((cl_mem)as_cl(dstMemory.owner()));
- kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem);
-
- // Update extra paramters for USHORT and UBYTE pointers.
- // Only then compiler can optimize the kernel to use
- // UAV Raw for other writes
- kernels_[blitType]->parameters().set(2, sizeof(cl_mem), &clmem);
- kernels_[blitType]->parameters().set(3, sizeof(cl_mem), &clmem);
-
- cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1],
- (cl_int)srcOrigin[2], 0};
- cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0};
-
- kernels_[blitType]->parameters().set(4, sizeof(srcOrg), srcOrg);
-
- const size_t elementSize =
- srcImage.owner()->asImage()->getImageFormat().getElementSize();
- const size_t numChannels =
- srcImage.owner()->asImage()->getImageFormat().getNumChannels();
-
- // 1 element granularity for writes by default
- cl_int granularity = 1;
- if (elementSize == 2) {
- granularity = 2;
- } else if (elementSize >= 4) {
- granularity = 4;
- }
- CondLog(((dstOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
- cl_ulong dstOrg[4] = {dstOrigin[0] / granularity, dstOrigin[1], dstOrigin[2],
- 0};
- kernels_[blitType]->parameters().set(5, sizeof(dstOrg), dstOrg);
- kernels_[blitType]->parameters().set(6, sizeof(copySize), copySize);
-
- // Program memory format
- uint multiplier = elementSize / sizeof(uint32_t);
- multiplier = (multiplier == 0) ? 1 : multiplier;
- cl_uint format[4] = {(cl_uint)numChannels,
- (cl_uint)(elementSize / numChannels), multiplier, 0};
- kernels_[blitType]->parameters().set(7, sizeof(format), format);
-
- // Program row and slice pitches
- cl_ulong pitch[4] = {0};
- CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, srcImage);
- kernels_[blitType]->parameters().set(8, sizeof(pitch), pitch);
-
- // Create ND range object for the kernel's execution
- amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize,
- localWorkSize);
-
- // Execute the blit
- address parameters = kernels_[blitType]->parameters().capture(dev_);
- bool result = gpu().submitKernelInternal(ndrange, *kernels_[blitType],
- parameters, NULL);
- kernels_[blitType]->parameters().release(const_cast(parameters),
- dev_);
-
- if (useView) {
- srcView->owner()->release();
- }
-
- return result;
-}
-
-bool KernelBlitManager::copyBufferToImage(device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& size, bool entire,
- size_t rowPitch,
- size_t slicePitch) const {
- if (srcMemory.isHostMemDirectAccess()) {
- return HsaBlitManager::copyBufferToImage(srcMemory, dstMemory, srcOrigin,
- dstOrigin, size, entire, rowPitch,
- slicePitch);
- }
-
- amd::Image::Format oldFormat = dstMemory.owner()->asImage()->getImageFormat();
- amd::Image::Format newFormat = filterFormat(oldFormat);
- bool useView = false;
-
- device::Memory* dstView = &dstMemory;
- if (oldFormat != newFormat) {
- dstView = createImageView(dstMemory, newFormat);
- useView = true;
- }
-
- roc::Image& dstImage = static_cast(*dstView);
-
- // Use a common blit type with three dimensions by default
- uint blitType = BlitCopyBufferToImage;
- size_t dim = 0;
- size_t globalWorkOffset[3] = {0, 0, 0};
- size_t globalWorkSize[3];
- size_t localWorkSize[3];
-
- // Program the kernels workload depending on the blit dimensions
- const size_t imageDims = dstImage.owner()->asImage()->getDims();
- dim = 3;
- if (imageDims == 1) {
- globalWorkSize[0] = amd::alignUp(size[0], 256);
- globalWorkSize[1] = amd::alignUp(size[1], 1);
- globalWorkSize[2] = amd::alignUp(size[2], 1);
- localWorkSize[0] = 256;
- localWorkSize[1] = localWorkSize[2] = 1;
- } else if (imageDims == 2) {
- globalWorkSize[0] = amd::alignUp(size[0], 16);
- globalWorkSize[1] = amd::alignUp(size[1], 16);
- globalWorkSize[2] = amd::alignUp(size[2], 1);
- localWorkSize[0] = localWorkSize[1] = 16;
- localWorkSize[2] = 1;
- } else {
- globalWorkSize[0] = amd::alignUp(size[0], 8);
- globalWorkSize[1] = amd::alignUp(size[1], 8);
- globalWorkSize[2] = amd::alignUp(size[2], 4);
- localWorkSize[0] = localWorkSize[1] = 8;
- localWorkSize[2] = 4;
- }
-
- // Program kernels arguments for the blit operation
- cl_mem clmem = ((cl_mem)as_cl(srcMemory.owner()));
- kernels_[blitType]->parameters().set(0, sizeof(cl_mem), &clmem);
- clmem = ((cl_mem)as_cl(dstImage.owner()));
- kernels_[blitType]->parameters().set(1, sizeof(cl_mem), &clmem);
-
- const size_t elementSize =
- dstImage.owner()->asImage()->getImageFormat().getElementSize();
- const size_t numChannels =
- dstImage.owner()->asImage()->getImageFormat().getNumChannels();
-
- // 1 element granularity for writes by default
- cl_int granularity = 1;
- if (elementSize == 2) {
- granularity = 2;
- } else if (elementSize >= 4) {
- granularity = 4;
- }
- CondLog(((srcOrigin[0] % granularity) != 0), "Unaligned offset in blit!");
- cl_ulong srcOrg[4] = {srcOrigin[0] / granularity, srcOrigin[1], srcOrigin[2],
- 0};
- kernels_[blitType]->parameters().set(2, sizeof(srcOrg), srcOrg);
-
- cl_int dstOrg[4] = {(cl_int)dstOrigin[0], (cl_int)dstOrigin[1],
- (cl_int)dstOrigin[2], 0};
- cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0};
-
- kernels_[blitType]->parameters().set(3, sizeof(dstOrg), dstOrg);
- kernels_[blitType]->parameters().set(4, sizeof(copySize), copySize);
-
- // Program memory format
- uint multiplier = elementSize / sizeof(uint32_t);
- multiplier = (multiplier == 0) ? 1 : multiplier;
- cl_uint format[4] = {(cl_uint)numChannels,
- (cl_uint)(elementSize / numChannels), multiplier, 0};
- kernels_[blitType]->parameters().set(5, sizeof(format), format);
-
- // Program row and slice pitches
- cl_ulong pitch[4] = {0};
- CalcRowSlicePitches(pitch, copySize, rowPitch, slicePitch, dstImage);
- kernels_[blitType]->parameters().set(6, sizeof(pitch), pitch);
-
- // Create ND range object for the kernel's execution
- amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize,
- localWorkSize);
-
- // Execute the blit
- address parameters = kernels_[blitType]->parameters().capture(dev_);
- bool result = gpu().submitKernelInternal(ndrange, *kernels_[blitType],
- parameters, NULL);
- kernels_[blitType]->parameters().release(const_cast(parameters),
- dev_);
-
- if (useView) {
- dstView->owner()->release();
- }
-
- return result;
-}
-
-bool
-KernelBlitManager::copyImage(
- device::Memory& srcMemory,
- device::Memory& dstMemory,
- const amd::Coord3D& srcOrigin,
- const amd::Coord3D& dstOrigin,
- const amd::Coord3D& size,
- bool entire) const
-{
- return HsaBlitManager::copyImage(
- srcMemory, dstMemory, srcOrigin, dstOrigin, size, entire);
-}
-
-bool
-KernelBlitManager::fillBuffer(
- device::Memory& memory,
- const void* pattern,
- size_t patternSize,
- const amd::Coord3D& origin,
- const amd::Coord3D& size,
- bool entire
- ) const
-{
- if (setup_.disableFillBuffer_ || memory.isHostMemDirectAccess()) {
- return HostBlitManager::fillBuffer(memory, pattern, patternSize, origin,
- size, entire);
- }
-
- uint fillType = FillBuffer;
- size_t globalWorkOffset[3] = { 0, 0, 0 };
- cl_ulong fillSize = size[0] / patternSize;
- size_t globalWorkSize = amd::alignUp(fillSize, 256);
- size_t localWorkSize = 256;
- bool dwordAligned =
- ((patternSize % sizeof(uint32_t)) == 0) ? true : false;
-
- // Program kernels arguments for the fill operation
- if (dwordAligned) {
- kernels_[fillType]->parameters().set(0, sizeof(cl_mem), NULL);
- cl_mem clmem = ((cl_mem) as_cl(memory.owner()));
- kernels_[fillType]->parameters().set(1, sizeof(cl_mem), &clmem);
- }
- else {
- cl_mem clmem = ((cl_mem) as_cl(memory.owner()));
- kernels_[fillType]->parameters().set(0, sizeof(cl_mem), &clmem);
- kernels_[fillType]->parameters().set(1, sizeof(cl_mem), NULL);
- }
-
- amd::Buffer *fillMemory =
- new (*context_) amd::Buffer(*context_, CL_MEM_USE_HOST_PTR, patternSize);
-
- if (!fillMemory->create(const_cast(pattern))) {
- LogError("[OCL] Fail to create mem object for destination");
- return false;
- }
-
- if (fillMemory->getDeviceMemory(dev_) == NULL) {
- LogError("[OCL] Fail to create device mem object for destination");
- return false;
- }
-
- cl_mem clmem = ((cl_mem) as_cl(fillMemory));
- kernels_[fillType]->parameters().set(2, sizeof(cl_mem), &clmem);
- cl_ulong offset = origin[0];
- if (dwordAligned) {
- patternSize /= sizeof(uint32_t);
- offset /= sizeof(uint32_t);
- }
- kernels_[fillType]->parameters().set(3, sizeof(cl_uint), &patternSize);
- kernels_[fillType]->parameters().set(4, sizeof(offset), &offset);
- kernels_[fillType]->parameters().set(5, sizeof(fillSize), &fillSize);
-
- // Create ND range object for the kernel's execution
- amd::NDRangeContainer ndrange(1,
- globalWorkOffset, &globalWorkSize, &localWorkSize);
-
- // Execute the blit
- address parameters = kernels_[fillType]->parameters().capture(dev_);
- bool result = gpu().submitKernelInternal(
- ndrange, *kernels_[fillType], parameters, NULL);
- kernels_[fillType]->parameters().release(const_cast(parameters), dev_);
-
- // Wait for the transfer to finish so that we could safely release the
- // fill memory object.
- // TODO: we could remove this if issue on implicit memory registration is
- // fixed by KFD, so that we could pass the pattern as SVM.
- gpu().releaseGpuMemoryFence();
-
- fillMemory->release();
-
- return result;
-}
-
-bool
-KernelBlitManager::fillImage(
- device::Memory& memory,
- const void* pattern,
- const amd::Coord3D& origin,
- const amd::Coord3D& size,
- bool entire
- ) const
-{
- return HsaBlitManager::fillImage(memory, pattern, origin, size, entire);
-}
-
-bool
-KernelBlitManager::create(amd::Device& device)
-{
- if (!HsaBlitManager::create(device)) {
- return false;
- }
- if (!createProgram(static_cast(device))) {
- return false;
- }
-
- return true;
-}
-
-bool
-KernelBlitManager::createProgram(Device& device)
-{
- // Save context and program for this device
- context_ = device.blitProgram()->context_;
- context_->retain();
- program_ = device.blitProgram()->program_;
- program_->retain();
-
- bool result = false;
- do {
- // Create kernel objects for all blits
- for (uint i = 0; i < BlitTotal; ++i) {
- const amd::Symbol* symbol = program_->findSymbol(BlitName[i]);
- if (symbol == NULL) {
- break;
- }
- kernels_[i] = new amd::Kernel(*program_, *symbol, BlitName[i]);
- if (kernels_[i] == NULL) {
- break;
- }
- }
-
- result = true;
- } while(!result);
-
- return result;
-}
-
-} // namespace roc
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp b/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp
deleted file mode 100644
index 16d1ef2363..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rocblit.hpp
+++ /dev/null
@@ -1,412 +0,0 @@
-//
-// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved.
-//
-#pragma once
-
-#include "top.hpp"
-#include "platform/command.hpp"
-#include "platform/commandqueue.hpp"
-#include "device/device.hpp"
-#include "device/blit.hpp"
-
-/*! \addtogroup HSA Blit Implementation
- * @{
- */
-
-//! HSA Blit Manager Implementation
-namespace roc {
-
-class Device;
-class Kernel;
-class Memory;
-class VirtualGPU;
-
-//! DMA Blit Manager
-class HsaBlitManager : public device::HostBlitManager
-{
-public:
- //! Constructor
- HsaBlitManager(
- device::VirtualDevice& vdev, //!< Virtual GPU to be used for blits
- Setup setup = Setup() //!< Specifies HW accelerated blits
- );
-
- //! Destructor
- virtual ~HsaBlitManager() {
- if (completion_signal_.handle != 0) {
- hsa_signal_destroy(completion_signal_);
- }
- }
-
- //! Creates HostBlitManager object
- virtual bool create(amd::Device& device) {
- if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, NULL, &completion_signal_)) {
- return false;
- }
- return true;
- }
-
- //! Copies a buffer object to system memory
- virtual bool readBuffer(
- device::Memory& srcMemory, //!< Source memory object
- void* dstHost, //!< Destination host memory
- const amd::Coord3D& origin, //!< Source origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies a buffer object to system memory
- virtual bool readBufferRect(
- device::Memory& srcMemory, //!< Source memory object
- void* dstHost, //!< Destinaiton host memory
- const amd::BufferRect& bufRect, //!< Source rectangle
- const amd::BufferRect& hostRect, //!< Destination rectangle
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies an image object to system memory
- virtual bool readImage(
- device::Memory& srcMemory, //!< Source memory object
- void* dstHost, //!< Destination host memory
- const amd::Coord3D& origin, //!< Source origin
- const amd::Coord3D& size, //!< Size of the copy region
- size_t rowPitch, //!< Row pitch for host memory
- size_t slicePitch, //!< Slice pitch for host memory
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies system memory to a buffer object
- virtual bool writeBuffer(
- const void* srcHost, //!< Source host memory
- device::Memory& dstMemory, //!< Destination memory object
- const amd::Coord3D& origin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies system memory to a buffer object
- virtual bool writeBufferRect(
- const void* srcHost, //!< Source host memory
- device::Memory& dstMemory, //!< Destination memory object
- const amd::BufferRect& hostRect, //!< Destination rectangle
- const amd::BufferRect& bufRect, //!< Source rectangle
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies system memory to an image object
- virtual bool writeImage(
- const void* srcHost, //!< Source host memory
- device::Memory& dstMemory, //!< Destination memory object
- const amd::Coord3D& origin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- size_t rowPitch, //!< Row pitch for host memory
- size_t slicePitch, //!< Slice pitch for host memory
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies a buffer object to another buffer object
- virtual bool copyBuffer(
- device::Memory& srcMemory, //!< Source memory object
- device::Memory& dstMemory, //!< Destination memory object
- const amd::Coord3D& srcOrigin, //!< Source origin
- const amd::Coord3D& dstOrigin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies a buffer object to another buffer object
- virtual bool copyBufferRect(
- device::Memory& srcMemory, //!< Source memory object
- device::Memory& dstMemory, //!< Destination memory object
- const amd::BufferRect& srcRect, //!< Source rectangle
- const amd::BufferRect& dstRect, //!< Destination rectangle
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies an image object to a buffer object
- virtual bool copyImageToBuffer(
- device::Memory& srcMemory, //!< Source memory object
- device::Memory& dstMemory, //!< Destination memory object
- const amd::Coord3D& srcOrigin, //!< Source origin
- const amd::Coord3D& dstOrigin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false, //!< Entire buffer will be updated
- size_t rowPitch = 0, //!< Pitch for buffer
- size_t slicePitch = 0 //!< Slice for buffer
- ) const;
-
- //! Copies a buffer object to an image object
- virtual bool copyBufferToImage(
- device::Memory& srcMemory, //!< Source memory object
- device::Memory& dstMemory, //!< Destination memory object
- const amd::Coord3D& srcOrigin, //!< Source origin
- const amd::Coord3D& dstOrigin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false, //!< Entire buffer will be updated
- size_t rowPitch = 0, //!< Pitch for buffer
- size_t slicePitch = 0 //!< Slice for buffer
- ) const;
-
- //! Copies an image object to another image object
- virtual bool copyImage(
- device::Memory& srcMemory, //!< Source memory object
- device::Memory& dstMemory, //!< Destination memory object
- const amd::Coord3D& srcOrigin, //!< Source origin
- const amd::Coord3D& dstOrigin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Fills a buffer memory with a pattern data
- virtual bool fillBuffer(
- device::Memory& memory, //!< Memory object to fill with pattern
- const void* pattern, //!< Pattern data
- size_t patternSize, //!< Pattern size
- const amd::Coord3D& origin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Fills an image memory with a pattern data
- virtual bool fillImage(
- device::Memory& dstMemory, //!< Memory object to fill with pattern
- const void* pattern, //!< Pattern data
- const amd::Coord3D& origin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
-protected:
- //! Returns the virtual GPU object
- VirtualGPU& gpu() const { return static_cast(vDev_); }
-
-private:
- //! Handle of Hsa Device object
- const roc::Device& roc_device_;
-
- hsa_signal_t completion_signal_;
-
- //! Assits in transferring data from Host to Local or vice versa
- //! taking into account the Hsail profile supported by Hsa Agent
- bool hsaCopy(
- const void *hostSrc, //!< Contains source data to be copied
- void *hostDst, //!< Destination buffer address for copying
- uint32_t size, //!< Size of data to copy in bytes
- bool hostToDev //!< True if data is copied from Host To Device
- ) const;
-
- //! Disable copy constructor
- HsaBlitManager(const HsaBlitManager&);
-
- //! Disable operator=
- HsaBlitManager& operator=(const HsaBlitManager&);
-};
-
-//! Kernel Blit Manager
-//class KernelBlitManager : public HsaBlitManager
-class KernelBlitManager : public HsaBlitManager
-{
-private:
- VirtualGPU& gpu() const { return static_cast(vDev_); }
-public:
- enum {
- BlitCopyImage = 0,
- BlitCopyImage1DA,
- BlitCopyImageToBuffer,
- BlitCopyBufferToImage,
- BlitCopyBufferRect,
- BlitCopyBufferRectAligned,
- BlitCopyBuffer,
- BlitCopyBufferAligned,
- FillBuffer,
- FillImage,
- BlitTotal
- };
-
- //! Constructor
- KernelBlitManager(
- device::VirtualDevice& vdev, //!< Virtual GPU to be used for blits
- Setup setup = Setup() //!< Specifies HW accelerated blits
- );
-
- //! Destructor
- virtual ~KernelBlitManager();
-
- //! Creates HostBlitManager object
- virtual bool create(amd::Device& device);
-
- //! Copies a buffer object to system memory
- virtual bool readBuffer(
- device::Memory& srcMemory, //!< Source memory object
- void* dstHost, //!< Destination host memory
- const amd::Coord3D& origin, //!< Source origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies a buffer object to system memory
- virtual bool readBufferRect(
- device::Memory& srcMemory, //!< Source memory object
- void* dstHost, //!< Destinaiton host memory
- const amd::BufferRect& bufRect, //!< Source rectangle
- const amd::BufferRect& hostRect, //!< Destination rectangle
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies an image object to system memory
- virtual bool readImage(
- device::Memory& srcMemory, //!< Source memory object
- void* dstHost, //!< Destination host memory
- const amd::Coord3D& origin, //!< Source origin
- const amd::Coord3D& size, //!< Size of the copy region
- size_t rowPitch, //!< Row pitch for host memory
- size_t slicePitch, //!< Slice pitch for host memory
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies system memory to a buffer object
- virtual bool writeBuffer(
- const void* srcHost, //!< Source host memory
- device::Memory& dstMemory, //!< Destination memory object
- const amd::Coord3D& origin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies system memory to a buffer object
- virtual bool writeBufferRect(
- const void* srcHost, //!< Source host memory
- device::Memory& dstMemory, //!< Destination memory object
- const amd::BufferRect& hostRect, //!< Destination rectangle
- const amd::BufferRect& bufRect, //!< Source rectangle
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies system memory to an image object
- virtual bool writeImage(
- const void* srcHost, //!< Source host memory
- device::Memory& dstMemory, //!< Destination memory object
- const amd::Coord3D& origin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- size_t rowPitch, //!< Row pitch for host memory
- size_t slicePitch, //!< Slice pitch for host memory
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies a buffer object to another buffer object
- virtual bool copyBuffer(
- device::Memory& srcMemory, //!< Source memory object
- device::Memory& dstMemory, //!< Destination memory object
- const amd::Coord3D& srcOrigin, //!< Source origin
- const amd::Coord3D& dstOrigin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies a buffer object to another buffer object
- virtual bool copyBufferRect(
- device::Memory& srcMemory, //!< Source memory object
- device::Memory& dstMemory, //!< Destination memory object
- const amd::BufferRect& srcRect, //!< Source rectangle
- const amd::BufferRect& dstRect, //!< Destination rectangle
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Copies an image object to a buffer object
- virtual bool copyImageToBuffer(
- device::Memory& srcMemory, //!< Source memory object
- device::Memory& dstMemory, //!< Destination memory object
- const amd::Coord3D& srcOrigin, //!< Source origin
- const amd::Coord3D& dstOrigin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false, //!< Entire buffer will be updated
- size_t rowPitch = 0, //!< Pitch for buffer
- size_t slicePitch = 0 //!< Slice for buffer
- ) const;
-
- //! Copies a buffer object to an image object
- virtual bool copyBufferToImage(
- device::Memory& srcMemory, //!< Source memory object
- device::Memory& dstMemory, //!< Destination memory object
- const amd::Coord3D& srcOrigin, //!< Source origin
- const amd::Coord3D& dstOrigin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false, //!< Entire buffer will be updated
- size_t rowPitch = 0, //!< Pitch for buffer
- size_t slicePitch = 0 //!< Slice for buffer
- ) const;
-
- //! Copies an image object to another image object
- virtual bool copyImage(
- device::Memory& srcMemory, //!< Source memory object
- device::Memory& dstMemory, //!< Destination memory object
- const amd::Coord3D& srcOrigin, //!< Source origin
- const amd::Coord3D& dstOrigin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Fills a buffer memory with a pattern data
- virtual bool fillBuffer(
- device::Memory& memory, //!< Memory object to fill with pattern
- const void* pattern, //!< Pattern data
- size_t patternSize, //!< Pattern size
- const amd::Coord3D& origin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
- //! Fills an image memory with a pattern data
- virtual bool fillImage(
- device::Memory& dstMemory, //!< Memory object to fill with pattern
- const void* pattern, //!< Pattern data
- const amd::Coord3D& origin, //!< Destination origin
- const amd::Coord3D& size, //!< Size of the copy region
- bool entire = false //!< Entire buffer will be updated
- ) const;
-
-private:
- //! Disable copy constructor
- KernelBlitManager(const KernelBlitManager&);
-
- //! Disable operator=
- KernelBlitManager& operator=(const KernelBlitManager&);
-
- //! Creates a program for all blit operations
- bool createProgram(
- Device& device //!< Device object
- );
-
- amd::Image::Format filterFormat(amd::Image::Format oldFormat) const;
-
- device::Memory *createImageView(
- device::Memory &parent,
- amd::Image::Format newFormat) const;
-
- amd::Context *context_; //!< A dummy context
- amd::Program *program_; //!< GPU program obejct
- amd::Kernel *kernels_[BlitTotal]; //!< GPU kernels for blit
-};
-
-static const char* BlitName[KernelBlitManager::BlitTotal] = {
- "copyImage",
- "copyImage1DA",
- "copyImageToBuffer",
- "copyBufferToImage",
- "copyBufferRect",
- "copyBufferRectAligned",
- "copyBuffer",
- "copyBufferAligned",
- "fillBuffer",
- "fillImage"
- };
-
-/*@}*/
-} // namespace roc
-
diff --git a/projects/clr/rocclr/runtime/device/rocm/roccompiler.cpp b/projects/clr/rocclr/runtime/device/rocm/roccompiler.cpp
deleted file mode 100644
index 6e89f63b08..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/roccompiler.cpp
+++ /dev/null
@@ -1,160 +0,0 @@
-//
-// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
-//
-#ifndef WITHOUT_HSA_BACKEND
-
-#include
-#include
-#include
-#include
-
-#include "os/os.hpp"
-#include "rocdevice.hpp"
-#include "rocprogram.hpp"
-#include "roccompilerlib.hpp"
-#include "utils/options.hpp"
-#include
-
-//CLC_IN_PROCESS_CHANGE
-extern int openclFrontEnd(const char* cmdline, std::string*, std::string* typeInfo = NULL);
-
-namespace roc {
-
-/* Temporary log function for the compiler library */
-static void logFunction(const char* msg, size_t size)
-{
- std::cout<< "Compiler Log: " << msg << std::endl;
-}
-
-static int programsCount = 0;
-
-bool
-HSAILProgram::compileImpl(const std::string& sourceCode,
- const std::vector& headers,
- const char** headerIncludeNames,
- amd::option::Options* options)
-{
- acl_error errorCode;
- aclTargetInfo target;
-
- //Defaulting to bonaire
- //Todo (sramalin) : Query the device for asic type-
- //Defaulting to Bonair for now.
- target = g_complibApi._aclGetTargetInfo(LP64_SWITCH("hsail","hsail64"), "Bonaire",
- &errorCode);
-
- //end if asic info is ready
- // We dump the source code for each program (param: headers)
- // into their filenames (headerIncludeNames) into the TEMP
- // folder specific to the OS and add the include path while
- // compiling
-
- //Find the temp folder for the OS
- std::string tempFolder = amd::Os::getEnvironment("TEMP");
- if (tempFolder.empty()) {
- tempFolder = amd::Os::getEnvironment("TMP");
- if (tempFolder.empty()) {
- tempFolder = WINDOWS_SWITCH(".","/tmp");;
- }
- }
- //Iterate through each source code and dump it into tmp
- std::fstream f;
- std::vector headerFileNames(headers.size());
- std::vector newDirs;
- for (size_t i = 0; i < headers.size(); ++i) {
- std::string headerPath = tempFolder;
- std::string headerIncludeName(headerIncludeNames[i]);
- // replace / in path with current os's file separator
- if ( amd::Os::fileSeparator() != '/') {
- for (std::string::iterator it = headerIncludeName.begin(),
- end = headerIncludeName.end();
- it != end;
- ++it) {
- if (*it == '/') *it = amd::Os::fileSeparator();
- }
- }
- size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator());
- if (pos != std::string::npos) {
- headerPath += amd::Os::fileSeparator();
- headerPath += headerIncludeName.substr(0, pos);
- headerIncludeName = headerIncludeName.substr(pos+1);
- }
- if (!amd::Os::pathExists(headerPath)) {
- bool ret = amd::Os::createPath(headerPath);
- assert(ret && "failed creating path!");
- newDirs.push_back(headerPath);
- }
- std::string headerFullName
- = headerPath + amd::Os::fileSeparator() + headerIncludeName;
- headerFileNames[i] = headerFullName;
- f.open(headerFullName.c_str(), std::fstream::out);
- //Should we allow asserts
- assert(!f.fail() && "failed creating header file!");
- f.write(headers[i]->c_str(), headers[i]->length());
- f.close();
- }
-
- //Create Binary
- binaryElf_ = g_complibApi._aclBinaryInit(sizeof(aclBinary),
- &target,
- &binOpts_,
- &errorCode);
-
- if( errorCode!=ACL_SUCCESS ) {
- buildLog_ += "Error while compiling opencl source:\
- aclBinary init failure \n";
- LogWarning("aclBinaryInit failed");
- return false;
- }
-
- //Insert opencl into binary
- errorCode = g_complibApi._aclInsertSection(device().compiler(),
- binaryElf_,
- sourceCode.c_str(),
- strlen(sourceCode.c_str()),
- aclSOURCE);
-
- if ( errorCode != ACL_SUCCESS ) {
- buildLog_ += "Error while converting to BRIG: \
- Inserting openCl Source \n";
- }
-
- //Set the options for the compiler
- //Set the include path for the temp folder that contains the includes
- if(!headers.empty()) {
- this->compileOptions_.append(" -I");
- this->compileOptions_.append(tempFolder);
- }
-
- //Add only for CL2.0 and later
- if (options->oVariables->CLStd[2] >= '2') {
- std::stringstream opts;
- opts << " -D" << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE="
- << device().info().maxGlobalVariableSize_;
- compileOptions_.append(opts.str());
- }
-
- //Compile source to IR
- this->compileOptions_.append(hsailOptions());
-
- errorCode = g_complibApi._aclCompile(device().compiler(),
- binaryElf_,
- //"-Wf,--support_all_extensions",
- this->compileOptions_.c_str(),
- ACL_TYPE_OPENCL,
- ACL_TYPE_LLVMIR_BINARY,
- logFunction);
- buildLog_ += g_complibApi._aclGetCompilerLog(device().compiler());
- if( errorCode!=ACL_SUCCESS ) {
- LogWarning("aclCompile failed");
- buildLog_ += "Error while compiling \
- opencl source: Compiling CL to IR";
- return false;
- }
- // Save the binary in the interface class
- saveBinaryAndSetType(TYPE_COMPILED);
- return true;
-
-}
-}
-#endif // WITHOUT_GPU_BACKEND
diff --git a/projects/clr/rocclr/runtime/device/rocm/roccompilerlib.cpp b/projects/clr/rocclr/runtime/device/rocm/roccompilerlib.cpp
deleted file mode 100644
index e933d5c393..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/roccompilerlib.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-#include "roccompilerlib.hpp"
-#include "utils/flags.hpp"
-
-#include "acl.h"
-
-namespace roc {
-
-void* g_complibModule = NULL;
-struct CompLibApi g_complibApi;
-
-//
-// g_complibModule is defined in LoadCompLib(). This macro must be used only in LoadCompLib() function.
-//
-#define LOADSYMBOL(api) \
- g_complibApi._##api = (pfn_##api) amd::Os::getSymbol(g_complibModule, #api); \
- if( g_complibApi._##api == NULL ) { \
- LogError ("amd::Os::getSymbol() for exported func " #api " failed."); \
- amd::Os::unloadLibrary(g_complibModule); \
- return false; \
- }
-
-bool LoadCompLib(bool offline)
-{
- g_complibModule = amd::Os::loadLibrary("amdhsacl" LP64_SWITCH(LINUX_SWITCH("32",""), "64"));
- if( g_complibModule == NULL ) {
- if (!offline) {
- LogError( "amd::Os::loadLibrary() for loading of amdhsacl.dll failed.");
- }
- return false;
- }
-
- LOADSYMBOL(aclCompilerInit)
- LOADSYMBOL(aclGetTargetInfo)
- LOADSYMBOL(aclBinaryInit)
- LOADSYMBOL(aclInsertSection)
- LOADSYMBOL(aclCompile)
- LOADSYMBOL(aclCompilerFini)
- LOADSYMBOL(aclBinaryFini)
- LOADSYMBOL(aclWriteToMem)
- LOADSYMBOL(aclQueryInfo)
- LOADSYMBOL(aclExtractSymbol)
- LOADSYMBOL(aclGetCompilerLog)
- LOADSYMBOL(aclCreateFromBinary)
- LOADSYMBOL(aclReadFromMem)
- LOADSYMBOL(aclBinaryVersion)
- LOADSYMBOL(aclLink)
-
- return true;
-}
-
-void UnloadCompLib()
-{
- if( g_complibModule )
- {
- amd::Os::unloadLibrary(g_complibModule);
- }
-}
-
-} // namespace roc
diff --git a/projects/clr/rocclr/runtime/device/rocm/roccompilerlib.hpp b/projects/clr/rocclr/runtime/device/rocm/roccompilerlib.hpp
deleted file mode 100644
index ca31bd9216..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/roccompilerlib.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-#pragma once
-
-//
-// This file hsa the code for explicity loading amdoclcl.dll.
-// Exported functions from amdoclcl.dll can be added for usage as need-basis.
-// With explicit/dynamic loading roc will not have any linkage to amdoclcl.lib.
-//
-
-#include "thread/thread.hpp"
-#include "acl.h"
-#include "utils/debug.hpp"
-
-using namespace amd;
-
-namespace roc {
-
-//
-// To use any new exported function from amdhsacl.dll please add/make that function specific changes
-// in typedef below, struct CompLibApi and in hsacompilerLib.cpp::LoadCompLib() function.
-//
-
-//
-// Convention: The typedefed function name must be prefixed with pfn_
-//
-typedef aclCompiler* (ACL_API_ENTRY *pfn_aclCompilerInit) (aclCompilerOptions *opts, acl_error *error_code);
-typedef aclTargetInfo (ACL_API_ENTRY *pfn_aclGetTargetInfo) (const char*, const char*, acl_error*);
-typedef aclBinary* (ACL_API_ENTRY *pfn_aclBinaryInit) (size_t, const aclTargetInfo*, const aclBinaryOptions*, acl_error*);
-typedef acl_error (ACL_API_ENTRY *pfn_aclInsertSection) (aclCompiler *cl, aclBinary *binary, const void *data, size_t data_size, aclSections id);
-typedef acl_error (ACL_API_ENTRY *pfn_aclCompile) (aclCompiler *cl, aclBinary *bin, const char *options, aclType from, aclType to, aclLogFunction compile_callback);
-typedef acl_error (ACL_API_ENTRY *pfn_aclCompilerFini) (aclCompiler *cl);
-typedef acl_error (ACL_API_ENTRY *pfn_aclBinaryFini) (aclBinary *bin);
-typedef acl_error (ACL_API_ENTRY *pfn_aclWriteToMem) (aclBinary *bin,void **mem, size_t *size);
-typedef acl_error (ACL_API_ENTRY *pfn_aclQueryInfo) (aclCompiler *cl, const aclBinary *binary, aclQueryType query, const char *kernel, void *data_ptr, size_t *ptr_size);
-typedef const void* (ACL_API_ENTRY *pfn_aclExtractSymbol) (aclCompiler *cl,const aclBinary *binary,size_t *size,aclSections id,const char *symbol,acl_error *error_code);
-typedef aclBinary* (ACL_API_ENTRY *pfn_aclReadFromMem) (void *mem,size_t size, acl_error *error_code);
-typedef char* (ACL_API_ENTRY *pfn_aclGetCompilerLog) (aclCompiler* cl);
-typedef aclBinary* (ACL_API_ENTRY *pfn_aclCreateFromBinary) (const aclBinary *binary,aclBIFVersion version);
-typedef aclBIFVersion (ACL_API_ENTRY *pfn_aclBinaryVersion) (const aclBinary *binary);
-typedef acl_error (ACL_API_ENTRY *pfn_aclLink) (aclCompiler* cl, aclBinary *src_bin, unsigned int num_libs, aclBinary **libs, aclType link_mode,const char* options, aclLogFunction link_callback);
-//
-// Convention: prefix struct member variable with with underscore '_'
-// would be nice if there was no underscore prfix, but on Linux the token
-// pasting in the macro is srtict and his is the workaround.
-//
-struct CompLibApi
-{
- pfn_aclCompilerInit _aclCompilerInit;
- pfn_aclGetTargetInfo _aclGetTargetInfo;
- pfn_aclBinaryInit _aclBinaryInit;
- pfn_aclInsertSection _aclInsertSection;
- pfn_aclCompile _aclCompile;
- pfn_aclCompilerFini _aclCompilerFini;
- pfn_aclBinaryFini _aclBinaryFini;
- pfn_aclWriteToMem _aclWriteToMem;
- pfn_aclQueryInfo _aclQueryInfo;
- pfn_aclExtractSymbol _aclExtractSymbol;
- pfn_aclReadFromMem _aclReadFromMem;
- pfn_aclGetCompilerLog _aclGetCompilerLog;
- pfn_aclCreateFromBinary _aclCreateFromBinary;
- pfn_aclBinaryVersion _aclBinaryVersion;
- pfn_aclLink _aclLink;
-};
-
-
-//
-// Use g_ prefix for all global variables.
-//
-extern void* g_complibModule;
-extern CompLibApi g_complibApi;
-
-// Note: initializes global variable g_complibApi.
-// Not sure what error values we have, for now returning false on failure.
-bool LoadCompLib(bool isOfflineDevice=false);
-void UnloadCompLib();
-
-} // namespace roc
-
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp
deleted file mode 100644
index d94a10a1d7..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rocdefs.hpp
+++ /dev/null
@@ -1,49 +0,0 @@
-#pragma once
-
-#ifndef WITHOUT_HSA_BACKEND
-
-namespace roc {
-
-typedef uint HsaDeviceId;
-
-struct AMDDeviceInfo {
- HsaDeviceId hsaDeviceId_; //!< Machine id
- const char* targetName_; //!< Target name for compilation
- const char* machineTarget_; //!< Machine target
- uint simdPerCU_; //!< Number of SIMDs per CU
- uint simdWidth_; //!< Number of workitems processed per SIMD
- uint simdInstructionWidth_; //!< Number of instructions processed per SIMD
- uint memChannelBankWidth_; //!< Memory channel bank width
- uint localMemSizePerCU_; //!< Local memory size per CU
- uint localMemBanks_; //!< Number of banks of local memory
- uint pciDeviceId; //!< PCIe device id
-};
-
-//The device ID must match with the device's index into DeviceInfo
-const HsaDeviceId HSA_SPECTRE_ID = 0;
-const HsaDeviceId HSA_SPOOKY_ID = 1;
-const HsaDeviceId HSA_TONGA_ID = 2;
-const HsaDeviceId HSA_CARRIZO_ID = 3;
-const HsaDeviceId HSA_ICELAND_ID = 4;
-const HsaDeviceId HSA_FIJI_ID = 5;
-const HsaDeviceId HSA_HAWAII_ID = 6;
-const HsaDeviceId HSA_ELLESMERE_ID = 7;
-const HsaDeviceId HSA_BAFFIN_ID = 8;
-const HsaDeviceId HSA_INVALID_DEVICE_ID = -1;
-
-static const AMDDeviceInfo DeviceInfo[] = {
- // targetName machineTarget
- /* TARGET_KAVERI_SPECTRE */ {HSA_SPECTRE_ID, "Spectre", "Spectre", 4, 16, 1, 256, 64 * Ki, 32, 0 },
- /* TARGET_KAVERI_SPOOKY */ {HSA_SPOOKY_ID, "Spooky", "Spooky", 4, 16, 1, 256, 64 * Ki, 32, 0 },
- /* TARGET_TONGA */ {HSA_TONGA_ID, "Tonga", "Tonga", 4, 16, 1, 256, 64 * Ki, 32, 0},
- /* TARGET_CARRIZO */ {HSA_CARRIZO_ID, "Carrizo", "Carrizo", 4, 16, 1, 256, 64 * Ki, 32, 0},
- /* TARGET_ICELAND */ {HSA_ICELAND_ID, "Topaz", "Topaz", 4, 16, 1, 256, 64 * Ki, 32, 0},
- /* TARGET_FIJI */ {HSA_FIJI_ID, "Fiji", "Fiji", 4, 16, 1, 256, 64 * Ki, 32, 0 },
- /* TARGET HAWAII */ {HSA_HAWAII_ID, "Hawaii", "Hawaii", 4, 16, 1, 256, 64 * Ki, 32, 0 },
- /* TARGET ELLESMERE */ {HSA_ELLESMERE_ID, "Ellesmere", "Ellesmere", 4, 16, 1, 256, 64 * Ki, 32, 0 },
- /* TARGET BAFFIN */ {HSA_BAFFIN_ID, "Baffin", "Baffin", 4, 16, 1, 256, 64 * Ki, 32, 0 }
-};
-
-}
-#endif
-
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
deleted file mode 100644
index 71a0f7d1c5..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
+++ /dev/null
@@ -1,1296 +0,0 @@
-//
-// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
-//
-
-#ifndef WITHOUT_HSA_BACKEND
-
-#include "platform/program.hpp"
-#include "platform/kernel.hpp"
-#include "os/os.hpp"
-#include "utils/debug.hpp"
-#include "utils/flags.hpp"
-#include "utils/versions.hpp"
-#include "thread/monitor.hpp"
-#include "CL/cl_ext.h"
-
-#include "amdocl/cl_common.hpp"
-#include "device/rocm/rocdevice.hpp"
-#include "device/rocm/rocblit.hpp"
-#include "device/rocm/rocvirtual.hpp"
-#include "device/rocm/rocprogram.hpp"
-#include "device/rocm/roccompilerlib.hpp"
-#include "device/rocm/rocmemory.hpp"
-#include "device/rocm/rocglinterop.hpp"
-#include "kv_id.h"
-#include "vi_id.h"
-#include "cz_id.h"
-#include "ci_id.h"
-#include
-#include
-#include
-#include
-#include
-#include
-#endif // WITHOUT_HSA_BACKEND
-
-#define OPENCL_VERSION_STR XSTR(OPENCL_MAJOR) "." XSTR(OPENCL_MINOR)
-
-#ifndef WITHOUT_HSA_BACKEND
-namespace device {
-extern const char* BlitSourceCode;
-}
-
-namespace roc {
-aclCompiler* NullDevice::compilerHandle_;
-bool roc::Device::isHsaInitialized_ = false;
-hsa_agent_t roc::Device::cpu_agent_ = { 0 };
-std::vector roc::Device::gpu_agents_;
-const bool roc::Device::offlineDevice_ = false;
-const bool roc::NullDevice::offlineDevice_= true;
-
-
-static HsaDeviceId getHsaDeviceId(hsa_agent_t device, uint32_t& pci_id) {
- /*
- * Use the device id to determine the ASIC family
- */
- // TODO: translate from hsa_agent to internal AMD device id.
- if (HSA_STATUS_SUCCESS !=
- hsa_agent_get_info(
- device, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CHIP_ID,
- &pci_id)) {
- return HSA_INVALID_DEVICE_ID;
- }
-
- switch (pci_id) {
- case DEVICE_ID_SPECTRE_MOBILE:
- case DEVICE_ID_SPECTRE_DESKTOP:
- case DEVICE_ID_SPECTRE_LITE_MOBILE_1309:
- case DEVICE_ID_SPECTRE_LITE_MOBILE_130A:
- case DEVICE_ID_SPECTRE_SL_MOBILE_130B:
- case DEVICE_ID_SPECTRE_MOBILE_130C:
- case DEVICE_ID_SPECTRE_LITE_MOBILE_130D:
- case DEVICE_ID_SPECTRE_SL_MOBILE_130E:
- case DEVICE_ID_SPECTRE_DESKTOP_130F:
- case DEVICE_ID_SPECTRE_WORKSTATION_1310:
- case DEVICE_ID_SPECTRE_WORKSTATION_1311:
- case DEVICE_ID_SPECTRE_LITE_DESKTOP_1313:
- case DEVICE_ID_SPECTRE_SL_DESKTOP_1315:
- case DEVICE_ID_SPECTRE_SL_MOBILE_1318:
- case DEVICE_ID_SPECTRE_SL_EMBEDDED_131B:
- case DEVICE_ID_SPECTRE_EMBEDDED_131C:
- case DEVICE_ID_SPECTRE_LITE_EMBEDDED_131D:
- return HSA_SPECTRE_ID;
- case DEVICE_ID_SPOOKY_MOBILE:
- case DEVICE_ID_SPOOKY_DESKTOP:
- case DEVICE_ID_SPOOKY_DESKTOP_1312:
- case DEVICE_ID_SPOOKY_DESKTOP_1316:
- case DEVICE_ID_SPOOKY_MOBILE_1317:
- return HSA_SPOOKY_ID;
- case DEVICE_ID_VI_TONGA_P_6920:
- case DEVICE_ID_VI_TONGA_P_6921:
- case DEVICE_ID_VI_TONGA_P_6928:
- case DEVICE_ID_VI_TONGA_P_692B:
- case DEVICE_ID_VI_TONGA_P_692F:
- case DEVICE_ID_VI_TONGA_P_6938:
- case DEVICE_ID_VI_TONGA_P_6939:
- return HSA_TONGA_ID;
- case DEVICE_ID_VI_FIJI_P_7300:
- return HSA_FIJI_ID;
- case DEVICE_ID_CZ_9870:
- case DEVICE_ID_CZ_9874:
- case DEVICE_ID_CZ_9875:
- case DEVICE_ID_CZ_9876:
- case DEVICE_ID_CZ_9877:
- return HSA_CARRIZO_ID;
- case DEVICE_ID_VI_ICELAND_M_6900:
- case DEVICE_ID_VI_ICELAND_M_6901:
- case DEVICE_ID_VI_ICELAND_M_6902:
- case DEVICE_ID_VI_ICELAND_M_6903:
- case DEVICE_ID_VI_ICELAND_M_6907:
- return HSA_ICELAND_ID;
- case DEVICE_ID_CI_HAWAII_P_67A0:
- case DEVICE_ID_CI_HAWAII_P_67A1:
- case DEVICE_ID_CI_HAWAII_P_67A2:
- case DEVICE_ID_CI_HAWAII_P_67A8:
- case DEVICE_ID_CI_HAWAII_P_67A9:
- case DEVICE_ID_CI_HAWAII_P_67AA:
- case DEVICE_ID_CI_HAWAII_P_67B0:
- case DEVICE_ID_CI_HAWAII_P_67B1:
- case DEVICE_ID_CI_HAWAII_P_67B8:
- case DEVICE_ID_CI_HAWAII_P_67B9:
- case DEVICE_ID_CI_HAWAII_P_67BE:
- return HSA_HAWAII_ID;
- case DEVICE_ID_VI_ELLESMERE_P_67C0:
- case DEVICE_ID_VI_ELLESMERE_P_67C1:
- case DEVICE_ID_VI_ELLESMERE_P_67C2:
- case DEVICE_ID_VI_ELLESMERE_P_67C4:
- case DEVICE_ID_VI_ELLESMERE_P_67C7:
- case DEVICE_ID_VI_ELLESMERE_P_67DF:
- case DEVICE_ID_VI_ELLESMERE_P_67D0:
- case DEVICE_ID_VI_ELLESMERE_P_67C8:
- case DEVICE_ID_VI_ELLESMERE_P_67C9:
- case DEVICE_ID_VI_ELLESMERE_P_67CA:
- case DEVICE_ID_VI_ELLESMERE_P_67CC:
- case DEVICE_ID_VI_ELLESMERE_P_67CF:
- return HSA_ELLESMERE_ID;
- case DEVICE_ID_VI_BAFFIN_M_67E0:
- case DEVICE_ID_VI_BAFFIN_M_67E3:
- case DEVICE_ID_VI_BAFFIN_M_67E8:
- case DEVICE_ID_VI_BAFFIN_M_67EB:
- case DEVICE_ID_VI_BAFFIN_M_67EF:
- case DEVICE_ID_VI_BAFFIN_M_67FF:
- case DEVICE_ID_VI_BAFFIN_M_67E1:
- case DEVICE_ID_VI_BAFFIN_M_67E7:
- case DEVICE_ID_VI_BAFFIN_M_67E9:
- return HSA_BAFFIN_ID;
- default:
- return HSA_INVALID_DEVICE_ID;
- }
-}
-
-bool NullDevice::create(const AMDDeviceInfo& deviceInfo) {
- online_ = false;
- deviceInfo_ = deviceInfo;
- // Mark the device as GPU type
- info_.type_ = CL_DEVICE_TYPE_GPU | CL_HSA_ENABLED_AMD;
- info_.vendorId_ = 0x1002;
-
- settings_ = new Settings();
- roc::Settings* hsaSettings = static_cast(settings_);
- if ((hsaSettings == NULL) ||
- // @Todo sramalin Use double precision from constsant
- !hsaSettings->create((true) & 0x1)) {
- LogError("Error creating settings for NULL HSA device");
- return false;
- }
- // Report the device name
- ::strcpy(info_.name_, "AMD HSA Device");
- info_.extensions_ = getExtensionString();
- info_.maxWorkGroupSize_ = hsaSettings->maxWorkGroupSize_;
- ::strcpy(info_.vendor_, "Advanced Micro Devices, Inc.");
- info_.oclcVersion_ = "OpenCL C " OPENCL_VERSION_STR " ";
- strcpy(info_.driverVersion_, "1.0 Provisional (hsa)");
- info_.version_ = "OpenCL " OPENCL_VERSION_STR " ";
- return true;
-}
-
-Device::Device(hsa_agent_t bkendDevice)
- : mapCacheOps_(nullptr)
- , mapCache_(nullptr)
- , _bkendDevice(bkendDevice)
- , gpuvm_segment_max_alloc_(0)
- , alloc_granularity_(0)
- , context_(nullptr)
- , xferQueue_(nullptr)
-{
- group_segment_.handle = 0;
- system_segment_.handle = 0;
- system_coarse_segment_.handle = 0;
- gpuvm_segment_.handle = 0;
-}
-
-Device::~Device()
-{
- // Release cached map targets
- for (uint i = 0; mapCache_ != NULL && i < mapCache_->size(); ++i) {
- if ((*mapCache_)[i] != NULL) {
- (*mapCache_)[i]->release();
- }
- }
- delete mapCache_;
- delete mapCacheOps_;
-
- // Destroy transfer queue
- if (xferQueue_ && xferQueue_->terminate()) {
- delete xferQueue_;
- xferQueue_ = NULL;
- }
-
- if (blitProgram_) {
- delete blitProgram_;
- blitProgram_ = NULL;
- }
-
- if (context_ != NULL) {
- context_->release();
- }
-
- if (info_.extensions_) {
- delete[]info_.extensions_;
- info_.extensions_ = NULL;
- }
-
- if (settings_) {
- delete settings_;
- settings_ = NULL;
- }
-}
-bool NullDevice::initCompiler(bool isOffline) {
- // Initializes g_complibModule and g_complibApi if they were not initialized
- if( g_complibModule == NULL ){
- if (!LoadCompLib(isOffline)) {
- if (!isOffline) {
- LogError("Error - could not find the compiler library");
- }
- return false;
- }
- }
- //Initialize the compiler handle if has already not been initialized
- //This is destroyed in Device::teardown
- acl_error error;
- if (!compilerHandle_) {
- compilerHandle_ = g_complibApi._aclCompilerInit(NULL, &error);
- if (error != ACL_SUCCESS) {
- LogError("Error initializing the compiler handle");
- return false;
- }
- }
- return true;
-}
-
-bool NullDevice::destroyCompiler() {
- if (compilerHandle_ != NULL) {
- acl_error error = g_complibApi._aclCompilerFini(compilerHandle_);
- if (error != ACL_SUCCESS) {
- LogError("Error closing the compiler");
- return false;
- }
- }
- if( g_complibModule != NULL ){
- UnloadCompLib();
- }
- return true;
-}
-
-void NullDevice::tearDown() {
- destroyCompiler();
-}
-bool NullDevice::init() {
- //Initialize the compiler
- if (!initCompiler(offlineDevice_)){
- return false;
- }
- //If there is an HSA enabled device online then skip any offline device
- std::vector devices;
- devices = getDevices(CL_DEVICE_TYPE_GPU | CL_HSA_ENABLED_AMD, false);
-
- //Load the offline devices
- //Iterate through the set of available offline devices
- for (uint id = 0; id < sizeof(DeviceInfo)/sizeof(AMDDeviceInfo); id++) {
- bool isOnline = false;
- //Check if the particular device is online
- for (unsigned int i=0; i< devices.size(); i++) {
- if (static_cast(devices[i])->deviceInfo_.hsaDeviceId_ ==
- DeviceInfo[id].hsaDeviceId_){
- isOnline = true;
- }
- }
- if (isOnline) {
- continue;
- }
- NullDevice* nullDevice = new NullDevice();
- if (!nullDevice->create(DeviceInfo[id])) {
- LogError("Error creating new instance of Device.");
- delete nullDevice;
- return false;
- }
- nullDevice->registerDevice();
- }
- return true;
-}
-NullDevice::~NullDevice() {
- if (info_.extensions_) {
- delete[]info_.extensions_;
- info_.extensions_ = NULL;
- }
-
- if (settings_) {
- delete settings_;
- settings_ = NULL;
- }
-}
-
-hsa_status_t Device::iterateAgentCallback(hsa_agent_t agent, void *data) {
- hsa_device_type_t dev_type = HSA_DEVICE_TYPE_CPU;
-
- hsa_status_t stat =
- hsa_agent_get_info(
- agent, HSA_AGENT_INFO_DEVICE, &dev_type);
-
- if (stat != HSA_STATUS_SUCCESS) {
- return stat;
- }
-
- if (dev_type == HSA_DEVICE_TYPE_CPU) {
- Device::cpu_agent_ = agent;
- return HSA_STATUS_SUCCESS;
- }
-
- gpu_agents_.push_back(agent);
-
- assert(dev_type == HSA_DEVICE_TYPE_GPU);
- Device *roc_device = new Device(agent);
- if (!roc_device) {
- LogError("Error creating new instance of Device on then heap.");
- return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
- }
-
- if (!roc_device->mapHSADeviceToOpenCLDevice(agent)) {
- LogError("Failed mapping of HsaDevice to Device.");
- return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
- }
-
- if (!roc_device->create()) {
- LogError("Error creating new instance of Device.");
- return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
- }
-
- uint32_t pci_id;
- HsaDeviceId deviceId = getHsaDeviceId(agent, pci_id);
- if (deviceId == HSA_INVALID_DEVICE_ID) {
- LogError(" Invalid HSA device");
- return HSA_STATUS_ERROR_INVALID_AGENT;
- }
- //Find device id in the table
- unsigned sizeOfTable = sizeof(DeviceInfo) / sizeof(AMDDeviceInfo);
- uint id;
- for (id = 0; id < sizeOfTable; id++) {
- if (DeviceInfo[id].hsaDeviceId_ == deviceId){
- break;
- }
- }
- //If the AmdDeviceInfo for the HsaDevice Id could not be found return false
- if (id == sizeOfTable) {
- return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
- }
- roc_device->deviceInfo_ = DeviceInfo[id];
- roc_device->deviceInfo_.pciDeviceId = pci_id;
-
- roc_device->registerDevice(); // no return code for this function
-
- return HSA_STATUS_SUCCESS;
-}
-
-bool Device::init() {
- LogInfo("Initializing HSA stack.");
-
- //Initialize the compiler
- if (!initCompiler(offlineDevice_)){
- return false;
- }
-
- if (HSA_STATUS_SUCCESS != hsa_init()) {
- LogError("hsa_init failed.");
- return false;
- }
-
- if (HSA_STATUS_SUCCESS !=
- hsa_iterate_agents(iterateAgentCallback, NULL)) {
- return false;
- }
-
- return true;
-}
-
-void
-Device::tearDown()
-{
- NullDevice::tearDown();
- hsa_shut_down();
-}
-
-bool
-Device::create()
-{
- if (!amd::Device::create()) {
- return false;
- }
-
- amd::Context::Info info = {0};
- std::vector devices;
- devices.push_back(this);
-
- // Create a dummy context
- context_ = new amd::Context(devices, info);
- if (context_ == NULL) {
- return false;
- }
-
- blitProgram_ = new BlitProgram(context_);
- // Create blit programs
- if (blitProgram_ == NULL || !blitProgram_->create(this)) {
- delete blitProgram_;
- blitProgram_ = NULL;
- LogError("Couldn't create blit kernels!");
- return false;
- }
-
- mapCacheOps_ = new amd::Monitor("Map Cache Lock", true);
- if (NULL == mapCacheOps_) {
- return false;
- }
-
- mapCache_ = new std::vector();
- if (mapCache_ == NULL) {
- return false;
- }
- // Use just 1 entry by default for the map cache
- mapCache_->push_back(NULL);
-
- xferQueue();
-
- return true;
-}
-
-device::Program*
-NullDevice::createProgram(amd::option::Options* options) {
- return new roc::HSAILProgram(*this);
-}
-
-device::Program*
-Device::createProgram(amd::option::Options* options) {
- return new roc::HSAILProgram(*this);
-}
-
-bool
-Device::mapHSADeviceToOpenCLDevice(hsa_agent_t dev)
-{
- // Create HSA settings
- settings_ = new Settings();
- roc::Settings* hsaSettings = static_cast(settings_);
- if ((hsaSettings == NULL) ||
- !hsaSettings->create((true) & 0x1)) {
- return false;
- }
-
- if (populateOCLDeviceConstants() == false) {
- return false;
- }
-
- // Setup System Memory to be Non-Coherent per user
- // request via environment variable. By default the
- // System Memory is setup to be Coherent
- if (hsaSettings->enableNCMode_) {
- hsa_status_t err =
- hsa_amd_coherency_set_type(dev, HSA_AMD_COHERENCY_TYPE_NONCOHERENT);
- if (err != HSA_STATUS_SUCCESS) {
- LogError("Unable to set NC memory policy!");
- return false;
- }
- }
-
- return true;
-}
-
-hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool,
- void* data) {
- if (data == NULL) {
- return HSA_STATUS_ERROR_INVALID_ARGUMENT;
- }
-
- hsa_region_segment_t segment_type = (hsa_region_segment_t)0;
- hsa_status_t stat =
- hsa_amd_memory_pool_get_info(
- pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment_type);
- if (stat != HSA_STATUS_SUCCESS) {
- return stat;
- }
-
- // TODO: system and device local segment
- Device *dev = reinterpret_cast(data);
- switch (segment_type) {
- case HSA_REGION_SEGMENT_GLOBAL: {
- if (dev->settings().enableLocalMemory_) {
- dev->gpuvm_segment_ = pool;
- }
- break;
- }
- case HSA_REGION_SEGMENT_GROUP:
- dev->group_segment_ = pool;
- break;
- default:
- break;
- }
-
- return HSA_STATUS_SUCCESS;
-}
-
-hsa_status_t Device::iterateCpuMemoryPoolCallback(hsa_amd_memory_pool_t pool,
- void* data) {
- if (data == NULL) {
- return HSA_STATUS_ERROR_INVALID_ARGUMENT;
- }
-
- hsa_region_segment_t segment_type = (hsa_region_segment_t)0;
- hsa_status_t stat = hsa_amd_memory_pool_get_info(
- pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment_type);
- if (stat != HSA_STATUS_SUCCESS) {
- return stat;
- }
-
- Device* dev = reinterpret_cast(data);
- switch (segment_type) {
- case HSA_REGION_SEGMENT_GLOBAL: {
- uint32_t global_flag = 0;
- hsa_status_t stat = hsa_amd_memory_pool_get_info(
- pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
- if (stat != HSA_STATUS_SUCCESS) {
- return stat;
- }
-
- if ((global_flag & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) != 0) {
- dev->system_segment_ = pool;
- } else {
- dev->system_coarse_segment_ = pool;
- }
- break;
- }
- default:
- break;
- }
-
- return HSA_STATUS_SUCCESS;
-}
-
-bool
-Device::populateOCLDeviceConstants()
-{
- info_.available_ = true;
-
- roc::Settings* hsa_settings = static_cast(settings_);
-
- strcpy(info_.name_, "AMD HSA Device");
-
- char device_name[64] = { 0 };
- if (HSA_STATUS_SUCCESS !=
- hsa_agent_get_info(
- _bkendDevice, HSA_AGENT_INFO_NAME, device_name)) {
- return false;
- }
-
- strcpy(info_.boardName_, device_name);
-
- if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice,
- HSA_AGENT_INFO_PROFILE,
- &agent_profile_)) {
- return false;
- }
-
- if (HSA_STATUS_SUCCESS !=
- hsa_agent_get_info(
- _bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
- &info_.maxComputeUnits_)) {
- return false;
- }
- assert(info_.maxComputeUnits_ > 0);
-
- if (HSA_STATUS_SUCCESS !=
- hsa_agent_get_info(
- _bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
- &info_.globalMemCacheLineSize_)) {
- return false;
- }
- assert(info_.globalMemCacheLineSize_ > 0);
-
- uint32_t cachesize[4] = { 0 };
- if (HSA_STATUS_SUCCESS !=
- hsa_agent_get_info(
- _bkendDevice, HSA_AGENT_INFO_CACHE_SIZE, cachesize)) {
- return false;
- }
- assert(cachesize[0] > 0);
- info_.globalMemCacheSize_ = cachesize[0];
-
- info_.globalMemCacheType_ = CL_READ_WRITE_CACHE;
-
- info_.type_ = CL_DEVICE_TYPE_GPU | CL_HSA_ENABLED_AMD;
-
- uint32_t hsa_bdf_id = 0;
- if (HSA_STATUS_SUCCESS !=
- hsa_agent_get_info(
- _bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &hsa_bdf_id)) {
- return false;
- }
-
- info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD;
- info_.deviceTopology_.pcie.bus = (hsa_bdf_id & (0xFF << 8)) >> 8;
- info_.deviceTopology_.pcie.device = (hsa_bdf_id & (0x1F << 3)) >> 3;
- info_.deviceTopology_.pcie.function = (hsa_bdf_id & 0x07);
- info_.extensions_ = getExtensionString();
- info_.nativeVectorWidthDouble_ =
- info_.preferredVectorWidthDouble_ = (settings().doublePrecision_) ? 1 : 0;
-
- if (HSA_STATUS_SUCCESS !=
- hsa_agent_get_info(
- _bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
- &info_.maxClockFrequency_)) {
- return false;
- }
- assert(info_.maxClockFrequency_ > 0);
-
- if (HSA_STATUS_SUCCESS !=
- hsa_amd_agent_iterate_memory_pools(
- cpu_agent_, Device::iterateCpuMemoryPoolCallback, this)) {
- return false;
- }
-
- assert(system_segment_.handle != 0);
-
- if (HSA_STATUS_SUCCESS !=
- hsa_amd_agent_iterate_memory_pools(
- _bkendDevice, Device::iterateGpuMemoryPoolCallback, this)) {
- return false;
- }
-
- assert(group_segment_.handle != 0);
-
- size_t group_segment_size = 0;
- if (HSA_STATUS_SUCCESS !=
- hsa_amd_memory_pool_get_info(
- group_segment_, HSA_AMD_MEMORY_POOL_INFO_SIZE, &group_segment_size)) {
- return false;
- }
- assert(group_segment_size > 0);
-
- info_.localMemSizePerCU_ = group_segment_size;
- info_.localMemSize_ = group_segment_size;
-
- info_.maxWorkItemDimensions_ = 3;
-
- if (settings().enableLocalMemory_ && gpuvm_segment_.handle != 0) {
- size_t global_segment_size = 0;
- if (HSA_STATUS_SUCCESS !=
- hsa_amd_memory_pool_get_info(gpuvm_segment_,
- HSA_AMD_MEMORY_POOL_INFO_SIZE,
- &global_segment_size)) {
- return false;
- }
-
- assert(global_segment_size > 0);
- info_.globalMemSize_ = static_cast(global_segment_size);
-
- gpuvm_segment_max_alloc_ =
- cl_ulong(info_.globalMemSize_ *
- std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u);
- assert(gpuvm_segment_max_alloc_ > 0);
-
- info_.maxMemAllocSize_ =
- static_cast(gpuvm_segment_max_alloc_);
-
- if (HSA_STATUS_SUCCESS !=
- hsa_amd_memory_pool_get_info(gpuvm_segment_,
- HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
- &alloc_granularity_)) {
- return false;
- }
-
- assert(alloc_granularity_ > 0);
- }
- else {
- static const cl_ulong kDefaultGlobalMemSize = cl_ulong(1 * Gi);
- info_.globalMemSize_ = kDefaultGlobalMemSize;
- info_.maxMemAllocSize_ = info_.globalMemSize_ / 4;
-
- if (HSA_STATUS_SUCCESS !=
- hsa_amd_memory_pool_get_info(system_segment_,
- HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
- &alloc_granularity_)) {
- return false;
- }
- }
-
- // Make sure the max allocation size is not larger than the available
- // memory size.
- info_.maxMemAllocSize_ =
- std::min(info_.maxMemAllocSize_, info_.globalMemSize_);
-
- /*make sure we don't run anything over 8 params for now*/
- info_.maxParameterSize_ = 1024; // [TODO]: CAL stack values: 1024*
- // constant
-
- uint32_t max_work_group_size = settings().maxWorkGroupSize_;
- /*
- if (HSA_STATUS_SUCCESS !=
- hsa_agent_get_info(
- _bkendDevice, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &max_work_group_size)) {
- return false;
- }
- */
- assert(max_work_group_size > 0);
- info_.maxWorkGroupSize_ = max_work_group_size;
-
- uint16_t max_workgroup_size[3] = { 0, 0, 0 };
- if (HSA_STATUS_SUCCESS !=
- hsa_agent_get_info(
- _bkendDevice, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, &max_workgroup_size)) {
- return false;
- }
-
- assert(max_workgroup_size[0] != 0 && max_workgroup_size[1] != 0 &&
- max_workgroup_size[2] != 0);
-
- info_.maxWorkItemSizes_[0] = max_workgroup_size[0];
- info_.maxWorkItemSizes_[1] = max_workgroup_size[1];
- info_.maxWorkItemSizes_[2] = max_workgroup_size[2];
-
- info_.nativeVectorWidthChar_ = info_.preferredVectorWidthChar_ = 4;
- info_.nativeVectorWidthShort_ = info_.preferredVectorWidthShort_ = 2;
- info_.nativeVectorWidthInt_ = info_.preferredVectorWidthInt_ = 1;
- info_.nativeVectorWidthLong_ = info_.preferredVectorWidthLong_ = 1;
- info_.nativeVectorWidthFloat_ = info_.preferredVectorWidthFloat_ = 1;
-
- info_.hostUnifiedMemory_ = CL_TRUE;
- info_.memBaseAddrAlign_ = 8 * (flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ?
- sizeof(cl_long16) : MEMOBJ_BASE_ADDR_ALIGN);
- info_.minDataTypeAlignSize_ = sizeof(cl_long16);
-
- info_.maxConstantArgs_ = 8;
- info_.maxConstantBufferSize_ = 64 * 1024;
- info_.localMemType_ = CL_LOCAL;
- info_.errorCorrectionSupport_ = false;
- info_.profilingTimerResolution_ = 1;
- info_.littleEndian_ = true;
- info_.compilerAvailable_ = true;
- info_.executionCapabilities_ = CL_EXEC_KERNEL;
- info_.queueProperties_ = CL_QUEUE_PROFILING_ENABLE;
- info_.platform_ = AMD_PLATFORM;
- info_.profile_ = "FULL_PROFILE";
- strcpy(info_.vendor_, "Advanced Micro Devices, Inc.");
-
- info_.addressBits_ = LP64_SWITCH(32, 64);
- info_.maxSamplers_ = 16;
- info_.bufferFromImageSupport_ = CL_FALSE;
- info_.oclcVersion_ = "OpenCL C " OPENCL_VERSION_STR " ";
- strcpy(info_.driverVersion_, "1.0 Provisional (hsa)");
- info_.version_ = "OpenCL " OPENCL_VERSION_STR " ";
-
- info_.builtInKernels_ = "";
- info_.linkerAvailable_ = true;
- info_.preferredInteropUserSync_ = true;
- info_.printfBufferSize_ = PrintfDbg::WorkitemDebugSize * info().maxWorkGroupSize_;
- info_.vendorId_ = 0x1002; // AMD's PCIe vendor id
-
- info_.maxGlobalVariableSize_ = static_cast(info_.maxMemAllocSize_);
- info_.globalVariablePreferredTotalSize_ =
- static_cast(info_.globalMemSize_);
-
- // Populate the single config setting.
- info_.singleFPConfig_ = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO |
- CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_FMA;
-
- if (hsa_settings->doublePrecision_) {
- info_.doubleFPConfig_ = info_.singleFPConfig_ | CL_FP_DENORM;
- info_.singleFPConfig_ |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;
- }
- info_.preferredPlatformAtomicAlignment_ = 0;
- info_.preferredGlobalAtomicAlignment_ = 0;
- info_.preferredLocalAtomicAlignment_ = 0;
-
- uint8_t hsa_extensions[128];
- if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice,
- HSA_AGENT_INFO_EXTENSIONS,
- hsa_extensions)) {
- return false;
- }
-
- assert(HSA_EXTENSION_IMAGES < 8);
- const bool image_is_supported =
- ((hsa_extensions[0] & (1 << HSA_EXTENSION_IMAGES)) != 0);
- if (image_is_supported) {
- // Images
- if (HSA_STATUS_SUCCESS !=
- hsa_agent_get_info(_bkendDevice,
- static_cast(
- HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS),
- &info_.maxSamplers_)) {
- return false;
- }
-
- if (HSA_STATUS_SUCCESS !=
- hsa_agent_get_info(_bkendDevice,
- static_cast(
- HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES),
- &info_.maxReadImageArgs_)) {
- return false;
- }
-
- // TODO: no attribute for write image.
- info_.maxWriteImageArgs_ = 8;
-
- if (HSA_STATUS_SUCCESS !=
- hsa_agent_get_info(_bkendDevice,
- static_cast(
- HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES),
- &info_.maxReadWriteImageArgs_)) {
- return false;
- }
-
- uint32_t image_max_dim[3];
- if (HSA_STATUS_SUCCESS !=
- hsa_agent_get_info(_bkendDevice,
- static_cast(
- HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS),
- &image_max_dim)) {
- return false;
- }
-
- info_.image2DMaxWidth_ = image_max_dim[0];
- info_.image2DMaxHeight_ = image_max_dim[1];
-
- if (HSA_STATUS_SUCCESS !=
- hsa_agent_get_info(_bkendDevice,
- static_cast(
- HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS),
- &image_max_dim)) {
- return false;
- }
-
- info_.image3DMaxWidth_ = image_max_dim[0];
- info_.image3DMaxHeight_ = image_max_dim[1];
- info_.image3DMaxDepth_ = image_max_dim[2];
-
- uint32_t max_array_size = 0;
- if (HSA_STATUS_SUCCESS !=
- hsa_agent_get_info(_bkendDevice,
- static_cast(
- HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS),
- &max_array_size)) {
- return false;
- }
-
- info_.imageMaxArraySize_ = max_array_size;
-
- if (HSA_STATUS_SUCCESS !=
- hsa_agent_get_info(_bkendDevice,
- static_cast(
- HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS),
- &image_max_dim)) {
- return false;
- }
- info_.imageMaxBufferSize_ = image_max_dim[0];
-
- info_.imagePitchAlignment_ = 256;
-
- info_.imageBaseAddressAlignment_ = 256;
-
- info_.bufferFromImageSupport_ = CL_FALSE;
-
- info_.imageSupport_ =
- (info_.maxReadWriteImageArgs_ > 0) ? CL_TRUE : CL_FALSE;
- }
-
- // Enable SVM Capabilities of Hsa device. Ensure
- // user has not setup memory to be non-coherent
- info_.svmCapabilities_ = 0;
- if (hsa_settings->enableNCMode_ == false) {
- info_.svmCapabilities_ = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER;
- info_.svmCapabilities_ |= CL_DEVICE_SVM_FINE_GRAIN_BUFFER;
- // Report fine-grain system only on full profile
- if (agent_profile_ == HSA_PROFILE_FULL) {
- info_.svmCapabilities_ |= CL_DEVICE_SVM_FINE_GRAIN_SYSTEM;
- }
- info_.svmCapabilities_ |= CL_DEVICE_SVM_ATOMICS;
- }
-
- return true;
-}
-
-device::VirtualDevice*
-Device::createVirtualDevice(amd::CommandQueue* queue)
-{
- bool profiling = (queue != NULL) &&
- queue->properties().test(CL_QUEUE_PROFILING_ENABLE);
-
- // Initialization of heap and other resources occur during the command
- // queue creation time.
- VirtualGPU *virtualDevice = new VirtualGPU(*this);
-
- if (!virtualDevice->create(profiling)) {
- delete virtualDevice;
- return NULL;
- }
-
- if(profiling) {
- hsa_amd_profiling_set_profiler_enabled(virtualDevice->gpu_queue(), 1);
- }
-
- return virtualDevice;
-}
-
-bool
-Device::globalFreeMemory(size_t *freeMemory) const
-{
- return false;
-}
-
-bool
-Device::bindExternalDevice(
- uint flags,
- void* const gfxDevice[],
- void* gfxContext,
- bool validateOnly)
-{
-#if defined(_WIN32)
- return false;
-#else
- if((flags&amd::Context::GLDeviceKhr)==0)
- return false;
-
- MesaInterop::MESA_INTEROP_KIND kind=MesaInterop::MESA_INTEROP_NONE;
- MesaInterop::DisplayHandle display;
- MesaInterop::ContextHandle context;
-
- if((flags&amd::Context::EGLDeviceKhr)!=0)
- {
- kind=MesaInterop::MESA_INTEROP_EGL;
- display.eglDisplay=reinterpret_cast(gfxDevice[amd::Context::GLDeviceKhrIdx]);
- context.eglContext=reinterpret_cast(gfxContext);
- }
- else
- {
- kind=MesaInterop::MESA_INTEROP_GLX;
- display.glxDisplay=reinterpret_cast(gfxDevice[amd::Context::GLDeviceKhrIdx]);
- context.glxContext=reinterpret_cast(gfxContext);
- }
-
- mesa_glinterop_device_info info;
- info.size=sizeof(mesa_glinterop_device_info);
- MesaInterop temp;
- if(!temp.Bind(kind, display, context))
- {
- assert(false && "Failed mesa interop bind.");
- return false;
- }
-
- if(!temp.GetInfo(info))
- {
- assert(false && "Failed to get mesa interop device info.");
- return false;
- }
-
- bool match=true;
- match &= info_.deviceTopology_.pcie.bus==info.pci_bus;
- match &= info_.deviceTopology_.pcie.device==info.pci_device;
- match &= info_.deviceTopology_.pcie.function==info.pci_function;
- match &= info_.vendorId_==info.vendor_id;
- match &= deviceInfo_.pciDeviceId==info.device_id;
-
- if(!validateOnly)
- mesa_=temp;
-
- return match;
-#endif
-}
-
-bool
-Device::unbindExternalDevice(
- uint flags,
- void* const gfxDevice[],
- void* gfxContext,
- bool validateOnly)
-{
-#if defined(_WIN32)
- return false;
-#else
- if ((flags&amd::Context::GLDeviceKhr)==0)
- return false;
- if(!validateOnly)
- mesa_.Unbind();
- return true;
-#endif
-}
-
-amd::Memory*
-Device::findMapTarget(size_t size) const
-{
- // Must be serialised for access
- amd::ScopedLock lk(*mapCacheOps_);
-
- amd::Memory* map = NULL;
- size_t minSize = 0;
- size_t maxSize = 0;
- uint mapId = mapCache_->size();
- uint releaseId = mapCache_->size();
-
- // Find if the list has a map target of appropriate size
- for (uint i = 0; i < mapCache_->size(); i++) {
- if ((*mapCache_)[i] != NULL) {
- // Requested size is smaller than the entry size
- if (size < (*mapCache_)[i]->getSize()) {
- if ((minSize == 0) ||
- (minSize > (*mapCache_)[i]->getSize())) {
- minSize = (*mapCache_)[i]->getSize();
- mapId = i;
- }
- }
- // Requeted size matches the entry size
- else if (size == (*mapCache_)[i]->getSize()) {
- mapId = i;
- break;
- }
- else {
- // Find the biggest map target in the list
- if (maxSize < (*mapCache_)[i]->getSize()) {
- maxSize = (*mapCache_)[i]->getSize();
- releaseId = i;
- }
- }
- }
- }
-
- // Check if we found any map target
- if (mapId < mapCache_->size()) {
- map = (*mapCache_)[mapId];
- (*mapCache_)[mapId] = NULL;
- }
- // If cache is full, then release the biggest map target
- else if (releaseId < mapCache_->size()) {
- (*mapCache_)[releaseId]->release();
- (*mapCache_)[releaseId] = NULL;
- }
-
- return map;
-}
-
-bool
-Device::addMapTarget(amd::Memory* memory) const
-{
- // Must be serialised for access
- amd::ScopedLock lk(*mapCacheOps_);
-
- //the svm memory shouldn't be cached
- if (!memory->canBeCached()) {
- return false;
- }
- // Find if the list has a map target of appropriate size
- for (uint i = 0; i < mapCache_->size(); ++i) {
- if ((*mapCache_)[i] == NULL) {
- (*mapCache_)[i] = memory;
- return true;
- }
- }
-
- // Add a new entry
- mapCache_->push_back(memory);
-
- return true;
-}
-
-device::Memory*
-Device::createMemory(amd::Memory &owner) const
-{
- roc::Memory* memory = NULL;
- if (owner.asBuffer()) {
- memory = new roc::Buffer(*this, owner);
- }
- else if (owner.asImage()) {
- memory = new roc::Image(*this, owner);
- }
- else {
- LogError("Unknown memory type");
- }
-
- if (memory == NULL) {
- return NULL;
- }
-
- bool result = memory->create();
-
- if (!result) {
- LogError("Failed creating memory");
- delete memory;
- return NULL;
- }
-
- if (!memory->isHostMemDirectAccess() && owner.asImage() &&
- owner.parent() == NULL &&
- (owner.getMemFlags() & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR))) {
- // To avoid recurssive call to Device::createMemory, we perform
- // data transfer to the view of the image.
- amd::Image* imageView = owner.asImage()->createView(
- owner.getContext(), owner.asImage()->getImageFormat(), xferQueue());
-
- if (imageView == NULL) {
- LogError("[OCL] Fail to allocate view of image object");
- return NULL;
- }
-
- Image* devImageView =
- new roc::Image(static_cast(*this), *imageView);
- if (devImageView == NULL) {
- LogError("[OCL] Fail to allocate device mem object for the view");
- imageView->release();
- return NULL;
- }
-
- if (devImageView != NULL &&
- !devImageView->createView(static_cast(*memory))) {
- LogError("[OCL] Fail to create device mem object for the view");
- delete devImageView;
- imageView->release();
- return NULL;
- }
-
- imageView->replaceDeviceMemory(this, devImageView);
-
- result = xferMgr().writeImage(owner.getHostMem(), *devImageView,
- amd::Coord3D(0), imageView->getRegion(),
- imageView->getRowPitch(),
- imageView->getSlicePitch(), true);
-
- imageView->release();
- }
-
- if (!result) {
- delete memory;
- return NULL;
- }
-
- return memory;
-}
-
-void*
-Device::hostAlloc(size_t size, size_t alignment, bool atomics) const {
- void* ptr = NULL;
- const hsa_amd_memory_pool_t segment =
- (!atomics)
- ? (system_coarse_segment_.handle != 0) ? system_coarse_segment_
- : system_segment_
- : system_segment_;
- assert(segment.handle != 0);
- hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
- if (stat != HSA_STATUS_SUCCESS) {
- LogError("Fail allocation host memory");
- return NULL;
- }
-
- stat = hsa_amd_agents_allow_access(gpu_agents_.size(), &gpu_agents_[0],
- NULL, ptr);
- if (stat != HSA_STATUS_SUCCESS) {
- LogError("Fail hsa_amd_agents_allow_access");
- return NULL;
- }
-
- return ptr;
-}
-
-void
-Device::hostFree(void* ptr, size_t size) const
-{
- hsa_status_t stat =
- hsa_amd_memory_pool_free(ptr);
- if (stat != HSA_STATUS_SUCCESS) {
- LogError("Fail freeing host memory");
- assert(stat == HSA_STATUS_SUCCESS);
- }
-}
-
-void *
-Device::deviceLocalAlloc(size_t size) const
-{
- if (gpuvm_segment_.handle == 0 || gpuvm_segment_max_alloc_ == 0) {
- return NULL;
- }
-
- void *ptr = NULL;
- hsa_status_t stat =
- hsa_amd_memory_pool_allocate(gpuvm_segment_, size, 0, &ptr);
- if (stat != HSA_STATUS_SUCCESS) {
- LogError("Fail allocation local memory");
- return NULL;
- }
-
- stat = hsa_memory_assign_agent(ptr, _bkendDevice, HSA_ACCESS_PERMISSION_RW);
- if (stat != HSA_STATUS_SUCCESS) {
- LogError("Fail assigning local memory to agent");
- deviceLocalFree(ptr, size);
- return NULL;
- }
-
- return ptr;
-}
-
-void
-Device::deviceLocalFree(void *ptr, size_t size) const
-{
- hsa_status_t stat =
- hsa_amd_memory_pool_free(ptr);
- if (stat != HSA_STATUS_SUCCESS) {
- LogError("Fail freeing local memory");
- }
-}
-
-void*
-Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const
-{
- amd::Memory* mem = NULL;
- if (NULL == svmPtr) {
- bool atomics = (flags & CL_MEM_SVM_ATOMICS) != 0;
- void* ptr = hostAlloc(size, alignment, atomics);
-
- if (ptr != NULL) {
- // Copy paste from ORCA code.
- // create a hidden buffer, which will allocated on the device later
- mem = new (context)
- amd::Buffer(context, CL_MEM_USE_HOST_PTR, size, ptr);
- if (mem == NULL) {
- LogError("failed to create a svm mem object!");
- return NULL;
- }
-
- if (!mem->create(ptr)) {
- LogError("failed to create a svm hidden buffer!");
- mem->release();
- return NULL;
- }
-
- // add the information to context so that we can use it later.
- amd::SvmManager::AddSvmBuffer(ptr, mem);
-
- return ptr;
- }
- else {
- return NULL;
- }
- } else {
- // Copy paste from ORCA code.
- // Find the existing amd::mem object
- mem = amd::SvmManager::FindSvmBuffer(svmPtr);
-
- if (NULL == mem) {
- return NULL;
- }
-
- return svmPtr;
- }
-}
-
-void
-Device::svmFree(void* ptr) const
-{
- amd::Memory * svmMem = NULL;
- svmMem = amd::SvmManager::FindSvmBuffer(ptr);
- if (NULL != svmMem) {
- svmMem->release();
- amd::SvmManager::RemoveSvmBuffer(ptr);
- hostFree(ptr);
- }
-}
-
-VirtualGPU*
-Device::xferQueue() const
-{
- if (!xferQueue_) {
- // Create virtual device for internal memory transfer
- Device* thisDevice = const_cast(this);
- thisDevice->xferQueue_ = reinterpret_cast(
- thisDevice->createVirtualDevice());
- if (!xferQueue_) {
- LogError("Couldn't create the device transfer manager!");
- }
- }
- return xferQueue_;
-}
-
-}
-#endif // WITHOUT_HSA_BACKEND
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
deleted file mode 100644
index dcc2aa07ad..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
+++ /dev/null
@@ -1,376 +0,0 @@
-//
-// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved.
-//
-#pragma once
-
-#ifndef WITHOUT_HSA_BACKEND
-
-#include "top.hpp"
-#include "CL/cl.h"
-#include "device/device.hpp"
-#include "platform/command.hpp"
-#include "platform/program.hpp"
-#include "platform/perfctr.hpp"
-#include "platform/memory.hpp"
-#include "utils/concurrent.hpp"
-#include "thread/thread.hpp"
-#include "thread/monitor.hpp"
-#include "utils/versions.hpp"
-#include "aclTypes.h"
-
-#include "device/rocm/rocsettings.hpp"
-#include "device/rocm/rocvirtual.hpp"
-#include "device/rocm/rocdefs.hpp"
-#include "device/rocm/rocprintf.hpp"
-#include "device/rocm/rocglinterop.hpp"
-
-#include "hsa.h"
-#include "hsa_ext_image.h"
-#include "hsa_ext_finalize.h"
-#include "hsa_ext_amd.h"
-
-#include
-#include
-
-// extern hsa::Runtime* g_hsaruntime;
-
-/*! \addtogroup HSA
- * @{
- */
-
-//! HSA Device Implementation
-namespace roc {
-
-/**
- * @brief List of environment variables that could be used to
- * configure the behavior of Hsa Runtime
- *
- * ENVVAR_HSA_POLL_KERNEL_COMPLETION expands to the variable *name*
- * ("HSA_POLL_COMPLETION"), not to its value.
- */
-#define ENVVAR_HSA_POLL_KERNEL_COMPLETION "HSA_POLL_COMPLETION"
-
-//! Forward declarations of roc:: types.
-//! Of these, only Device is defined further down in this header.
-class Command;
-class Device;
-class GpuCommand;
-class Heap;
-class HeapBlock;
-class Program;
-class Kernel;
-class Memory;
-class Resource;
-class VirtualDevice;
-class PrintfDbg;
-
-//! A NULL Device type used only for offline compilation.
-//! Only functions that are used for compilation will be in this device.
-//!
-//! Apart from compiler access, program creation and deviceInfo(), every
-//! virtual override defined inline below calls ShouldNotReachHere() and then
-//! returns a neutral value. roc::Device derives from this class and
-//! re-declares most of them.
-class NullDevice : public amd::Device {
-public:
- //! Constructor; does nothing (see create())
- NullDevice(){};
-
- //! Create the device from the given device description
- bool create(const AMDDeviceInfo& deviceInfo);
-
- //! Initialise all the offline devices that can be used for compilation
- static bool init();
- //! Teardown for offline devices
- static void tearDown();
-
- //! Destructor for the Null device
- virtual ~NullDevice();
-
- //! Returns the compiler handle, a static shared by all instances
- aclCompiler *compiler() const { return compilerHandle_; }
-
- //! Construct an HSAIL program object from the ELF assuming it is valid
- virtual device::Program *createProgram(amd::option::Options* options = NULL);
- //! Returns the stored device description
- const AMDDeviceInfo& deviceInfo() const {
- return deviceInfo_;
- }
- //! Gets the backend device for the NULL device type.
- //! Not expected to be called; returns a zero-initialised agent handle.
- virtual hsa_agent_t getBackendDevice() const {
- ShouldNotReachHere();
- const hsa_agent_t kInvalidAgent = { 0 };
- return kInvalidAgent;
- }
-
- // List of dummy functions which are disabled for NullDevice
-
- //! Create sub-devices according to the given partition scheme.
- //! Disabled here: always returns CL_INVALID_VALUE.
- virtual cl_int createSubDevices(
- device::CreateSubDevicesInfo& create_info,
- cl_uint num_entries,
- cl_device_id* devices,
- cl_uint* num_devices) {
- ShouldNotReachHere();
- return CL_INVALID_VALUE; };
-
- //! Create a new virtual device environment.
- //! Disabled here: always returns NULL.
- virtual device::VirtualDevice* createVirtualDevice(
- amd::CommandQueue* queue = NULL) {
- ShouldNotReachHere();
- return NULL;
- }
-
- //! SVM registration is disabled here: always returns false
- virtual bool registerSvmMemory(void* ptr, size_t size) const {
- ShouldNotReachHere();
- return false;
- }
-
- //! SVM deregistration is disabled here: no-op
- virtual void deregisterSvmMemory(void* ptr) const {
- ShouldNotReachHere();
- }
-
- //! Just returns NULL for the dummy device
- virtual device::Memory* createMemory(amd::Memory& owner) const {
- ShouldNotReachHere();
- return NULL; }
-
- //! Sampler object allocation.
- //! NOTE(review): returns true without writing *sampler, unlike the other
- //! disabled functions which report failure -- confirm this is intended.
- virtual bool createSampler(
- const amd::Sampler& owner, //!< abstraction layer sampler object
- device::Sampler** sampler //!< device sampler object
- ) const
- {
- ShouldNotReachHere();
- return true;
- }
-
- //! Just returns NULL for the dummy device
- virtual device::Memory* createView(
- amd::Memory& owner, //!< Owner memory object
- const device::Memory& parent //!< Parent device memory object for the view
- ) const {
- ShouldNotReachHere();
- return NULL;
- }
-
- //! Just returns NULL for the dummy device
- virtual void* svmAlloc(
- amd::Context& context, //!< The context used to create a buffer
- size_t size, //!< size of svm spaces
- size_t alignment, //!< alignment requirement of svm spaces
- cl_svm_mem_flags flags, //!< flags of creation svm spaces
- void* svmPtr //!< existing svm pointer for mGPU case
- ) const {
- ShouldNotReachHere();
- return NULL;
- }
-
- //! Does nothing for the dummy device
- virtual void svmFree(
- void* ptr //!< svm pointer needed to be freed
- ) const {
- ShouldNotReachHere();
- return;
- }
-
- //! Reallocates the provided buffer object.
- //! Disabled here: always returns false.
- virtual bool reallocMemory(amd::Memory& owner) const {
- ShouldNotReachHere();
- return false;
- }
-
- //! Acquire external graphics API object in the host thread
- //! Needed for OpenGL objects on CPU device.
- //! Disabled here: always returns false.
-
- virtual bool bindExternalDevice(
- uint flags, void* const pDevice[], void* pContext, bool validateOnly) {
- ShouldNotReachHere();
- return false;
- }
-
- //! Counterpart of bindExternalDevice(); disabled here: always returns false
- virtual bool unbindExternalDevice(
- uint flags, void* const pDevice[], void* pContext, bool validateOnly) {
- ShouldNotReachHere();
- return false;
- }
-
- //! Releases non-blocking map target memory
- virtual void freeMapTarget(amd::Memory& mem, void* target) { ShouldNotReachHere();}
-
- //! Empty implementation on Null device: always returns false
- virtual bool globalFreeMemory(size_t* freeMemory) const {
- ShouldNotReachHere();
- return false;
- }
-
-protected:
- //! Initialize compiler instance and handle
- static bool initCompiler(bool isOffline);
- //! destroy compiler instance and handle
- static bool destroyCompiler();
- //! Handle to the compiler
- static aclCompiler* compilerHandle_;
- //! Device Id for an HsaDevice
- AMDDeviceInfo deviceInfo_;
-private:
- //! NOTE(review): value is defined outside this header; presumably marks
- //! this class as an offline device -- confirm against the .cpp
- static const bool offlineDevice_;
-};
-
-//! A HSA device ordinal (physical HSA device).
-//!
-//! Extends NullDevice with a real backend agent (hsa_agent_t), memory pool
-//! handles, host/device/SVM allocation entry points, an on-demand transfer
-//! queue and a cache of map targets.
-class Device : public NullDevice {
-public:
-    //! Initialise the whole HSA device subsystem (CAL init, device enumeration, etc).
-    static bool init();
-    static void tearDown();
-
-    //! Lookup all AMD HSA devices and memory regions.
-    static hsa_status_t iterateAgentCallback(hsa_agent_t agent, void *data);
-    static hsa_status_t iterateGpuMemoryPoolCallback(
-        hsa_amd_memory_pool_t region, void* data);
-    static hsa_status_t iterateCpuMemoryPoolCallback(
-        hsa_amd_memory_pool_t region, void* data);
-
-    static bool loadHsaModules();
-
-    bool create();
-
-    //! Construct a new physical HSA device
-    Device(hsa_agent_t bkendDevice);
-    //! Returns the HSA agent this device was constructed with
-    virtual hsa_agent_t getBackendDevice() const { return _bkendDevice; }
-
-    //! Returns the GPU agents collected by the static enumeration
-    static const std::vector<hsa_agent_t>& getGpuAgents() {
-        return gpu_agents_;
-    }
-
-    //! Returns the CPU agent collected by the static enumeration
-    static hsa_agent_t getCpuAgent()
-    {
-        return cpu_agent_;
-    }
-
-    //! Destructor for the physical HSA device
-    virtual ~Device();
-
-    bool mapHSADeviceToOpenCLDevice(hsa_agent_t hsadevice);
-
-    // Temporary, delete it later when HSA Runtime and KFD is fully functional.
-    void fake_device();
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // TODO: Below are all mocked up virtual functions from amd::Device, they may
-    // need real implementation.
-    ///////////////////////////////////////////////////////////////////////////////
-
-// #ifdef cl_ext_device_fission
-    //! Create sub-devices according to the given partition scheme.
-    //! Not supported: always returns CL_INVALID_VALUE.
-    virtual cl_int createSubDevices(
-        device::CreateSubDevicesInfo &create_inf,
-        cl_uint num_entries,
-        cl_device_id *devices,
-        cl_uint *num_devices)
-    { return CL_INVALID_VALUE; }
-// #endif // cl_ext_device_fission
-
-    // bool Device::create(CALuint ordinal);
-
-    //! Instantiate a new virtual device
-    virtual device::VirtualDevice *createVirtualDevice(
-        amd::CommandQueue* queue = NULL);
-
-    //! Construct an HSAIL program object from the ELF assuming it is valid
-    virtual device::Program *createProgram(amd::option::Options* options = NULL);
-
-    virtual device::Memory *createMemory(amd::Memory &owner) const;
-
-    //! Sampler object allocation.
-    //! Currently a stub: stores NULL in *sampler and reports success.
-    virtual bool createSampler(
-        const amd::Sampler& owner,  //!< abstraction layer sampler object
-        device::Sampler** sampler   //!< device sampler object
-        ) const
-    {
-        //! \todo HSA team has to implement sampler allocation
-        *sampler = NULL;
-        return true;
-    }
-
-    //! Views are not implemented: always returns NULL
-    virtual device::Memory *createView(
-        amd::Memory &owner,           //!< Owner memory object
-        const device::Memory &parent  //!< Parent device memory object for the view
-        ) const { return NULL; }
-
-    //! Reallocates the provided buffer object (stub: always reports success)
-    virtual bool reallocMemory(amd::Memory &owner) const { return true; }
-
-    //! Acquire external graphics API object in the host thread
-    //! Needed for OpenGL objects on CPU device
-    virtual bool bindExternalDevice(
-        uint flags, void * const pDevice[], void *pContext, bool validateOnly);
-
-    /**
-     * @brief Removes the external device as an available device.
-     *
-     * @note: The current implementation is to avoid build break
-     * and does not represent actual / correct implementation. This
-     * needs to be done.
-     */
-    bool unbindExternalDevice(
-        uint flags,                //!< Enum val. for ext.API type: GL, D3D10, etc.
-        void * const gfxDevice[],  //!< D3D device do D3D, HDC/Display handle of X Window for GL
-        void *gfxContext,          //!< HGLRC/GLXContext handle
-        bool validateOnly          //!< Only validate if the device can inter-operate with
-                                   //!< pDevice/pContext, do not bind.
-        );
-
-    //! Gets free memory on a GPU device
-    virtual bool globalFreeMemory(size_t *freeMemory) const;
-
-    virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;
-
-    virtual void hostFree(void* ptr, size_t size = 0) const;
-
-    void *deviceLocalAlloc(size_t size) const;
-
-    void deviceLocalFree(void *ptr, size_t size) const;
-
-    virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags = CL_MEM_READ_WRITE, void* svmPtr = NULL) const;
-
-    virtual void svmFree(void* ptr) const;
-
-    //! Returns the device settings viewed as roc::Settings
-    const Settings &settings() const { return reinterpret_cast<const Settings&>(*settings_); }
-
-    //! Returns transfer engine object (creates the transfer queue on demand)
-    const device::BlitManager& xferMgr() const { return xferQueue()->blitMgr(); }
-
-    const size_t alloc_granularity() const { return alloc_granularity_; }
-
-    const hsa_profile_t agent_profile() const { return agent_profile_; }
-
-    const MesaInterop& mesa() const { return mesa_; }
-
-    //! Finds an appropriate map target
-    amd::Memory* findMapTarget(size_t size) const;
-
-    //! Adds a map target to the cache
-    bool addMapTarget(amd::Memory* memory) const;
-
-private:
-    amd::Monitor* mapCacheOps_;            //!< Lock to serialise cache for the map resources
-    std::vector<amd::Memory*>* mapCache_;  //!< Map cache info structure
-
-    bool populateOCLDeviceConstants();
-    static bool isHsaInitialized_;
-    static hsa_agent_t cpu_agent_;
-    static std::vector<hsa_agent_t> gpu_agents_;
-    MesaInterop mesa_;
-    hsa_agent_t _bkendDevice;
-    hsa_profile_t agent_profile_;
-    hsa_amd_memory_pool_t group_segment_;
-    hsa_amd_memory_pool_t system_segment_;
-    hsa_amd_memory_pool_t system_coarse_segment_;
-    hsa_amd_memory_pool_t gpuvm_segment_;
-    size_t gpuvm_segment_max_alloc_;
-    size_t alloc_granularity_;
-    static const bool offlineDevice_;
-    amd::Context *context_;  //!< A dummy context for internal data transfer
-    VirtualGPU *xferQueue_;  //!< Transfer queue, created on demand
-
-    //! Returns the transfer queue, creating it on first use
-    VirtualGPU* xferQueue() const;
-}; // class roc::Device
-} // namespace roc
-
-/**
- * @}
- */
-#endif /*WITHOUT_HSA_BACKEND*/
-
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocglinterop.cpp b/projects/clr/rocclr/runtime/device/rocm/rocglinterop.cpp
deleted file mode 100644
index 0db28ab973..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rocglinterop.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-//
-// Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
-//
-
-#ifndef WITHOUT_HSA_BACKEND
-
-#include "os/os.hpp"
-#include "utils/debug.hpp"
-#include "utils/flags.hpp"
-#include "device/rocm/rocglinterop.hpp"
-
-#if !defined(_WIN32)
-#include
-#endif
-
-namespace roc
-{
-
-// Mesa interop entry points. They start out null and are filled in with
-// dlsym() results by MesaInterop::Bind(); only available off Windows.
-#if !defined(_WIN32)
-static PFNMESAGLINTEROPGLXQUERYDEVICEINFOPROC GlxInfo = nullptr;
-static PFNMESAGLINTEROPGLXEXPORTOBJECTPROC GlxExport = nullptr;
-static PFNMESAGLINTEROPEGLQUERYDEVICEINFOPROC EglInfo = nullptr;
-static PFNMESAGLINTEROPEGLEXPORTOBJECTPROC EglExport = nullptr;
-#endif
-
-// Number of MesaInterop objects currently bound, shared by all instances.
-// NOTE(review): the template argument of std::atomic appears to have been
-// lost in extraction (presumably an integer type) -- restore it from the
-// original source before building.
-std::atomic MesaInterop::refCount(0);
-
-//! Reports whether Mesa GL interop is available in this build:
-//! false when compiled for Windows, true otherwise.
-bool MesaInterop::Supported()
-{
-#if defined(_WIN32)
-    const bool available = false;
-#else
-    const bool available = true;
-#endif
-    return available;
-}
-
-//! Looks up the Mesa interop entry points and binds this object to the
-//! requested subsystem.
-//!
-//! \param Kind     Requested subsystem(s); MESA_INTEROP_NONE is rejected
-//! \param Display  Display handle stored for later GetInfo()/Export() calls
-//! \param Context  Context handle stored for later GetInfo()/Export() calls
-//! \return true when at least one requested subsystem exposes both its
-//!         query and export entry points; false otherwise, when this object
-//!         is already bound, and always on Windows.
-bool MesaInterop::Bind(MESA_INTEROP_KIND Kind, const DisplayHandle& Display, const ContextHandle& Context)
-{
-#if defined(_WIN32)
-    return false;
-#else
-    if (MESA_INTEROP_NONE == Kind) {
-        return false;
-    }
-
-    if (MESA_INTEROP_NONE != kind) {
-        LogError("Error - MesaInterop Bind while already bound.");
-        return false;
-    }
-
-    // Resolve the query entry points first so a change of the loaded Mesa
-    // can be detected against what was cached by an earlier Bind().
-    void* glxinfo = dlsym(RTLD_DEFAULT, "MesaGLInteropGLXQueryDeviceInfo");
-    void* eglinfo = dlsym(RTLD_DEFAULT, "MesaGLInteropEGLQueryDeviceInfo");
-
-    const bool entryPointsChanged = (glxinfo != GlxInfo) || (eglinfo != EglInfo);
-    if (entryPointsChanged && (refCount != 0)) {
-        LogWarning("Warning - Mesa changed while holding interop contexts.");
-    }
-
-    GlxInfo = (PFNMESAGLINTEROPGLXQUERYDEVICEINFOPROC)glxinfo;
-    EglInfo = (PFNMESAGLINTEROPEGLQUERYDEVICEINFOPROC)eglinfo;
-    GlxExport = (PFNMESAGLINTEROPGLXEXPORTOBJECTPROC)dlsym(RTLD_DEFAULT, "MesaGLInteropGLXExportObject");
-    EglExport = (PFNMESAGLINTEROPEGLEXPORTOBJECTPROC)dlsym(RTLD_DEFAULT, "MesaGLInteropEGLExportObject");
-
-    // A subsystem counts as available only when both of its entry points resolved
-    uint32_t available = MESA_INTEROP_NONE;
-    if (GlxInfo && GlxExport) {
-        available |= MESA_INTEROP_GLX;
-    }
-    if (EglInfo && EglExport) {
-        available |= MESA_INTEROP_EGL;
-    }
-
-    kind = MESA_INTEROP_KIND(available & Kind);
-    display = Display;
-    context = Context;
-
-    if (MESA_INTEROP_NONE == kind) {
-        return false;
-    }
-    refCount++;
-    return true;
-
-#endif
-}
-
-//! Queries device information through the bound subsystem.
-//! \return true only when the Mesa call reports MESA_GLINTEROP_SUCCESS;
-//!         false when the bound kind is neither GLX nor EGL, and always
-//!         on Windows.
-bool MesaInterop::GetInfo(mesa_glinterop_device_info& info) const
-{
-#ifdef _WIN32
-    return false;
-#else
-    if (kind == MESA_INTEROP_GLX) {
-        return MESA_GLINTEROP_SUCCESS == GlxInfo(display.glxDisplay, context.glxContext, &info);
-    }
-    if (kind == MESA_INTEROP_EGL) {
-        return MESA_GLINTEROP_SUCCESS == EglInfo(display.eglDisplay, context.eglContext, &info);
-    }
-    return false;
-#endif
-}
-
-//! Exports a GL object through the bound subsystem.
-//! \return true only when the Mesa call reports MESA_GLINTEROP_SUCCESS;
-//!         false when the bound kind is neither GLX nor EGL, and always
-//!         on Windows.
-bool MesaInterop::Export (mesa_glinterop_export_in& in, mesa_glinterop_export_out& out) const
-{
-#ifdef _WIN32
-    return false;
-#else
-    if (kind == MESA_INTEROP_GLX) {
-        return MESA_GLINTEROP_SUCCESS == GlxExport(display.glxDisplay, context.glxContext, &in, &out);
-    }
-    if (kind == MESA_INTEROP_EGL) {
-        return MESA_GLINTEROP_SUCCESS == EglExport(display.eglDisplay, context.eglContext, &in, &out);
-    }
-    return false;
-#endif
-}
-
-}
-
-#endif // WITHOUT_HSA_BACKEND
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocglinterop.hpp b/projects/clr/rocclr/runtime/device/rocm/rocglinterop.hpp
deleted file mode 100644
index 219baa8cdd..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rocglinterop.hpp
+++ /dev/null
@@ -1,158 +0,0 @@
-//
-// Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
-//
-#pragma once
-
-#ifndef WITHOUT_HSA_BACKEND
-
-#ifdef _WIN32
-//GLX header cannot be included in Windows due to X11 header dependency
-#define MESA_GLINTEROP_NO_GLX
-#include "device/rocm/mesa_glinterop.h"
-//Give GLX parameters void* size
-typedef void Display;
-typedef void* GLXContext;
-#undef MESA_GLINTEROP_NO_GLX
-#else
-#include "device/rocm/mesa_glinterop.h"
-#endif
-
-#include "device/rocm/rocregisters.hpp"
-#include "hsa_ext_amd.h"
-
-#include
-
-namespace roc
-{
-
- //Specific typed container for version 1
- // Field order and sizes are significant: image_metadata::create() overlays
- // this struct on a descriptor via reinterpret_cast. Do not reorder.
- typedef struct metadata_amd_ci_vi_s {
- uint32_t version; // Must be 1
- uint32_t vendorID; // AMD | CZ
- SQ_IMG_RSRC_WORD0 word0;
- SQ_IMG_RSRC_WORD1 word1;
- SQ_IMG_RSRC_WORD2 word2;
- SQ_IMG_RSRC_WORD3 word3;
- SQ_IMG_RSRC_WORD4 word4;
- SQ_IMG_RSRC_WORD5 word5;
- SQ_IMG_RSRC_WORD6 word6;
- SQ_IMG_RSRC_WORD7 word7;
- // Zero-length trailing array (a compiler extension, not standard C++)
- uint32_t mip_offsets[0]; //Mip level offset bits [39:8] for each level (if any)
- } metadata_amd_ci_vi_t;
-
-    //! Editable, non-owning view over an image descriptor.
-    //!
-    //! create() attaches the view to a descriptor; the setters then patch the
-    //! resource words in place. The descriptor is never freed by this class.
-    //! Every setter returns false when called before a successful create().
-    class image_metadata
-    {
-    private:
-        metadata_amd_ci_vi_t* data;  //!< Attached descriptor, or nullptr
-
-        image_metadata(const image_metadata&)=delete;
-        image_metadata& operator=(const image_metadata&)=delete;
-
-    public:
-        image_metadata() : data(nullptr) {}
-        ~image_metadata() { data=nullptr; }
-
-        //! Attaches to \p image_desc.
-        //! \return false (leaving the view unchanged) when the descriptor is
-        //!         null, is not version 1, or the upper 16 bits of its
-        //!         deviceID are not 0x1002.
-        bool create(hsa_amd_image_descriptor_t* image_desc)
-        {
-            if(image_desc==nullptr)
-                return false;
-            if((image_desc->version!=1) || ((image_desc->deviceID>>16)!=0x1002))
-                return false;
-            data=reinterpret_cast<metadata_amd_ci_vi_t*>(image_desc);
-            return true;
-        }
-
-        //! Restricts the descriptor to the single mip level \p level.
-        //! \return false when \p level is beyond the current last level.
-        bool setMipLevel(uint32_t level)
-        {
-            if(data==nullptr)
-                return false;
-            if(level>data->word3.bits.last_level)
-                return false;
-            data->word3.bits.base_level=level;
-            data->word3.bits.last_level=level;
-            return true;
-        }
-
-        //! Restricts the descriptor to the single array slice \p layer and
-        //! switches its type to SQ_RSRC_IMG_2D_ARRAY.
-        bool setLayer(uint32_t layer)
-        {
-            if(data==nullptr)
-                return false;
-            data->word3.bits.type=SQ_RSRC_IMG_2D_ARRAY;
-            data->word5.bits.last_array=layer;
-            data->word5.bits.base_array=layer;
-            return true;
-        }
-
-        //! Selects one face of a cube map.
-        //! \return false when \p face is not one of the six faces starting
-        //!         at GL_TEXTURE_CUBE_MAP_POSITIVE_X, or when the descriptor
-        //!         is not of type SQ_RSRC_IMG_CUBE.
-        bool setFace(GLenum face)
-        {
-            if(data==nullptr)
-                return false;
-            int index=face-GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-            if(index<0 || index>5)
-                return false;
-            if(data->word3.bits.type!=SQ_RSRC_IMG_CUBE)
-                return false;
-            return setLayer(index);
-        }
-    };
-
-    //! Handle on the Mesa GL interop entry points for one display/context pair.
-    //!
-    //! Each bound object (kind != MESA_INTEROP_NONE) holds one unit of the
-    //! shared refCount. Binding, copying and unbinding keep that count in
-    //! step with the number of bound objects.
-    class MesaInterop
-    {
-    public:
-
-        enum MESA_INTEROP_KIND { MESA_INTEROP_NONE=0, MESA_INTEROP_GLX=1, MESA_INTEROP_EGL=2 };
-
-        union DisplayHandle
-        {
-            Display* glxDisplay;
-            EGLDisplay eglDisplay;
-        };
-
-        union ContextHandle
-        {
-            GLXContext glxContext;
-            EGLContext eglContext;
-        };
-
-        //True if the configuration supports the indicated interop ability.
-        static bool Supported();
-
-        //! Creates an unbound object; handles are zero-initialised so that
-        //! copying an unbound object never reads indeterminate values.
-        MesaInterop() : display(), context(), kind(MESA_INTEROP_NONE) {}
-
-        //! Copies the binding of \p rhs and takes a reference of its own
-        //! when \p rhs is bound.
-        MesaInterop(const MesaInterop& rhs)
-            : display(rhs.display), context(rhs.context), kind(rhs.kind)
-        {
-            if(kind!=MESA_INTEROP_NONE)
-                refCount++;
-        }
-
-        ~MesaInterop() { Unbind(); }
-
-        //! Replaces this object's binding with that of \p rhs.
-        //! The current binding (if any) gives its reference back first, and
-        //! self-assignment is a no-op, so refCount stays balanced.
-        const MesaInterop& operator=(const MesaInterop& rhs)
-        {
-            if(this==&rhs)
-                return *this;
-            Unbind();
-            display=rhs.display;
-            context=rhs.context;
-            kind=rhs.kind;
-            if(kind!=MESA_INTEROP_NONE)
-                refCount++;
-            return *this;
-        }
-
-        /*
-        Loads Mesa interop APIs and sets this interface object to use the indicated
-        subsystem (GLX/EGL). Returns true if the required subsystem is found.
-        */
-        bool Bind(MESA_INTEROP_KIND Kind, const DisplayHandle& Display, const ContextHandle& Context);
-
-        /*
-        Releases use of Mesa interop APIs.
-        Used to check for bad load/unload sequences.
-        */
-        void Unbind()
-        {
-            if(kind==MESA_INTEROP_NONE) return;
-            assert(refCount>0 && "Invalid refCount in MesaInterop.");
-            refCount--;
-            kind=MESA_INTEROP_NONE;
-        }
-
-        bool GetInfo(mesa_glinterop_device_info& info) const;
-
-        bool Export (mesa_glinterop_export_in& in, mesa_glinterop_export_out& out) const;
-
-    private:
-        // Number of bound MesaInterop objects.
-        // NOTE(review): the template argument of std::atomic appears to have
-        // been lost in extraction (presumably an integer type) -- restore it
-        // from the original source before building.
-        static std::atomic refCount;
-
-        DisplayHandle display;
-        ContextHandle context;
-        MESA_INTEROP_KIND kind;
-    };
-
-}
-
-#endif /*WITHOUT_HSA_BACKEND*/
-
diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
deleted file mode 100644
index 1c08135897..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
+++ /dev/null
@@ -1,680 +0,0 @@
-//
-// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved.
-//
-
-#include "rockernel.hpp"
-#include "SCHSAInterface.h"
-#include "amd_hsa_kernel_code.h"
-
-#include
-
-#ifndef WITHOUT_HSA_BACKEND
-
-namespace roc {
-
-//! Maps the compiler-library argument kind to the HSAIL argument kind.
-//! ARG_TYPE_ERROR and any unlisted kind map to HSAIL_ARGTYPE_ERROR.
-inline static HSAIL_ARG_TYPE
-GetHSAILArgType(const aclArgData* argInfo)
-{
-    HSAIL_ARG_TYPE mapped = HSAIL_ARGTYPE_ERROR;
-    switch (argInfo->type) {
-        case ARG_TYPE_POINTER: mapped = HSAIL_ARGTYPE_POINTER; break;
-        case ARG_TYPE_VALUE:   mapped = HSAIL_ARGTYPE_VALUE;   break;
-        case ARG_TYPE_IMAGE:   mapped = HSAIL_ARGTYPE_IMAGE;   break;
-        case ARG_TYPE_SAMPLER: mapped = HSAIL_ARGTYPE_SAMPLER; break;
-        default:
-            break;
-    }
-    return mapped;
-}
-
-//! Returns the alignment recorded for a pointer argument, or 1 for every
-//! other argument kind.
-inline static size_t
-GetHSAILArgAlignment(const aclArgData* argInfo)
-{
-    if (argInfo->type != ARG_TYPE_POINTER) {
-        return 1;
-    }
-    return argInfo->arg.pointer.align;
-}
-
-//! Returns the access type of a pointer argument.
-//! Non-pointer arguments yield HSAIL_ACCESS_TYPE_NONE; pointer arguments
-//! that are neither read-only nor write-only are treated as read-write.
-inline static HSAIL_ACCESS_TYPE
-GetHSAILArgAccessType(const aclArgData* argInfo)
-{
-    if (argInfo->type != ARG_TYPE_POINTER) {
-        return HSAIL_ACCESS_TYPE_NONE;
-    }
-
-    const auto access = argInfo->arg.pointer.type;
-    if (access == ACCESS_TYPE_RO) {
-        return HSAIL_ACCESS_TYPE_RO;
-    }
-    if (access == ACCESS_TYPE_WO) {
-        return HSAIL_ACCESS_TYPE_WO;
-    }
-    return HSAIL_ACCESS_TYPE_RW;
-}
-
-//! Classifies an argument as living in global or local (LDS) memory.
-//! Images and samplers are global. Pointers are classified by their memory
-//! type, logging an error for unsupported ones. Everything else yields
-//! HSAIL_ADDRESS_ERROR.
-inline static HSAIL_ADDRESS_QUALIFIER
-GetHSAILAddrQual(const aclArgData* argInfo)
-{
-    if (argInfo->type != ARG_TYPE_POINTER) {
-        const bool imageOrSampler = (argInfo->type == ARG_TYPE_IMAGE) ||
-                                    (argInfo->type == ARG_TYPE_SAMPLER);
-        return imageOrSampler ? HSAIL_ADDRESS_GLOBAL : HSAIL_ADDRESS_ERROR;
-    }
-
-    HSAIL_ADDRESS_QUALIFIER qualifier = HSAIL_ADDRESS_ERROR;
-    switch (argInfo->arg.pointer.memory) {
-        case PTR_MT_CONSTANT_EMU:
-        case PTR_MT_CONSTANT:
-        case PTR_MT_UAV:
-        case PTR_MT_GLOBAL:
-            qualifier = HSAIL_ADDRESS_GLOBAL;
-            break;
-        case PTR_MT_LDS_EMU:
-        case PTR_MT_LDS:
-            qualifier = HSAIL_ADDRESS_LOCAL;
-            break;
-        case PTR_MT_ERROR:
-        default:
-            LogError("Unsupported address type");
-            break;
-    }
-    return qualifier;
-}
-
-/* Maps the element data type of a pointer or value argument to the HSAIL
- * data type. Other argument kinds yield HSAIL_DATATYPE_ERROR.
- * f16 returns f32 - workaround due to comp lib */
-inline static HSAIL_DATA_TYPE
-GetHSAILDataType(const aclArgData* argInfo)
-{
-    const bool isPointer = (argInfo->type == ARG_TYPE_POINTER);
-    if (!isPointer && (argInfo->type != ARG_TYPE_VALUE)) {
-        return HSAIL_DATATYPE_ERROR;
-    }
-
-    const aclArgDataType elemType =
-        isPointer ? argInfo->arg.pointer.data : argInfo->arg.value.data;
-
-    HSAIL_DATA_TYPE mapped = HSAIL_DATATYPE_ERROR;
-    switch (elemType) {
-        case DATATYPE_i1:     mapped = HSAIL_DATATYPE_B1;     break;
-        case DATATYPE_i8:     mapped = HSAIL_DATATYPE_S8;     break;
-        case DATATYPE_i16:    mapped = HSAIL_DATATYPE_S16;    break;
-        case DATATYPE_i32:    mapped = HSAIL_DATATYPE_S32;    break;
-        case DATATYPE_i64:    mapped = HSAIL_DATATYPE_S64;    break;
-        case DATATYPE_u8:     mapped = HSAIL_DATATYPE_U8;     break;
-        case DATATYPE_u16:    mapped = HSAIL_DATATYPE_U16;    break;
-        case DATATYPE_u32:    mapped = HSAIL_DATATYPE_U32;    break;
-        case DATATYPE_u64:    mapped = HSAIL_DATATYPE_U64;    break;
-        case DATATYPE_f16:    // deliberately reported as F32, see above
-        case DATATYPE_f32:    mapped = HSAIL_DATATYPE_F32;    break;
-        case DATATYPE_f64:    mapped = HSAIL_DATATYPE_F64;    break;
-        case DATATYPE_struct: mapped = HSAIL_DATATYPE_STRUCT; break;
-        case DATATYPE_opaque: mapped = HSAIL_DATATYPE_OPAQUE; break;
-        case DATATYPE_ERROR:
-        default:
-            break;
-    }
-    return mapped;
-}
-
-// Returns the size of an argument in bytes, or -1 when it cannot be
-// determined. Pointers, images and samplers are pointer-sized; value
-// arguments are sized from their HSAIL data type (structs report their
-// numElements field).
-inline static int
-GetHSAILArgSize(const aclArgData *argInfo)
-{
-    switch (argInfo->type) {
-        case ARG_TYPE_POINTER:
-        case ARG_TYPE_IMAGE:
-        case ARG_TYPE_SAMPLER:
-            return sizeof(void*);
-        case ARG_TYPE_VALUE:
-            break;   // sized below
-        default:
-            return -1;
-    }
-
-    int bytes = -1;
-    switch (GetHSAILDataType(argInfo)) {
-        case HSAIL_DATATYPE_B1:
-        case HSAIL_DATATYPE_B8:
-        case HSAIL_DATATYPE_S8:
-        case HSAIL_DATATYPE_U8:
-            bytes = 1;
-            break;
-        case HSAIL_DATATYPE_B16:
-        case HSAIL_DATATYPE_U16:
-        case HSAIL_DATATYPE_S16:
-        case HSAIL_DATATYPE_F16:
-            bytes = 2;
-            break;
-        case HSAIL_DATATYPE_B32:
-        case HSAIL_DATATYPE_U32:
-        case HSAIL_DATATYPE_S32:
-        case HSAIL_DATATYPE_F32:
-            bytes = 4;
-            break;
-        case HSAIL_DATATYPE_B64:
-        case HSAIL_DATATYPE_U64:
-        case HSAIL_DATATYPE_S64:
-        case HSAIL_DATATYPE_F64:
-            bytes = 8;
-            break;
-        case HSAIL_DATATYPE_STRUCT:
-            bytes = argInfo->arg.value.numElements;
-            break;
-        default:
-            break;
-    }
-    return bytes;
-}
-
-//! Maps an argument to its OpenCL value type.
-//! Pointers and images are T_POINTER, samplers are T_SAMPLER. Value
-//! arguments are looked up by element type (row) and vector width (column);
-//! unsupported element types or widths yield T_VOID, as does any other
-//! argument kind.
-inline static clk_value_type_t
-GetOclType(const aclArgData* argInfo)
-{
-    static const clk_value_type_t ClkValueMapType[6][6] = {
-        { T_CHAR,   T_CHAR2,   T_CHAR3,   T_CHAR4,   T_CHAR8,   T_CHAR16   },
-        { T_SHORT,  T_SHORT2,  T_SHORT3,  T_SHORT4,  T_SHORT8,  T_SHORT16  },
-        { T_INT,    T_INT2,    T_INT3,    T_INT4,    T_INT8,    T_INT16    },
-        { T_LONG,   T_LONG2,   T_LONG3,   T_LONG4,   T_LONG8,   T_LONG16   },
-        { T_FLOAT,  T_FLOAT2,  T_FLOAT3,  T_FLOAT4,  T_FLOAT8,  T_FLOAT16  },
-        { T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 },
-    };
-
-    if ((argInfo->type == ARG_TYPE_POINTER) || (argInfo->type == ARG_TYPE_IMAGE)) {
-        return T_POINTER;
-    }
-    if (argInfo->type == ARG_TYPE_SAMPLER) {
-        return T_SAMPLER;
-    }
-    if (argInfo->type != ARG_TYPE_VALUE) {
-        return T_VOID;
-    }
-
-    // Row: element type (signed and unsigned share a row, f16 shares with f32)
-    uint row;
-    switch (argInfo->arg.value.data) {
-        case DATATYPE_i8:
-        case DATATYPE_u8:  row = 0; break;
-        case DATATYPE_i16:
-        case DATATYPE_u16: row = 1; break;
-        case DATATYPE_i32:
-        case DATATYPE_u32: row = 2; break;
-        case DATATYPE_i64:
-        case DATATYPE_u64: row = 3; break;
-        case DATATYPE_f16:
-        case DATATYPE_f32: row = 4; break;
-        case DATATYPE_f64: row = 5; break;
-        default:
-            return T_VOID;
-    }
-
-    // Column: vector width
-    uint column;
-    switch (argInfo->arg.value.numElements) {
-        case 1:  column = 0; break;
-        case 2:  column = 1; break;
-        case 3:  column = 2; break;
-        case 4:  column = 3; break;
-        case 8:  column = 4; break;
-        case 16: column = 5; break;
-        default:
-            return T_VOID;
-    }
-    return ClkValueMapType[row][column];
-}
-
-//! Returns the OpenCL address qualifier of an argument.
-//! Images are global, pointers are classified by their memory type, and
-//! everything else (including unlisted pointer memory types) is private.
-inline static cl_kernel_arg_address_qualifier
-GetOclAddrQual(const aclArgData* argInfo)
-{
-    if (argInfo->type == ARG_TYPE_IMAGE) {
-        return CL_KERNEL_ARG_ADDRESS_GLOBAL;
-    }
-    if (argInfo->type != ARG_TYPE_POINTER) {
-        return CL_KERNEL_ARG_ADDRESS_PRIVATE;
-    }
-
-    cl_kernel_arg_address_qualifier qualifier = CL_KERNEL_ARG_ADDRESS_PRIVATE;
-    switch (argInfo->arg.pointer.memory) {
-        case PTR_MT_UAV:
-        case PTR_MT_GLOBAL:
-            qualifier = CL_KERNEL_ARG_ADDRESS_GLOBAL;
-            break;
-        case PTR_MT_CONSTANT:
-        case PTR_MT_UAV_CONSTANT:
-        case PTR_MT_CONSTANT_EMU:
-            qualifier = CL_KERNEL_ARG_ADDRESS_CONSTANT;
-            break;
-        case PTR_MT_LDS_EMU:
-        case PTR_MT_LDS:
-            qualifier = CL_KERNEL_ARG_ADDRESS_LOCAL;
-            break;
-        default:
-            break;
-    }
-    return qualifier;
-}
-
-//! Returns the OpenCL access qualifier of an image argument.
-//! Non-image arguments and unlisted access types yield
-//! CL_KERNEL_ARG_ACCESS_NONE.
-inline static cl_kernel_arg_access_qualifier
-GetOclAccessQual(const aclArgData* argInfo)
-{
-    if (argInfo->type != ARG_TYPE_IMAGE) {
-        return CL_KERNEL_ARG_ACCESS_NONE;
-    }
-
-    const auto access = argInfo->arg.image.type;
-    if (access == ACCESS_TYPE_RO) {
-        return CL_KERNEL_ARG_ACCESS_READ_ONLY;
-    }
-    if (access == ACCESS_TYPE_WO) {
-        return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
-    }
-    if (access == ACCESS_TYPE_RW) {
-        return CL_KERNEL_ARG_ACCESS_READ_WRITE;
-    }
-    return CL_KERNEL_ARG_ACCESS_NONE;
-}
-
-//! Builds the OpenCL type-qualifier bit mask of an argument.
-//! Only pointer arguments carry qualifiers. Pointers into constant memory
-//! are reported as const even when the argument is not flagged const.
-inline static cl_kernel_arg_type_qualifier
-GetOclTypeQual(const aclArgData* argInfo)
-{
-    cl_kernel_arg_type_qualifier mask = CL_KERNEL_ARG_TYPE_NONE;
-    if (argInfo->type != ARG_TYPE_POINTER) {
-        return mask;
-    }
-
-    if (argInfo->arg.pointer.isVolatile) {
-        mask |= CL_KERNEL_ARG_TYPE_VOLATILE;
-    }
-    if (argInfo->arg.pointer.isRestrict) {
-        mask |= CL_KERNEL_ARG_TYPE_RESTRICT;
-    }
-    if (argInfo->isConst) {
-        mask |= CL_KERNEL_ARG_TYPE_CONST;
-    }
-
-    switch (argInfo->arg.pointer.memory) {
-        case PTR_MT_CONSTANT:
-        case PTR_MT_UAV_CONSTANT:
-        case PTR_MT_CONSTANT_EMU:
-            mask |= CL_KERNEL_ARG_TYPE_CONST;
-            break;
-        default:
-            break;
-    }
-    return mask;
-}
-
-//! Returns the size in bytes of an argument as seen by the OpenCL API, or
-//! -1 when it cannot be determined. Value arguments are the element width
-//! times numElements (structs use an element width of 1).
-static int
-GetOclSize(const aclArgData* argInfo)
-{
-    if (argInfo->type == ARG_TYPE_POINTER) {
-        return sizeof(void *);
-    }
-    if (argInfo->type == ARG_TYPE_IMAGE) {
-        return sizeof(cl_mem);
-    }
-    if (argInfo->type == ARG_TYPE_SAMPLER) {
-        return sizeof(cl_sampler);
-    }
-    if (argInfo->type != ARG_TYPE_VALUE) {
-        return -1;
-    }
-
-    int elementBytes;
-    switch (argInfo->arg.value.data) {
-        case DATATYPE_i8:
-        case DATATYPE_u8:
-        case DATATYPE_struct:
-            elementBytes = 1;
-            break;
-        case DATATYPE_u16:
-        case DATATYPE_i16:
-        case DATATYPE_f16:
-            elementBytes = 2;
-            break;
-        case DATATYPE_u32:
-        case DATATYPE_i32:
-        case DATATYPE_f32:
-            elementBytes = 4;
-            break;
-        case DATATYPE_i64:
-        case DATATYPE_u64:
-        case DATATYPE_f64:
-            elementBytes = 8;
-            break;
-        case DATATYPE_ERROR:
-        default:
-            return -1;
-    }
-    return elementBytes * argInfo->arg.value.numElements;
-}
-
-//! Wraps one compiler-library argument record and caches its name and type
-//! name as strings. The record itself is only referenced, not copied.
-KernelArg::KernelArg(aclArgData *argInfo)
-{
-    argInfo_ = argInfo;
-    typeName_ = argInfo->typeStr;
-    name_ = argInfo->argStr;
-}
-
-//! Returns the size in bytes of the wrapped argument, or -1 when it cannot
-//! be determined.
-//!
-//! The sizing rules are exactly those of GetOclSize() earlier in this file,
-//! so this delegates to it instead of repeating the switch.
-int KernelArg::size() {
-    return GetOclSize(argInfo_);
-}
-
-//! Returns the argument's name (mutable reference to the cached string).
-std::string&
-KernelArg::name()
-{
-    return name_;
-}
-
-//! Returns the argument's type name (mutable reference to the cached string).
-std::string&
-KernelArg::typeName() {
-    return typeName_;
-}
-
-//! Builds the kernel signature from the compiler's argument array.
-//!
-//! The HSAIL argument list is filled first (initHsailArgs). Then one
-//! parameter descriptor is produced per user argument, skipping the first
-//! MaxExtraArgumentsNum reserved entries. The array is terminated by an
-//! entry whose struct_size is 0.
-void
-Kernel::initArgList(const aclArgData* aclArg)
-{
-    // Initialize the hsail argument list too
-    initHsailArgs(aclArg);
-
-    device::Kernel::parameters_t params;
-    amd::KernelParameterDescriptor desc;
-    size_t offset = 0;
-
-    // Skip the arguments reserved for the HSAIL launch
-    const aclArgData* current = aclArg + MaxExtraArgumentsNum;
-    uint i = 0;
-    while (current->struct_size != 0) {
-        desc.name_ = hsailArgList_[i]->name_.c_str();
-        desc.typeName_ = hsailArgList_[i]->typeName_.c_str();
-        desc.type_ = GetOclType(current);
-        desc.addressQualifier_ = GetOclAddrQual(current);
-        desc.accessQualifier_ = GetOclAccessQual(current);
-        desc.typeQualifier_ = GetOclTypeQual(current);
-
-        // Local arguments report a size of 0
-        const bool isLocal = (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL);
-        desc.size_ = isLocal ? 0 : GetOclSize(current);
-
-        // Make offset alignment to match CPU metadata, since
-        // in multidevice config abstraction layer has a single signature
-        // and CPU sends the parameters as they are allocated in memory.
-        // A zero size (local memory) occupies a cl_mem-sized slot.
-        size_t size = desc.size_;
-        if (size == 0) {
-            size = sizeof(cl_mem);
-        }
-        offset = amd::alignUp(offset, std::min(size, size_t(16)));
-        desc.offset_ = offset;
-        offset += amd::alignUp(size, sizeof(uint32_t));
-
-        params.push_back(desc);
-
-        ++i;
-        ++current;
-    }
-    createSignature(params);
-}
-
-//! Fills hsailArgList_ with one heap-allocated HsailKernelArg per user
-//! argument, skipping the first MaxExtraArgumentsNum reserved entries.
-//! Offsets are the running sum of the argument sizes. The records are
-//! released in ~Kernel().
-void
-Kernel::initHsailArgs(const aclArgData* aclArg)
-{
-    int runningOffset = 0;
-
-    // Skip the arguments reserved for the HSAIL launch
-    const aclArgData* current = aclArg + MaxExtraArgumentsNum;
-
-    while (current->struct_size != 0) {
-        const int argSize = GetHSAILArgSize(current);
-
-        // Non-struct values keep their vector width; everything else counts as one element
-        const bool isPlainValue = (current->type == ARG_TYPE_VALUE) &&
-                                  (current->arg.value.data != DATATYPE_struct);
-
-        HsailKernelArg* record = new HsailKernelArg;
-        record->name_ = current->argStr;
-        record->typeName_ = current->typeStr;
-        record->size_ = argSize;
-        record->offset_ = runningOffset;
-        record->type_ = GetHSAILArgType(current);
-        record->addrQual_ = GetHSAILAddrQual(current);
-        record->dataType_ = GetHSAILDataType(current);
-        record->numElem_ = isPlainValue ? current->arg.value.numElements : 1;
-        record->alignment_ = GetHSAILArgAlignment(current);
-        record->access_ = GetHSAILArgAccessType(current);
-        hsailArgList_.push_back(record);
-
-        runningOffset += argSize;
-        ++current;
-    }
-}
-
-//! Stores the kernel's owning program, its code handle and the segment
-//! sizes supplied by the caller. No queries are made here; see init().
-Kernel::Kernel(
-    std::string name,
-    HSAILProgram* prog,
-    const uint64_t& kernelCodeHandle,
-    const uint32_t workgroupGroupSegmentByteSize,
-    const uint32_t workitemPrivateSegmentByteSize,
-    const uint32_t kernargSegmentByteSize,
-    const uint32_t kernargSegmentAlignment,
-    uint extraArgsNum)
-    : device::Kernel(name)
-    , program_(prog)
-    , kernelCodeHandle_(kernelCodeHandle)
-    , workgroupGroupSegmentByteSize_(workgroupGroupSegmentByteSize)
-    , workitemPrivateSegmentByteSize_(workitemPrivateSegmentByteSize)
-    , kernargSegmentByteSize_(kernargSegmentByteSize)
-    , kernargSegmentAlignment_(kernargSegmentAlignment)
-    , extraArgumentsNum_(extraArgsNum)
-{
-}
-
-//! Pulls the kernel's metadata out of the program binary and fills in the
-//! signature, work-group information and printf table.
-//!
-//! Each piece of metadata is fetched with two _aclQueryInfo calls: the first
-//! (NULL buffer) reports the size, the second fills the buffer.
-//!
-//! \return false as soon as any compiler-library or HSA query fails.
-bool Kernel::init(){
-    acl_error errorCode;
-    aclCompiler* compileHandle = program_->dev().compiler();
-    const std::string openClKernelName("&__OpenCL_" + name() + "_kernel");
-
-    // Pull out the argument metadata from the ELF
-    size_t sizeOfArgList;
-    errorCode = g_complibApi._aclQueryInfo(compileHandle,
-                                           program_->binaryElf(),
-                                           RT_ARGUMENT_ARRAY,
-                                           openClKernelName.c_str(),
-                                           NULL,
-                                           &sizeOfArgList);
-    if (errorCode != ACL_SUCCESS) {
-        return false;
-    }
-    std::unique_ptr<char[]> argList(new char[sizeOfArgList]);
-    errorCode = g_complibApi._aclQueryInfo(compileHandle,
-                                           program_->binaryElf(),
-                                           RT_ARGUMENT_ARRAY,
-                                           openClKernelName.c_str(),
-                                           argList.get(),
-                                           &sizeOfArgList);
-    if (errorCode != ACL_SUCCESS) {
-        return false;
-    }
-    // Set the argList
-    initArgList(reinterpret_cast<const aclArgData*>(argList.get()));
-
-    // Set the workgroup information for the kernel
-    memset(&workGroupInfo_, 0, sizeof(workGroupInfo_));
-    workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_;
-    assert(workGroupInfo_.availableLDSSize_ > 0);
-    workGroupInfo_.availableSGPRs_ = 0;
-    workGroupInfo_.availableVGPRs_ = 0;
-    size_t sizeOfWorkGroupSize;
-    errorCode = g_complibApi._aclQueryInfo(compileHandle,
-                                           program_->binaryElf(),
-                                           RT_WORK_GROUP_SIZE,
-                                           openClKernelName.c_str(),
-                                           NULL,
-                                           &sizeOfWorkGroupSize);
-    if (errorCode != ACL_SUCCESS) {
-        return false;
-    }
-    errorCode = g_complibApi._aclQueryInfo(compileHandle,
-                                           program_->binaryElf(),
-                                           RT_WORK_GROUP_SIZE,
-                                           openClKernelName.c_str(),
-                                           workGroupInfo_.compileSize_,
-                                           &sizeOfWorkGroupSize);
-    if (errorCode != ACL_SUCCESS) {
-        return false;
-    }
-
-    uint32_t wavefront_size = 0;
-    if (HSA_STATUS_SUCCESS !=
-        hsa_agent_get_info(
-            program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE,
-            &wavefront_size)) {
-        return false;
-    }
-    assert(wavefront_size > 0);
-
-    // Setting it the same as used LDS.
-    workGroupInfo_.localMemSize_ = workgroupGroupSegmentByteSize_;
-    workGroupInfo_.privateMemSize_ = workitemPrivateSegmentByteSize_;
-    workGroupInfo_.usedLDSSize_ = workgroupGroupSegmentByteSize_;
-    workGroupInfo_.preferredSizeMultiple_ = wavefront_size;
-    workGroupInfo_.usedSGPRs_ = 0;
-    workGroupInfo_.usedStackSize_ = 0;
-    workGroupInfo_.usedVGPRs_ = 0;
-    workGroupInfo_.wavefrontPerSIMD_ =
-        program_->dev().info().maxWorkItemSizes_[0] / wavefront_size;
-    workGroupInfo_.wavefrontSize_ = wavefront_size;
-    if (workGroupInfo_.compileSize_[0] != 0) {
-        // A compile-time work-group size fixes the total size
-        workGroupInfo_.size_ =
-            workGroupInfo_.compileSize_[0] *
-            workGroupInfo_.compileSize_[1] *
-            workGroupInfo_.compileSize_[2];
-    }
-    else {
-        workGroupInfo_.size_ = program_->dev().info().maxWorkGroupSize_;
-    }
-
-    // Pull out printf metadata from the ELF
-    size_t sizeOfPrintfList;
-    errorCode = g_complibApi._aclQueryInfo(compileHandle, program_->binaryElf(), RT_GPU_PRINTF_ARRAY,
-                                           openClKernelName.c_str(), NULL, &sizeOfPrintfList);
-    if (errorCode != ACL_SUCCESS){
-        return false;
-    }
-
-    // Make sure kernel has any printf info
-    if (0 != sizeOfPrintfList) {
-        // new[] throws on failure, so no null check is needed here
-        std::unique_ptr<char[]> aclPrintfList(new char[sizeOfPrintfList]);
-        errorCode = g_complibApi._aclQueryInfo(
-            compileHandle, program_->binaryElf(), RT_GPU_PRINTF_ARRAY,
-            openClKernelName.c_str(), aclPrintfList.get(), &sizeOfPrintfList);
-        if (errorCode != ACL_SUCCESS) {
-            return false;
-        }
-
-        // Set the Printf List
-        initPrintf(reinterpret_cast<const aclPrintfFmt*>(aclPrintfList.get()));
-    }
-    return true;
-}
-
-//! Builds the per-kernel printf table from the compiler's printf metadata.
-//!
-//! \p aclPrintf points at an array terminated by an entry whose struct_size
-//! is 0. Each entry is stored at printf_[ID]. Its format string has the
-//! two-character escapes \a \b \f \n \r \v and the octal escape \72
-//! (':') translated into the characters they stand for, and a newline is
-//! appended. Any other backslash sequence is copied through unchanged.
-void Kernel::initPrintf(const aclPrintfFmt* aclPrintf) {
-    for (; aclPrintf->struct_size != 0; aclPrintf++) {
-        const uint index = aclPrintf->ID;
-        if (printf_.size() <= index) {
-            printf_.resize(index + 1);
-        }
-
-        // Start from a fresh record so neither the format text nor the
-        // argument sizes of a previous entry leak into this one.
-        PrintfInfo info;
-        const std::string pfmt = aclPrintf->fmtStr;
-
-        // Walk the string with a single cursor bounded by the string length;
-        // an escape consumes extra characters only when they are present.
-        size_t pos = 0;
-        while (pos < pfmt.size()) {
-            char symbol = pfmt[pos++];
-            if ((symbol == '\\') && (pos < pfmt.size())) {
-                // Rest of the C escape sequences (e.g. \') are handled correctly
-                // by the MDParser, we are not sure exactly how!
-                switch (pfmt[pos]) {
-                    case 'a': symbol = '\a'; pos++; break;
-                    case 'b': symbol = '\b'; pos++; break;
-                    case 'f': symbol = '\f'; pos++; break;
-                    case 'n': symbol = '\n'; pos++; break;
-                    case 'r': symbol = '\r'; pos++; break;
-                    case 'v': symbol = '\v'; pos++; break;
-                    case '7':
-                        // Only the exact sequence "\72" is translated
-                        if ((pos + 1 < pfmt.size()) && (pfmt[pos + 1] == '2')) {
-                            symbol = '\72';
-                            pos += 2;
-                        }
-                        break;
-                    default:
-                        break;
-                }
-            }
-            info.fmtString_.push_back(symbol);
-        }
-        info.fmtString_ += "\n";
-
-        for (uint i = 0; i < aclPrintf->numSizes; i++) {
-            info.arguments_.push_back(aclPrintf->argSizes[i]);
-        }
-        printf_[index] = info;
-    }
-}
-
-
-//! Releases the HsailKernelArg records allocated by initHsailArgs(),
-//! last one first.
-Kernel::~Kernel() {
-    for (size_t remaining = hsailArgList_.size(); remaining > 0; --remaining) {
-        delete hsailArgList_[remaining - 1];
-    }
-    hsailArgList_.clear();
-}
-
-} // namespace roc
-#endif // WITHOUT_HSA_BACKEND
diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp
deleted file mode 100644
index 6a68e67aa5..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp
+++ /dev/null
@@ -1,195 +0,0 @@
-//
-// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved.
-//
-#pragma once
-
-#include
-#include "acl.h"
-#include "rocprogram.hpp"
-#include "top.hpp"
-#include "rocprintf.hpp"
-
-#ifndef WITHOUT_HSA_BACKEND
-
-namespace roc {
-
-#define MAX_INFO_STRING_LEN 0x40
-enum HSAIL_ADDRESS_QUALIFIER{
-HSAIL_ADDRESS_ERROR=0,
-HSAIL_ADDRESS_GLOBAL,
-HSAIL_ADDRESS_LOCAL,
-HSAIL_MAX_ADDRESS_QUALIFIERS
-} ;
-
-enum HSAIL_ARG_TYPE{
-HSAIL_ARGTYPE_ERROR=0,
-HSAIL_ARGTYPE_POINTER,
-HSAIL_ARGTYPE_VALUE,
-HSAIL_ARGTYPE_IMAGE,
-HSAIL_ARGTYPE_SAMPLER,
-HSAIL_ARGMAX_ARG_TYPES
-};
-
-enum HSAIL_DATA_TYPE{
-HSAIL_DATATYPE_ERROR=0,
-HSAIL_DATATYPE_B1,
-HSAIL_DATATYPE_B8,
-HSAIL_DATATYPE_B16,
-HSAIL_DATATYPE_B32,
-HSAIL_DATATYPE_B64,
-HSAIL_DATATYPE_S8,
-HSAIL_DATATYPE_S16,
-HSAIL_DATATYPE_S32,
-HSAIL_DATATYPE_S64,
-HSAIL_DATATYPE_U8,
-HSAIL_DATATYPE_U16,
-HSAIL_DATATYPE_U32,
-HSAIL_DATATYPE_U64,
-HSAIL_DATATYPE_F16,
-HSAIL_DATATYPE_F32,
-HSAIL_DATATYPE_F64,
-HSAIL_DATATYPE_STRUCT,
-HSAIL_DATATYPE_OPAQUE,
-HSAIL_DATATYPE_MAX_TYPES
-};
-
-enum HSAIL_ACCESS_TYPE {
- HSAIL_ACCESS_TYPE_NONE = 0,
- HSAIL_ACCESS_TYPE_RO,
- HSAIL_ACCESS_TYPE_WO,
- HSAIL_ACCESS_TYPE_RW
-};
-
-struct HsailKernelArg
-{
- std::string name_; //!< Argument's name
- std::string typeName_; //!< Argument's type name
- uint size_; //!< Size in bytes
- uint offset_; //!< Argument's offset
- uint alignment_; //!< Argument's alignment
- HSAIL_ARG_TYPE type_; //!< Type of the argument
- HSAIL_ADDRESS_QUALIFIER addrQual_; //!< Address qualifier of the argument
- HSAIL_DATA_TYPE dataType_; //!< The type of data
- uint numElem_; //!< Number of elements
- HSAIL_ACCESS_TYPE access_; //!< Access type for the argument
-};
-
-class KernelArg
-{
-public:
- KernelArg(aclArgData* argInfo);
- //! Return type of the argument
- clk_value_type_t amdoclType();
- //! Global, local etc - returns amdocl types
- clk_address_space_t amdoclAddrQual();
- //! Global,localetc - returns opencl type
- cl_kernel_arg_address_qualifier oclAddrQual();
- //! read , write etc - returns amdocl type
- clk_arg_qualifier_t amdoclAccessQual();
- //! read , write etc - returns opencl type type
- cl_kernel_arg_access_qualifier oclAccessQual();
- //! const,volatile,restrict etc - returns opencl type type
- cl_kernel_arg_type_qualifier oclTypeQual();
-
- //! Name of the argument
- std::string& name();
- //! Name of the argument
- std::string& typeName();
- //! reflection
- std::string reflection(){ return name(); };
- //! Returns the size of the argument
- int size();
- //! returns the offset
- int offset();
-
- void setOffset();
-
-private:
- aclArgData* argInfo_;
- int offset_;
- std::string name_;
- std::string typeName_;
-};
-
-class Kernel : public device::Kernel
-{
-public:
- Kernel(std::string name,
- HSAILProgram* prog,
- const uint64_t &kernelCodeHandle,
- const uint32_t workgroupGroupSegmentByteSize,
- const uint32_t workitemPrivateSegmentByteSize,
- const uint32_t kernargSegmentByteSize,
- const uint32_t kernargSegmentAlignment,
- uint extraArgsNum);
-
- const uint64_t& KernelCodeHandle() {
- return kernelCodeHandle_;
- }
-
- const uint32_t WorkgroupGroupSegmentByteSize() const {
- return workgroupGroupSegmentByteSize_;
- }
-
- const uint32_t workitemPrivateSegmentByteSize() const {
- return workitemPrivateSegmentByteSize_;
- }
-
- const uint64_t KernargSegmentByteSize() const {
- return kernargSegmentByteSize_;
- }
-
- const uint8_t KernargSegmentAlignment() const {
- return kernargSegmentAlignment_;
- }
-
- ~Kernel();
-
- //! Initializes the metadata required for this kernel
- bool init();
-
- const HSAILProgram* program() {
- return static_cast(program_);
- }
-
- //! Returns a pointer to the hsail argument at the specified index
- HsailKernelArg* hsailArgAt(size_t index) const {
- return hsailArgList_[index];
- }
-
- //! Max number of possible extra (hidden) kernel arguments
- static const uint MaxExtraArgumentsNum = 6;
-
- uint extraArgumentsNum() const { return extraArgumentsNum_; }
-
- //! Return printf info array
- const std::vector& printfInfo() const {return printf_;}
-
-private:
- //! Populates hsailArgList_
- void initArgList(const aclArgData* aclArg);
-
- //! Initializes Hsail Argument metadata and info ;
- void initHsailArgs(const aclArgData* aclArg);
-
- //! Initializes HSAIL Printf metadata and info
- void initPrintf(const aclPrintfFmt* aclPrintf);
-
- HSAILProgram *program_; //!< The roc::HSAILProgram context
- std::vector hsailArgList_; //!< Vector list of HSAIL Arguments
- std::string compileOptions_; //!< compile used for finalizing this kernel
- uint64_t kernelCodeHandle_; //!< Kernel code handle (aka amd_kernel_code_t)
- const uint32_t workgroupGroupSegmentByteSize_;
- const uint32_t workitemPrivateSegmentByteSize_;
- const uint32_t kernargSegmentByteSize_;
- const uint32_t kernargSegmentAlignment_;
- size_t kernelDirectiveOffset_;
- const uint extraArgumentsNum_; // Number of arguments in Kernenv
- std::vector printf_;
-};
-
-} // namespace roc
-
-#endif // WITHOUT_HSA_BACKEND
-
-
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
deleted file mode 100644
index e1eb87602e..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
+++ /dev/null
@@ -1,783 +0,0 @@
-//
-// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
-//
-
-#ifndef WITHOUT_HSA_BACKEND
-
-#if !defined(_WIN32)
-#include
-#endif
-
-#include "CL/cl_ext.h"
-
-#include "utils/util.hpp"
-#include "device/device.hpp"
-#include "device/rocm/rocmemory.hpp"
-#include "device/rocm/rocdevice.hpp"
-#include "device/rocm/rocblit.hpp"
-#include "device/rocm/rocglinterop.hpp"
-#include "thread/monitor.hpp"
-#include "platform/memory.hpp"
-#include "platform/sampler.hpp"
-#include "api/opencl/amdocl/cl_gl_amd.hpp"
-
-namespace roc {
-
-/////////////////////////////////roc::Memory//////////////////////////////
-Memory::Memory(const roc::Device &dev, amd::Memory &owner)
- : device::Memory(owner),
- dev_(dev),
- deviceMemory_(NULL),
- kind_(MEMORY_KIND_NORMAL)
-{
-}
-
-Memory::~Memory()
-{
- dev_.removeVACache(this);
-}
-
-bool
-Memory::allocateMapMemory(size_t allocationSize)
-{
- assert(mapMemory_ == NULL);
-
- void *mapData = NULL;
-
- amd::Memory* mapMemory = dev_.findMapTarget(owner()->getSize());
-
- if (mapMemory == nullptr) {
- // Create buffer object to contain the map target.
- mapMemory =
- new(owner()->getContext()) amd::Buffer(
- owner()->getContext(), CL_MEM_ALLOC_HOST_PTR, owner()->getSize());
-
- if ((mapMemory == NULL) || (!mapMemory->create())) {
- LogError("[OCL] Fail to allocate map target object");
- dev_.hostFree(mapData);
- if (mapMemory) {
- mapMemory->release();
- }
- return false;
- }
-
- roc::Memory* hsaMapMemory = reinterpret_cast(
- mapMemory->getDeviceMemory(dev_));
- if (hsaMapMemory == nullptr) {
- mapMemory->release();
- return false;
- }
- }
-
- mapMemory_ = mapMemory;
-
- return true;
-}
-
-void*
-Memory::allocMapTarget(
- const amd::Coord3D &origin,
- const amd::Coord3D ®ion,
- uint mapFlags,
- size_t *rowPitch,
- size_t *slicePitch)
-{
- // Map/Unmap must be serialized.
- amd::ScopedLock lock(owner()->lockMemoryOps());
-
- incIndMapCount();
-
- // If the device backing storage is direct accessible, use it.
- if (isHostMemDirectAccess()) {
- if (owner()->getHostMem() != nullptr) {
- return (static_cast(owner()->getHostMem()) + origin[0]);
- }
-
- return (static_cast(deviceMemory_) + origin[0]);
- }
-
- // Otherwise, check for host memory.
- void *hostMem = owner()->getHostMem();
- if (hostMem != NULL) {
- return (static_cast(hostMem) + origin[0]);
- }
-
- // Allocate one if needed.
- if (indirectMapCount_ == 1) {
- if (!allocateMapMemory(owner()->getSize())) {
- decIndMapCount();
- return NULL;
- }
- }
- else {
- // Did the map resource allocation fail?
- if (mapMemory_ == NULL) {
- LogError("Could not map target resource");
- return NULL;
- }
- }
-
- roc::Memory* hsaMapMemory = reinterpret_cast(
- mapMemory_->getDeviceMemory(dev_));
- return reinterpret_cast(hsaMapMemory->getDeviceMemory()) + origin[0];
-}
-
-void
-Memory::decIndMapCount()
-{
- // Map/Unmap must be serialized.
- amd::ScopedLock lock(owner()->lockMemoryOps());
-
- if (indirectMapCount_ == 0) {
- LogError("decIndMapCount() called when indirectMapCount_ already zero");
- return;
- }
-
- // Decrement the counter and release indirect map if it's the last op
- if (--indirectMapCount_ == 0 &&
- mapMemory_ != NULL) {
- if (!dev_.addMapTarget(mapMemory_)) {
- // Release the buffer object containing the map data.
- mapMemory_->release();
- }
- mapMemory_ = nullptr;
- }
-}
-
-void *
-Memory::cpuMap(
- device::VirtualDevice& vDev,
- uint flags,
- uint startLayer,
- uint numLayers,
- size_t* rowPitch,
- size_t* slicePitch)
-{
- // Create the map target.
- void * mapTarget =
- allocMapTarget(amd::Coord3D(0), amd::Coord3D(0), 0, rowPitch, slicePitch);
-
- assert(mapTarget != NULL);
-
- if (!isHostMemDirectAccess()) {
- if (!vDev.blitMgr().readBuffer(
- *this, mapTarget, amd::Coord3D(0), amd::Coord3D(size()), true)) {
- decIndMapCount();
- return NULL;
- }
- }
-
- return mapTarget;
-}
-
-void
-Memory::cpuUnmap(device::VirtualDevice& vDev)
-{
- if (!isHostMemDirectAccess()) {
- if (!vDev.blitMgr().writeBuffer(
- mapMemory_->getHostMem(), *this, amd::Coord3D(0),
- amd::Coord3D(size()), true)) {
- LogError("[OCL] Fail sync the device memory on cpuUnmap");
- }
- }
-
- decIndMapCount();
-}
-
-// Setup an interop buffer (dmabuf handle) as an OpenCL buffer
-bool Memory::createInteropBuffer(GLenum targetType, int miplevel, size_t* metadata_size, const hsa_amd_image_descriptor_t** metadata)
-{
-#if defined(_WIN32)
- return false;
-#else
- assert(owner()->isInterop() && "Object is not an interop object.");
-
- mesa_glinterop_export_in in;
- mesa_glinterop_export_out out;
-
- in.size=sizeof(mesa_glinterop_export_in);
- out.size=sizeof(mesa_glinterop_export_out);
-
- if(owner()->getMemFlags() & CL_MEM_READ_ONLY)
- in.access=MESA_GLINTEROP_ACCESS_READ_ONLY;
- else if(owner()->getMemFlags() & CL_MEM_WRITE_ONLY)
- in.access=MESA_GLINTEROP_ACCESS_WRITE_ONLY;
- else
- in.access=MESA_GLINTEROP_ACCESS_READ_WRITE;
-
- in.target = targetType;
- in.obj=owner()->getInteropObj()->asGLObject()->getGLName();
- in.miplevel=miplevel;
- in.out_driver_data_size=0;
- in.out_driver_data=NULL;
-
- if(!dev_.mesa().Export(in, out))
- return false;
-
- size_t size;
- hsa_agent_t agent=dev_.getBackendDevice();
- hsa_status_t status=hsa_amd_interop_map_buffer(1, &agent, out.dmabuf_fd, 0, &size, &deviceMemory_, metadata_size, (const void**)metadata);
- close(out.dmabuf_fd);
-
- if(status!=HSA_STATUS_SUCCESS)
- return false;
-
- kind_=MEMORY_KIND_INTEROP;
- assert(deviceMemory_!=NULL && "Interop map failed to produce a pointer!");
-
- return true;
-#endif
-}
-
-void Memory::destroyInteropBuffer()
-{
- assert(kind_==MEMORY_KIND_INTEROP && "Memory must be interop type.");
- hsa_amd_interop_unmap_buffer(deviceMemory_);
- deviceMemory_=NULL;
-}
-
-/////////////////////////////////roc::Buffer//////////////////////////////
-
-Buffer::Buffer(const roc::Device &dev, amd::Memory &owner)
- : roc::Memory(dev, owner)
-{}
-
-Buffer::~Buffer()
-{
- destroy();
-}
-
-void
-Buffer::destroy()
-{
- if (owner()->parent() != NULL) {
- return;
- }
-
- if(kind_==MEMORY_KIND_INTEROP)
- {
- destroyInteropBuffer();
- return;
- }
-
- const cl_mem_flags memFlags = owner()->getMemFlags();
-
- if ((deviceMemory_ != nullptr) &&
- (deviceMemory_ != owner()->getHostMem())) {
- // if they are identical, the host pointer will be
- // deallocated later on => avoid double deallocation
- if (isHostMemDirectAccess()) {
- if (memFlags & CL_MEM_USE_HOST_PTR) {
- if (dev_.agent_profile() != HSA_PROFILE_FULL) {
- hsa_amd_memory_unlock(owner()->getHostMem());
- }
- }
- }
- else {
- dev_.deviceLocalFree(deviceMemory_, size());
- }
- }
-
- if (memFlags & CL_MEM_USE_HOST_PTR) {
- if (dev_.agent_profile() == HSA_PROFILE_FULL) {
- hsa_memory_deregister(owner()->getHostMem(), size());
- }
- }
-}
-
-bool
-Buffer::create()
-{
- //Interop buffer
- if(owner()->isInterop())
- return createInteropBuffer(GL_ARRAY_BUFFER, 0, NULL, NULL);
-
- if (owner()->parent()) {
- // Sub-Buffer creation.
- roc::Memory *parentBuffer =
- static_cast(owner()->parent()->getDeviceMemory(dev_));
-
- if (parentBuffer == NULL) {
- LogError("[OCL] Fail to allocate parent buffer");
- return false;
- }
-
- const size_t offset = owner()->getOrigin();
- deviceMemory_ =
- static_cast(parentBuffer->getDeviceMemory()) + offset;
-
- flags_ |= SubMemoryObject;
- flags_ |=
- parentBuffer->isHostMemDirectAccess() ? HostMemoryDirectAccess : 0;
- return true;
- }
-
- // Allocate backing storage in device local memory unless UHP or AHP are set
- const cl_mem_flags memFlags = owner()->getMemFlags();
- if (!(memFlags & (CL_MEM_USE_HOST_PTR |
- CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD))) {
- deviceMemory_ = dev_.deviceLocalAlloc(size());
-
- if (deviceMemory_ == NULL) {
- // TODO: device memory is not enabled yet.
- // Fallback to system memory if exist.
-
- flags_ |= HostMemoryDirectAccess;
- if (dev_.agent_profile() == HSA_PROFILE_FULL &&
- owner()->getHostMem() != NULL) {
- deviceMemory_ = owner()->getHostMem();
- assert(
- amd::isMultipleOf(
- deviceMemory_,
- static_cast(dev_.info().memBaseAddrAlign_)));
- return true;
- }
-
- deviceMemory_ = dev_.hostAlloc(size(), 1, false);
- }
-
- assert(
- amd::isMultipleOf(
- deviceMemory_,
- static_cast(dev_.info().memBaseAddrAlign_)));
-
- if (deviceMemory_ && (memFlags & CL_MEM_COPY_HOST_PTR)) {
- // To avoid recurssive call to Device::createMemory, we perform
- // data transfer to the view of the buffer.
- amd::Buffer *bufferView = new (owner()->getContext()) amd::Buffer(
- *owner(), 0, owner()->getOrigin(), owner()->getSize());
- bufferView->create();
-
- roc::Buffer *devBufferView =
- new roc::Buffer(dev_, *bufferView);
- devBufferView->deviceMemory_ = deviceMemory_;
-
- bufferView->replaceDeviceMemory(&dev_, devBufferView);
-
- bool ret = dev_.xferMgr().writeBuffer(
- owner()->getHostMem(), *devBufferView, amd::Coord3D(0),
- amd::Coord3D(size()), true);
-
- if (!ret) {
- dev_.deviceLocalFree(deviceMemory_, size());
- deviceMemory_ = NULL;
- }
-
- bufferView->release();
- return ret;
- }
-
- return deviceMemory_ != NULL;
- }
- else if (memFlags & CL_MEM_USE_PERSISTENT_MEM_AMD) {
- deviceMemory_ = dev_.hostAlloc(size(), 1, false);
- if (deviceMemory_ != nullptr) {
- if (owner()->getHostMem() != nullptr) {
- memcpy(deviceMemory_, owner()->getHostMem(), size());
- }
- flags_ |= HostMemoryDirectAccess;
- }
- return deviceMemory_ != nullptr;
- }
-
- assert(owner()->getHostMem() != NULL);
-
- flags_ |= HostMemoryDirectAccess;
-
- if (dev_.agent_profile() == HSA_PROFILE_FULL) {
- deviceMemory_ = owner()->getHostMem();
-
- if (memFlags & CL_MEM_USE_HOST_PTR) {
- hsa_memory_register(deviceMemory_, size());
- }
-
- return deviceMemory_ != NULL;
- }
-
- if (owner()->getSvmPtr() != owner()->getHostMem()) {
- if (memFlags & CL_MEM_USE_HOST_PTR) {
- hsa_agent_t agent = dev_.getBackendDevice();
- hsa_status_t status = hsa_amd_memory_lock(
- owner()->getHostMem(), owner()->getSize(), &agent, 1, &deviceMemory_);
- if (status != HSA_STATUS_SUCCESS) {
- deviceMemory_ = nullptr;
- }
- }
- else {
- deviceMemory_ = owner()->getHostMem();
- }
- }
- else {
- deviceMemory_ = owner()->getHostMem();
- }
-
- return deviceMemory_ != NULL;
-}
-
-/////////////////////////////////roc::Image//////////////////////////////
-typedef struct ChannelOrderMap {
- uint32_t cl_channel_order;
- hsa_ext_image_channel_order_t hsa_channel_order;
-} ChannelOrderMap;
-
-typedef struct ChannelTypeMap {
- uint32_t cl_channel_type;
- hsa_ext_image_channel_type_t hsa_channel_type;
-} ChannelTypeMap;
-
-static const ChannelOrderMap kChannelOrderMapping[] = {
- { CL_R, HSA_EXT_IMAGE_CHANNEL_ORDER_R },
- { CL_A, HSA_EXT_IMAGE_CHANNEL_ORDER_A },
- { CL_RG, HSA_EXT_IMAGE_CHANNEL_ORDER_RG },
- { CL_RA, HSA_EXT_IMAGE_CHANNEL_ORDER_RA },
- { CL_RGB, HSA_EXT_IMAGE_CHANNEL_ORDER_RGB },
- { CL_RGBA, HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA },
- { CL_BGRA, HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA },
- { CL_ARGB, HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB },
- { CL_INTENSITY, HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY },
- { CL_LUMINANCE, HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE },
- { CL_Rx, HSA_EXT_IMAGE_CHANNEL_ORDER_RX },
- { CL_RGx, HSA_EXT_IMAGE_CHANNEL_ORDER_RGX },
- { CL_RGBx, HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX },
- { CL_DEPTH, HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH },
- { CL_DEPTH_STENCIL, HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL },
- { CL_sRGB, HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB },
- { CL_sRGBx, HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX },
- { CL_sRGBA, HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA },
- { CL_sBGRA, HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA },
- { CL_ABGR, HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR },
-};
-
-static const ChannelTypeMap kChannelTypeMapping[] = {
- {CL_SNORM_INT8, HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8},
- {CL_SNORM_INT16, HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16},
- {CL_UNORM_INT8, HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8},
- {CL_UNORM_INT16, HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16},
- {CL_UNORM_SHORT_565, HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565},
- {CL_UNORM_SHORT_555, HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555},
- {CL_UNORM_INT_101010, HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010},
- {CL_SIGNED_INT8, HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8},
- {CL_SIGNED_INT16, HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16},
- {CL_SIGNED_INT32, HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32},
- {CL_UNSIGNED_INT8, HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8},
- {CL_UNSIGNED_INT16, HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16},
- {CL_UNSIGNED_INT32, HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32},
- {CL_HALF_FLOAT, HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT},
- {CL_FLOAT, HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT},
- {CL_UNORM_INT24, HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24},
-};
-
-
-static hsa_access_permission_t
-GetHsaAccessPermission(const cl_mem_flags flags) {
- if(flags & CL_MEM_READ_ONLY)
- return HSA_ACCESS_PERMISSION_RO;
- else if(flags & CL_MEM_WRITE_ONLY)
- return HSA_ACCESS_PERMISSION_WO;
- else
- return HSA_ACCESS_PERMISSION_RW;
-}
-
-Image::Image(const roc::Device& dev, amd::Memory& owner) :
- roc::Memory(dev, owner)
-{
- flags_ &= (~HostMemoryDirectAccess & ~HostMemoryRegistered);
- populateImageDescriptor();
- hsaImageObject_.handle = 0;
- hsaImageData_ = NULL;
-}
-
-void
-Image::populateImageDescriptor()
-{
- amd::Image* image = owner()->asImage();
-
- // build HSA runtime image descriptor
- imageDescriptor_.width = image->getWidth();
- imageDescriptor_.height = image->getHeight();
- imageDescriptor_.depth = image->getDepth();
- imageDescriptor_.array_size = 0;
-
- switch (image->getType())
- {
- case CL_MEM_OBJECT_IMAGE1D:
- imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_1D;
- imageDescriptor_.height = 1;
- imageDescriptor_.depth = 1;
- break;
- case CL_MEM_OBJECT_IMAGE1D_BUFFER:
- imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_1DB;
- imageDescriptor_.height = 1;
- imageDescriptor_.depth = 1;
- break;
- case CL_MEM_OBJECT_IMAGE1D_ARRAY:
- //@todo - arraySize = height ?!
- imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_1DA;
- imageDescriptor_.height = 1;
- imageDescriptor_.array_size = image->getHeight();
- break;
- case CL_MEM_OBJECT_IMAGE2D:
- imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_2D;
- imageDescriptor_.depth = 1;
- break;
- case CL_MEM_OBJECT_IMAGE2D_ARRAY:
- //@todo - arraySize = depth ?!
- imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_2DA;
- imageDescriptor_.depth = 1;
- imageDescriptor_.array_size = image->getDepth();
- break;
- case CL_MEM_OBJECT_IMAGE3D:
- imageDescriptor_.geometry = HSA_EXT_IMAGE_GEOMETRY_3D;
- break;
- }
-
- const int kChannelOrderCount =
- sizeof(kChannelOrderMapping) / sizeof(ChannelOrderMap);
- for (int i = 0; i < kChannelOrderCount; i++) {
- if (image->getImageFormat().image_channel_order ==
- kChannelOrderMapping[i].cl_channel_order) {
- imageDescriptor_.format.channel_order =
- kChannelOrderMapping[i].hsa_channel_order;
- break;
- }
- }
-
- const int kChannelTypeCount =
- sizeof(kChannelTypeMapping) / sizeof(ChannelTypeMap);
- for (int i = 0; i < kChannelTypeCount; i++) {
- if (image->getImageFormat().image_channel_data_type ==
- kChannelTypeMapping[i].cl_channel_type) {
- imageDescriptor_.format.channel_type =
- kChannelTypeMapping[i].hsa_channel_type;
- break;
- }
- }
-
- permission_ =
- GetHsaAccessPermission(owner()->getMemFlags());
-}
-
-bool
-Image::createInteropImage()
-{
- auto obj=owner()->getInteropObj()->asGLObject();
- assert(obj->getCLGLObjectType()!=CL_GL_OBJECT_BUFFER && "Non-image OpenGL object used with interop image API.");
-
- const hsa_amd_image_descriptor_t* meta;
- size_t size=0;
-
- GLenum glTarget = obj->getGLTarget();
- if (glTarget == GL_TEXTURE_CUBE_MAP) {
- glTarget = obj->getCubemapFace();
- }
- if(!createInteropBuffer(glTarget, obj->getGLMipLevel(), &size, &meta))
- {
- assert(false && "Failed to map image buffer.");
- return false;
- }
- MAKE_SCOPE_GUARD(BufferGuard, [&](){ destroyInteropBuffer(); });
-
- amdImageDesc_=(hsa_amd_image_descriptor_t*)malloc(size);
- if(amdImageDesc_==NULL)
- return false;
- MAKE_SCOPE_GUARD(DescGuard, [&](){ free(amdImageDesc_); amdImageDesc_=NULL; });
-
- memcpy(amdImageDesc_, meta, size);
-
- image_metadata desc;
- if(!desc.create(amdImageDesc_))
- return false;
-
- if(!desc.setMipLevel(obj->getGLMipLevel()))
- return false;
-
- if (obj->getGLTarget()==GL_TEXTURE_CUBE_MAP)
- desc.setFace(obj->getCubemapFace());
-
- hsaImageData_=deviceMemory_;
-
- hsa_status_t err=hsa_amd_image_create(dev_.getBackendDevice(), &imageDescriptor_, amdImageDesc_, hsaImageData_, permission_, &hsaImageObject_);
- if(err!=HSA_STATUS_SUCCESS)
- return false;
-
- BufferGuard.Dismiss();
- DescGuard.Dismiss();
- return true;
-}
-
-bool
-Image::create()
-{
- if (owner()->parent()) {
- // Image view creation
- roc::Memory *parent =
- static_cast(owner()->parent()->getDeviceMemory(dev_));
-
- if (parent == NULL) {
- LogError("[OCL] Fail to allocate parent image");
- return false;
- }
-
- return createView(*parent);
- }
-
- //Interop image
- if(owner()->isInterop())
- return createInteropImage();
-
- // Get memory size requirement for device specific image.
- hsa_status_t status = hsa_ext_image_data_get_info(
- dev_.getBackendDevice(), &imageDescriptor_,
- permission_, &deviceImageInfo_);
-
- if (status != HSA_STATUS_SUCCESS) {
- LogError("[OCL] Fail to allocate image memory");
- return false;
- }
-
- // roc::Device::hostAlloc and deviceLocalAlloc implementation does not
- // support alignment larger than HSA memory region allocation granularity.
- // In this case, the user manages the alignment.
- const size_t alloc_size =
- (deviceImageInfo_.alignment <= dev_.alloc_granularity())
- ? deviceImageInfo_.size
- : deviceImageInfo_.size + deviceImageInfo_.alignment;
-
- if (!(owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR)) {
- deviceMemory_ = dev_.deviceLocalAlloc(alloc_size);
- }
-
- if (deviceMemory_ == NULL) {
- deviceMemory_ =
- dev_.hostAlloc(alloc_size, 1, false);
- }
-
- hsaImageData_ = reinterpret_cast(
- amd::alignUp(reinterpret_cast(deviceMemory_),
- deviceImageInfo_.alignment));
-
- assert(amd::isMultipleOf(
- hsaImageData_, static_cast(deviceImageInfo_.alignment)));
-
- status = hsa_ext_image_create(
- dev_.getBackendDevice(), &imageDescriptor_, hsaImageData_,
- permission_, &hsaImageObject_);
-
- if (status != HSA_STATUS_SUCCESS) {
- LogError("[OCL] Fail to allocate image memory");
- return false;
- }
-
- return true;
-}
-
-bool
-Image::createView(Memory &parent)
-{
- deviceMemory_ = parent.getDeviceMemory();
-
- hsaImageData_ = (parent.owner()->asBuffer() != NULL)
- ? deviceMemory_
- : static_cast(parent).hsaImageData_;
-
- kind_=parent.getKind();
-
- hsa_status_t status;
- if(kind_==MEMORY_KIND_INTEROP)
- status = hsa_amd_image_create(dev_.getBackendDevice(), &imageDescriptor_, amdImageDesc_, hsaImageData_, permission_, &hsaImageObject_);
- else
- status= hsa_ext_image_create(dev_.getBackendDevice(), &imageDescriptor_, hsaImageData_, permission_, &hsaImageObject_);
-
- if (status != HSA_STATUS_SUCCESS) {
- LogError("[OCL] Fail to allocate image memory");
- return false;
- }
-
- return true;
-}
-
-void*
-Image::allocMapTarget(
- const amd::Coord3D& origin,
- const amd::Coord3D& region,
- uint mapFlags,
- size_t* rowPitch,
- size_t* slicePitch)
-{
- amd::ScopedLock lock(owner()->lockMemoryOps());
-
- incIndMapCount();
-
- void* pHostMem = owner()->getHostMem();
-
- if (pHostMem == NULL) {
- if (indirectMapCount_ == 1) {
- if (!allocateMapMemory(owner()->getSize())) {
- decIndMapCount();
- return NULL;
- }
- }
- else {
- // Did the map resource allocation fail?
- if (mapMemory_ == NULL) {
- LogError("Could not map target resource");
- return NULL;
- }
- }
-
- pHostMem = mapMemory_->getHostMem();
- }
-
- amd::Image* image = owner()->asImage();
-
- size_t elementSize = image->getImageFormat().getElementSize();
-
- size_t offset = origin[0] * elementSize;
-
- // Adjust offset with Y dimension
- offset += image->getRowPitch() * origin[1];
-
- // Adjust offset with Z dimension
- offset += image->getSlicePitch() * origin[2];
-
- *rowPitch = image->getRowPitch();
- if (slicePitch != NULL) {
- *slicePitch = image->getSlicePitch();
- }
-
- return (static_cast(pHostMem)+offset);
-}
-
-Image::~Image()
-{
- destroy();
-}
-
-void
-Image::destroy()
-{
- if (owner()->parent() != NULL) {
- return;
- }
-
- if(kind_==MEMORY_KIND_INTEROP)
- {
- hsa_ext_image_destroy(dev_.getBackendDevice(), hsaImageObject_);
- free(amdImageDesc_);
- amdImageDesc_=NULL;
- destroyInteropBuffer();
- return;
- }
-
- if (deviceMemory_ != NULL) {
- dev_.hostFree(deviceMemory_, deviceImageInfo_.size);
- }
-
- if (hsaImageObject_.handle != 0) {
- hsa_status_t status =
- hsa_ext_image_destroy(dev_.getBackendDevice(), hsaImageObject_);
- assert(status == HSA_STATUS_SUCCESS);
- }
-}
-}
-#endif // WITHOUT_HSA_BACKEND
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp
deleted file mode 100644
index a561ad3f86..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp
+++ /dev/null
@@ -1,188 +0,0 @@
-//
-// Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
-//
-#pragma once
-
-#ifndef WITHOUT_HSA_BACKEND
-
-#include "top.hpp"
-#include "platform/memory.hpp"
-#include "utils/debug.hpp"
-#include "device/rocm/rocdevice.hpp"
-#include "device/rocm/rocglinterop.hpp"
-
-namespace roc {
-class Memory : public device::Memory {
- public:
- enum MEMORY_KIND { MEMORY_KIND_NORMAL=0, MEMORY_KIND_LOCK, MEMORY_KIND_GART, MEMORY_KIND_INTEROP };
-
- Memory(const roc::Device &dev, amd::Memory &owner);
-
- virtual ~Memory();
-
- // Getter for deviceMemory_.
- void *getDeviceMemory() const { return deviceMemory_; }
-
- // Gets a pointer to a region of host-visible memory for use as the target
- // of an indirect map for a given memory object
- virtual void *allocMapTarget(const amd::Coord3D &origin,
- const amd::Coord3D ®ion,
- uint mapFlags,
- size_t *rowPitch,
- size_t *slicePitch);
-
- // Create device memory according to OpenCL memory flag.
- virtual bool create() = 0;
-
- // Pins system memory associated with this memory object.
- virtual bool pinSystemMemory(void *hostPtr, // System memory address
- size_t size // Size of allocated system memory
- ) {
- Unimplemented();
- return true;
- }
-
- // Immediate blocking write from device cache to owners's backing store.
- // Marks owner as "current" by resetting the last writer to NULL.
- virtual void syncHostFromCache(SyncFlags syncFlags = SyncFlags())
- {
- // Need to revisit this when multi-devices is supported.
- }
-
- // Releases indirect map surface
- void releaseIndirectMap() { decIndMapCount(); }
-
- //! Map the device memory to CPU visible
- virtual void* cpuMap(
- device::VirtualDevice& vDev, //!< Virtual device for map operaiton
- uint flags = 0, //!< flags for the map operation
- // Optimization for multilayer map/unmap
- uint startLayer = 0, //!< Start layer for multilayer map
- uint numLayers = 0, //!< End layer for multilayer map
- size_t* rowPitch = NULL,//!< Row pitch for the device memory
- size_t* slicePitch = NULL //!< Slice pitch for the device memory
- );
-
- //! Unmap the device memory
- virtual void cpuUnmap(
- device::VirtualDevice& vDev //!< Virtual device for unmap operaiton
- );
-
- //Mesa has already decomressed if needed and also does acquire at the start of every command batch.
- virtual bool processGLResource(GLResourceOP operation) { return true; }
-
- // Accessors for indirect map memory object
- amd::Memory *mapMemory() const { return mapMemory_; }
-
- MEMORY_KIND getKind() const { return kind_; }
-
- protected:
-
- bool allocateMapMemory(size_t allocationSize);
-
- // Decrement map count
- virtual void decIndMapCount();
-
- // Free / deregister device memory.
- virtual void destroy() = 0;
-
- // Place interop object into HSA's flat address space
- bool createInteropBuffer(GLenum targetType, int miplevel, size_t* metadata_size, const hsa_amd_image_descriptor_t** metadata);
-
- void destroyInteropBuffer();
-
- // Pointer to the device associated with this memory object.
- const roc::Device &dev_;
-
- // Pointer to the device memory. This could be in system or device local mem.
- void* deviceMemory_;
-
- // Track if this memory is interop, lock, gart, or normal.
- MEMORY_KIND kind_;
-
- private:
- // Disable copy constructor
- Memory(const Memory &);
-
- // Disable operator=
- Memory &operator=(const Memory &);
-
-};
-
-class Buffer : public roc::Memory {
- public:
- Buffer(const roc::Device &dev, amd::Memory &owner);
-
- virtual ~Buffer();
-
- // Create device memory according to OpenCL memory flag.
- virtual bool create();
-
- // Recreate the device memory using new size and alignment.
- bool recreate(size_t newSize, size_t newAlignment, bool forceSystem);
-
- private:
- // Disable copy constructor
- Buffer(const Buffer &);
-
- // Disable operator=
- Buffer &operator=(const Buffer &);
-
- // Free device memory.
- void destroy();
-};
-
-class Image : public roc::Memory
-{
-public:
- Image(const roc::Device& dev, amd::Memory& owner);
-
- virtual ~Image();
-
- //! Create device memory according to OpenCL memory flag.
- virtual bool create();
-
- //! Create an image view
- bool createView(Memory &parent);
-
- //! Gets a pointer to a region of host-visible memory for use as the target
- //! of an indirect map for a given memory object
- virtual void* allocMapTarget(
- const amd::Coord3D& origin,
- const amd::Coord3D& region,
- uint mapFlags,
- size_t* rowPitch,
- size_t* slicePitch);
-
- size_t getDeviceDataSize() { return deviceImageInfo_.size; }
- size_t getDeviceDataAlignment() { return deviceImageInfo_.alignment; }
-
- hsa_ext_image_t getHsaImageObject() { return hsaImageObject_; }
- const hsa_ext_image_descriptor_t& getHsaImageDescriptor() const { return imageDescriptor_; }
-private:
- //! Disable copy constructor
- Image(const Buffer&);
-
- //! Disable operator=
- Image& operator=(const Buffer&);
-
- // Setup an interop image
- bool createInteropImage();
-
- // Free / deregister device memory.
- void destroy();
-
- void populateImageDescriptor();
-
- hsa_ext_image_descriptor_t imageDescriptor_;
- hsa_access_permission_t permission_;
- hsa_ext_image_data_info_t deviceImageInfo_;
- hsa_ext_image_t hsaImageObject_;
- hsa_amd_image_descriptor_t* amdImageDesc_;
-
- const void* hsaImageData_;
-};
-
-}
-#endif
-
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocprintf.cpp b/projects/clr/rocclr/runtime/device/rocm/rocprintf.cpp
deleted file mode 100644
index 8efbe57ba8..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rocprintf.cpp
+++ /dev/null
@@ -1,467 +0,0 @@
-//
-// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved.
-//
-
-#include "top.hpp"
-#include "os/os.hpp"
-#include "device/device.hpp"
-#include "device/rocm/rocdefs.hpp"
-#include "device/rocm/rocmemory.hpp"
-#include "device/rocm/rockernel.hpp"
-#include "device/rocm/rocprogram.hpp"
-#include "device/rocm/rocdevice.hpp"
-#include "device/rocm/rocprintf.hpp"
-#include
-#include
-#include
-
-namespace roc {
-
-PrintfDbg::PrintfDbg(Device& device, FILE* file)
- : dbgBuffer_(NULL),
- dbgBuffer_size_(0),
- dbgFile_(file),
- gpuDevice_(device) {}
-
-PrintfDbg::~PrintfDbg() { dev().hostFree(dbgBuffer_, dbgBuffer_size_); }
-
-bool PrintfDbg::allocate(bool realloc) {
- if (NULL == dbgBuffer_) {
- dbgBuffer_size_ = dev().info().printfBufferSize_;
- dbgBuffer_ = reinterpret_cast(
- dev().hostAlloc(dbgBuffer_size_, sizeof(void*)));
- } else if (realloc) {
- LogWarning("Debug buffer reallocation!");
- // Double the buffer size if it's not big enough
- dev().hostFree(dbgBuffer_, dbgBuffer_size_);
- dbgBuffer_size_ = dbgBuffer_size_ << 1;
- dbgBuffer_ = reinterpret_cast(dbgBuffer_size_, sizeof(void*));
- }
-
- return (NULL != dbgBuffer_) ? true : false;
-}
-
-bool PrintfDbg::checkFloat(const std::string& fmt) const {
- switch (fmt[fmt.size() - 1]) {
- case 'e':
- case 'E':
- case 'f':
- case 'g':
- case 'G':
- case 'a':
- return true;
- break;
- default:
- break;
- }
- return false;
-}
-
-bool PrintfDbg::checkString(const std::string& fmt) const {
- if (fmt[fmt.size() - 1] == 's') return true;
- return false;
-}
-
-int PrintfDbg::checkVectorSpecifier(const std::string& fmt, size_t startPos,
- size_t& curPos) const {
- int vectorSize = 0;
- size_t pos = curPos;
- size_t size = curPos - startPos;
-
- if (size >= 3) {
- size = 0;
- // no modifiers
- if (fmt[curPos - 3] == 'v') {
- size = 2;
- }
- // the modifiers are "h" or "l"
- else if (fmt[curPos - 4] == 'v') {
- size = 3;
- }
- // the modifier is "hh"
- else if ((curPos >= 5) && (fmt[curPos - 5] == 'v')) {
- size = 4;
- }
- if (size > 0) {
- curPos = size;
- pos -= curPos;
-
- // Get vector size
- vectorSize = fmt[pos++] - '0';
- // PrintfDbg supports only 2, 3, 4, 8 and 16 wide vectors
- switch (vectorSize) {
- case 1:
- if ((fmt[pos++] - '0') == 6) {
- vectorSize = 16;
- } else {
- vectorSize = 0;
- }
- break;
- case 2:
- case 3:
- case 4:
- case 8:
- break;
- default:
- vectorSize = 0;
- break;
- }
- }
- }
-
- return vectorSize;
-}
-
-static const size_t ConstStr = 0xffffffff;
-static const char Separator[] = ",\0";
-
-size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat,
- size_t size, const uint32_t* argument) const {
- // Serialize the output to the screen
- // amd::ScopedLock k(dev().lockAsyncOps());
-
- size_t copiedBytes = size;
- // Print the string argument, using standard PrintfDbg()
- if (checkString(fmt.c_str())) {
- // copiedBytes should be as number of printed chars
- copiedBytes = 0;
- //(null) should be printed
- if (*argument == 0) {
- amd::Os::printf(fmt.data(), 0);
- // copiedBytes = strlen("(null)")
- copiedBytes = 6;
- } else {
- const unsigned char* argumentStr =
- reinterpret_cast(argument);
- amd::Os::printf(fmt.data(), argumentStr);
- // copiedBytes = strlen(argumentStr)
- while (argumentStr[copiedBytes++] != 0)
- ;
- }
- }
-
- // Print the argument(except for string ), using standard PrintfDbg()
- else {
- bool hlModifier = (strstr(fmt.c_str(), "hl") != NULL);
- std::string hlFmt;
- if (hlModifier) {
- hlFmt = fmt;
- hlFmt.erase(hlFmt.find_first_of("hl"), 2);
- }
- switch (size) {
- case 0: {
- const char* str = reinterpret_cast(argument);
- amd::Os::printf(fmt.data(), str);
- // Find the string length
- while (str[copiedBytes++] != 0)
- ;
- } break;
- case 1:
- amd::Os::printf(fmt.data(),
- *(reinterpret_cast(argument)));
- break;
- case 2:
- case 4:
- if (printFloat) {
- static const char* fSpecifiers = "eEfgGa";
- std::string fmtF = fmt;
- size_t posS = fmtF.find_first_of("%");
- size_t posE = fmtF.find_first_of(fSpecifiers);
- if (posS != std::string::npos && posE != std::string::npos) {
- fmtF.replace(posS + 1, posE - posS, "s");
- }
- float fArg = *(reinterpret_cast(argument));
- float fSign = copysign(1.0, fArg);
- if (isinf(fArg) && !isnan(fArg)) {
- if (fSign < 0) {
- amd::Os::printf(fmtF.data(), "-infinity");
- } else {
- amd::Os::printf(fmtF.data(), "infinity");
- }
- } else if (isnan(fArg)) {
- if (fSign < 0) {
- amd::Os::printf(fmtF.data(), "-nan");
- } else {
- amd::Os::printf(fmtF.data(), "nan");
- }
- } else if (hlModifier) {
- amd::Os::printf(hlFmt.data(), fArg);
- } else {
- amd::Os::printf(fmt.data(), fArg);
- }
- } else {
- bool hhModifier = (strstr(fmt.c_str(), "hh") != NULL);
- if (hhModifier) {
- // current implementation of printf in gcc 4.5.2 runtime libraries,
- // doesn`t recognize "hh" modifier ==>
- // argument should be explicitly converted to unsigned char (uchar)
- // before printing and
- // fmt should be updated not to contain "hh" modifier
- std::string hhFmt = fmt;
- hhFmt.erase(hhFmt.find_first_of("h"), 2);
- amd::Os::printf(
- hhFmt.data(),
- *(reinterpret_cast(argument)));
- } else if (hlModifier) {
- amd::Os::printf(hlFmt.data(), *argument);
- } else {
- amd::Os::printf(fmt.data(), *argument);
- }
- }
- break;
- case 8:
- if (printFloat) {
- if (hlModifier) {
- amd::Os::printf(hlFmt.data(),
- *(reinterpret_cast(argument)));
- } else {
- amd::Os::printf(fmt.data(),
- *(reinterpret_cast(argument)));
- }
- } else {
- std::string out = fmt;
- // Use 'll' for 64 bit printf
- out.insert((out.size() - 1), 1, 'l');
- amd::Os::printf(out.data(),
- *(reinterpret_cast(argument)));
- }
- break;
- case ConstStr: {
- const char* str = reinterpret_cast(argument);
- amd::Os::printf(fmt.data(), str);
- } break;
- default:
- amd::Os::printf("Error: Unsupported data size for PrintfDbg. %d bytes",
- static_cast(size));
- return 0;
- }
- }
- fflush(stdout);
- return copiedBytes;
-}
-
-void PrintfDbg::outputDbgBuffer(const PrintfInfo& info,
- const uint32_t* workitemData, size_t& i) const {
- static const char* specifiers = "cdieEfgGaosuxXp";
- static const char* modifiers = "hl";
- static const char* special = "%n";
- static const std::string sepStr = "%s";
- const uint32_t* s = workitemData;
- size_t pos = 0;
-
- // Find the format string
- std::string str = info.fmtString_;
- std::string fmt;
- size_t posStart, posEnd;
-
- // Print all arguments
- // Note: the following code walks through all arguments, provided by the
- // kernel and
- // finds the corresponding specifier in the format string.
- // Then it splits the original string into substrings with a single specifier
- // and
- // uses standard PrintfDbg() to print each argument
- for (uint j = 0; j < info.arguments_.size(); ++j) {
- do {
- posStart = str.find_first_of("%", pos);
- if (posStart != std::string::npos) {
- posStart++;
- // Erase all spaces after %
- while (str[posStart] == ' ') {
- str.erase(posStart, 1);
- }
- size_t tmp = str.find_first_of(special, posStart);
- size_t tmp2 = str.find_first_of(specifiers, posStart);
- // Special cases. Special symbol is located before any specifier
- if (tmp < tmp2) {
- posEnd = posStart + 1;
- fmt = str.substr(pos, posEnd - pos);
- fmt.erase(posStart - pos - 1, 1);
- pos = posStart = posEnd;
- outputArgument(sepStr, false, ConstStr,
- reinterpret_cast(fmt.data()));
- continue;
- }
- break;
- } else if (pos < str.length()) {
- outputArgument(
- sepStr, false, ConstStr,
- reinterpret_cast((str.substr(pos)).data()));
- }
- } while (posStart != std::string::npos);
-
- if (posStart != std::string::npos) {
- bool printFloat = false;
- int vectorSize = 0;
- size_t length;
- size_t idPos = 0;
-
- // Search for PrintfDbg specifier in the format string.
- // It will be a split point for the output
- posEnd = str.find_first_of(specifiers, posStart);
- if (posEnd == std::string::npos) {
- pos = posStart = posEnd;
- break;
- }
- posEnd++;
-
- size_t curPos = posEnd;
- vectorSize = checkVectorSpecifier(str, posStart, curPos);
-
- // Get substring from the last position to the current specifier
- fmt = str.substr(pos, posEnd - pos);
-
- // Readjust the string pointer if PrintfDbg outputs a vector
- if (vectorSize != 0) {
- size_t posVecSpec = fmt.length() - (curPos + 1);
- size_t posVecMod = fmt.find_first_of(modifiers, posVecSpec + 1);
- size_t posMod = str.find_first_of(modifiers, posStart);
- if (posMod < posEnd) {
- fmt = fmt.erase(posVecSpec, posVecMod - posVecSpec);
- } else {
- fmt = fmt.erase(posVecSpec, curPos);
- }
- idPos = posStart - pos - 1;
- }
- pos = posStart = posEnd;
-
- // Find out if the argument is a float
- printFloat = checkFloat(fmt);
-
- // Is it a scalar value?
- if (vectorSize == 0) {
- length = outputArgument(fmt, printFloat, info.arguments_[j], &s[i]);
- if (0 == length) {
- return;
- }
- i += amd::alignUp(length, sizeof(uint32_t)) / sizeof(uint32_t);
- } else {
- // 3-component vector's size is defined as 4 * size of each scalar
- // component
- size_t elemSize =
- info.arguments_[j] / (vectorSize == 3 ? 4 : vectorSize);
- size_t k = i * sizeof(uint32_t);
- std::string elementStr = fmt.substr(idPos, fmt.size());
-
- // Print first element with full string
- if (0 == outputArgument(fmt, printFloat, elemSize, &s[i])) {
- return;
- }
-
- // Print other elemnts with separator if available
- for (int e = 1; e < vectorSize; ++e) {
- const char* t = reinterpret_cast(s);
- // Output the vector separator
- outputArgument(sepStr, false, ConstStr,
- reinterpret_cast(Separator));
-
- // Output the next element
- outputArgument(
- elementStr, printFloat, elemSize,
- reinterpret_cast(&t[k + e * elemSize]));
- }
- i += (amd::alignUp(info.arguments_[j], sizeof(uint32_t))) /
- sizeof(uint32_t);
- }
- } else {
- amd::Os::printf(
- "Error: The arguments don't match the printf format string. "
- "printf(%s)",
- info.fmtString_.data());
- return;
- }
- }
-
- if (pos != std::string::npos) {
- fmt = str.substr(pos, str.size() - pos);
- outputArgument(sepStr, false, ConstStr,
- reinterpret_cast(fmt.data()));
- }
-}
-
-bool PrintfDbg::init(bool printfEnabled) {
- // Set up debug output buffer (if printf active)
- if (printfEnabled) {
- if (!allocate()) {
- return false;
- }
-
- // The first two DWORDs in the printf buffer are as follows:
- // First DWORD = Offset to where next information is to
- // be written, initialized to 0
- // Second DWORD = Number of bytes available for printf data
- // = buffer size \96 2*sizeof(uint32_t)
- const uint8_t initSize = 2 * sizeof(uint32_t);
- uint8_t sysMem[initSize];
- memset(sysMem, 0, initSize);
- uint32_t dbgBufferSize = dbgBuffer_size_ - initSize;
- memcpy(&sysMem[4], &dbgBufferSize, sizeof(dbgBufferSize));
-
- // Copy offset and number of bytes available for printf data
- // into the corresponding location in the debug buffer
- hsa_status_t err =
- hsa_memory_copy(dbgBuffer_, sysMem, 2 * sizeof(uint32_t));
- if (err != HSA_STATUS_SUCCESS) {
- LogError("\n Can't copy offset and bytes available data to dgbBuffer_!");
- return false;
- }
- }
- return true;
-}
-
-bool PrintfDbg::output(VirtualGPU& gpu, bool printfEnabled,
- const std::vector& printfInfo) {
- if (printfEnabled) {
- uint32_t offsetSize = 0;
-
- // Wait until outstanding kernels finish
- gpu.releaseGpuMemoryFence();
-
- // Get memory pointer to the staged buffer
- uint32_t* dbgBufferPtr = reinterpret_cast(dbgBuffer_);
- if (NULL == dbgBufferPtr) {
- return false;
- }
-
- offsetSize = *dbgBufferPtr;
-
- if (offsetSize == 0) {
- LogError("\n The printf buffer is empty!");
- return false;
- }
-
- // Get a pointer to the buffer data
- dbgBufferPtr =
- reinterpret_cast(dbgBuffer_ + 2 * sizeof(uint32_t));
- if (NULL == dbgBufferPtr) {
- return false;
- }
-
- std::vector::const_iterator ita;
- uint sb = 0;
- uint sbt = 0;
- size_t idx = 1;
-
- // parse the debug buffer
- while (sbt < offsetSize) {
- assert(((*dbgBufferPtr) < printfInfo.size()) &&
- "Cound't find the reported PrintfID!");
- const PrintfInfo& info = printfInfo[(*dbgBufferPtr)];
- sb += sizeof(uint32_t);
- for (ita = info.arguments_.begin(); ita != info.arguments_.end(); ++ita) {
- sb += *ita;
- }
-
- // There's something in the debug buffer
- outputDbgBuffer(info, dbgBufferPtr, idx);
-
- sbt += sb;
- dbgBufferPtr += sb / sizeof(uint32_t);
- sb = 0;
- }
- }
-
- return true;
-}
-
-} // namespace gpu
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocprintf.hpp b/projects/clr/rocclr/runtime/device/rocm/rocprintf.hpp
deleted file mode 100644
index d0c8e10ca8..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rocprintf.hpp
+++ /dev/null
@@ -1,115 +0,0 @@
-//
-// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved.
-//
-#pragma once
-
-/*! \addtogroup GPU GPU Device Implementation
- * @{
- */
-#ifndef isinf
-#ifdef _MSC_VER
-#define isinf(X) (!_finite(X) && !_isnan(X))
-#endif //_MSC_VER
-#endif // isinf
-
-#ifndef isnan
-#ifdef _MSC_VER
-#define isnan(X) (_isnan(X))
-#endif //_MSC_VER
-#endif // isnan
-
-#ifndef copysign
-#ifdef _MSC_VER
-#define copysign(X, Y) (_copysign(X, Y))
-#endif //_MSC_VER
-#endif // copysign
-
-//! GPU Device Implementation
-namespace roc {
-
-//! Printf info structure
-struct PrintfInfo {
- std::string fmtString_; //!< formated string for printf
- std::vector arguments_; //!< passed arguments to the printf() call
-};
-
-class Kernel;
-class VirtualGPU;
-class Device;
-
-class PrintfDbg : public amd::HeapObject {
- public:
- //! Debug buffer size per workitem
- static const uint WorkitemDebugSize = 4096;
-
- //! constructor
- PrintfDbg(Device& device, FILE* file = NULL);
-
- //! Destructor
- ~PrintfDbg();
-
- //! Initializes the debug buffer before kernel's execution
- bool init(bool printfEnabled //!< checks for printf
- );
-
- //! Prints the kernel's debug informaiton from the buffer
- bool output(VirtualGPU& gpu,
- bool printfEnabled, //!< checks for printf
- const std::vector& printfInfo //!< printf info
- );
-
- //! Returns debug buffer object
- address dbgBuffer() const { return dbgBuffer_; }
-
- protected:
- address dbgBuffer_; //!< Buffer to hold debug output
- size_t dbgBuffer_size_; //!< Size of the debugger buffer
- FILE* dbgFile_; //!< Debug file
- Device& gpuDevice_; //!< GPU device object
-
- //! Gets GPU device object
- Device& dev() const { return gpuDevice_; }
-
- //! Allocates the debug buffer
- bool allocate(
- bool realloc = false //!< If TRUE then reallocate the debug memory
- );
-
- //! Returns TRUE if a float value has to be printed
- bool checkFloat(const std::string& fmt //!< Format string
- ) const;
-
- //! Returns TRUE if a string value has to be printed
- bool checkString(const std::string& fmt //!< Format string
- ) const;
-
- //! Finds the specifier in the format string
- int checkVectorSpecifier(const std::string& fmt, //!< Format string
- size_t startPos, //!< Start position for processing
- size_t& curPos //!< End position for processing
- ) const;
-
- //! Outputs an argument
- size_t outputArgument(const std::string& fmt, //!< Format strint
- bool printFloat, //!< Argument is a float value
- size_t size, //!< Argument's size
- const uint32_t* argument //!< Argument's location
- ) const;
-
- //! Displays the PrintfDbg
- void outputDbgBuffer(
- const PrintfInfo& info, //!< printf info
- const uint32_t* workitemData, //!< The PrintfDbg dump buffer
- size_t& i //!< index to the data in the buffer
- ) const;
-
- private:
- //! Disable copy constructor
- PrintfDbg(const PrintfDbg&);
-
- //! Disable assignment
- PrintfDbg& operator=(const PrintfDbg&);
-};
-
-/*@}*/} // namespace roc
-
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp b/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp
deleted file mode 100644
index a442da513e..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp
+++ /dev/null
@@ -1,845 +0,0 @@
-//
-// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
-//
-
-
-#ifndef WITHOUT_HSA_BACKEND
-
-#include "rocprogram.hpp"
-
-#include "compiler/lib/loaders/elf/elf.hpp"
-#include "compiler/lib/utils/options.hpp"
-#include "rockernel.hpp"
-#include "roccompilerlib.hpp"
-#include "utils/bif_section_labels.hpp"
-
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-
-#endif // WITHOUT_HSA_BACKEND
-
-namespace roc {
-#ifndef WITHOUT_HSA_BACKEND
- /* Temporary log function for the compiler library */
- static void logFunction(const char *msg, size_t size) {
- std::cout << "Compiler Library log :" << msg << std::endl;
- }
-
- HSAILProgram::~HSAILProgram() {
- acl_error error;
- // Free the elf binary
- if (binaryElf_ != NULL) {
- error = g_complibApi._aclBinaryFini(binaryElf_);
- if (error != ACL_SUCCESS) {
- LogWarning( "Error while destroying the acl binary \n" );
- }
- }
- // Destroy the executable.
- if (hsaExecutable_.handle != 0) {
- hsa_executable_destroy(hsaExecutable_);
- }
- // Destroy the code object.
- if (hsaProgramCodeObject_.handle != 0) {
- hsa_code_object_destroy(hsaProgramCodeObject_);
- }
- // Destroy the program handle.
- if (hsaProgramHandle_.handle != 0) {
- hsa_ext_program_destroy(hsaProgramHandle_);
- }
- destroyBrigModule();
- destroyBrigContainer();
- releaseClBinary();
- }
-
- HSAILProgram::HSAILProgram(roc::NullDevice& device): device::Program(device),
- llvmBinary_(),
- binaryElf_(NULL),
- device_(device),
- brigModule_(NULL),
- hsaBrigContainer_(NULL)
- {
- memset(&binOpts_, 0, sizeof(binOpts_));
- binOpts_.struct_size = sizeof(binOpts_);
- //binOpts_.elfclass = LP64_SWITCH( ELFCLASS32, ELFCLASS64 );
- //Setting as 32 bit because hsail64 returns an invalid aclTargetInfo
- //when aclGetTargetInfo is called - EPR# 377910
- binOpts_.elfclass = ELFCLASS32;
- binOpts_.bitness = ELFDATA2LSB;
- binOpts_.alloc = &::malloc;
- binOpts_.dealloc = &::free;
- hsaProgramHandle_.handle = 0;
- hsaProgramCodeObject_.handle = 0;
- hsaExecutable_.handle = 0;
- }
-
- bool HSAILProgram::initClBinary(char *binaryIn, size_t size) { // Save the
- // original
- // binary that
- // isn't owned
- // by ClBinary
- clBinary()->saveOrigBinary(binaryIn, size);
-
- char *bin = binaryIn;
- size_t sz = size;
-
- int encryptCode;
-
- char *decryptedBin;
- size_t decryptedSize;
- if (!clBinary()->decryptElf(binaryIn, size,
- &decryptedBin, &decryptedSize, &encryptCode)) {
- return false;
- }
- if (decryptedBin != NULL) {
- // It is decrypted binary.
- bin = decryptedBin;
- sz = decryptedSize;
- }
-
- // Both 32-bit and 64-bit are allowed!
- if (!amd::isElfMagic(bin)) {
- // Invalid binary.
- if (decryptedBin != NULL) {
- delete[]decryptedBin;
- }
- return false;
- }
-
- clBinary()->setFlags(encryptCode);
-
- return clBinary()->setBinary(bin, sz, (decryptedBin != NULL));
- }
-
-
- bool HSAILProgram::initBuild(amd::option::Options *options) {
- compileOptions_ = options->origOptionStr;
-
- if (!device::Program::initBuild(options)) {
- return false;
- }
- // Elf Binary setup
- std::string outFileName;
-
- // true means hsail required
- clBinary()->init(options, true);
- if (options->isDumpFlagSet(amd::option::DUMP_BIF)) {
- outFileName = options->getDumpFileName(".bin");
- }
-
- bool useELF64 = getCompilerOptions()->oVariables->EnableGpuElf64;
- if (!clBinary()->setElfOut(useELF64 ? ELFCLASS64 : ELFCLASS32,
- (outFileName.size() >
- 0) ? outFileName.c_str() : NULL)) {
- LogError("Setup elf out for gpu failed");
- return false;
- }
- return true;
- }
-
- // ! post-compile setup for GPU
- bool HSAILProgram::finiBuild(bool isBuildGood) {
- clBinary()->resetElfOut();
- clBinary()->resetElfIn();
-
- if (!isBuildGood) {
- // Prevent the encrypted binary form leaking out
- clBinary()->setBinary(NULL, 0);
-
- }
-
- return device::Program::finiBuild(isBuildGood);
- }
-
- static char *readFile(std::string source_filename, size_t &size) {
- FILE *fp = ::fopen(source_filename.c_str(), "rb");
- unsigned int length;
- size_t offset = 0;
- char *ptr;
-
- if (!fp) {
- return NULL;
- }
-
- // obtain file size.
- ::fseek(fp, 0, SEEK_END);
- length = ::ftell(fp);
- ::rewind(fp);
-
- ptr = reinterpret_cast(malloc(offset + length + 1));
- if (length != fread(&ptr[offset], 1, length, fp)) {
- free(ptr);
- return NULL;
- }
-
- ptr[offset + length] = '\0';
- size = offset + length;
- ::fclose(fp);
- return ptr;
- }
-
- aclType HSAILProgram::getCompilationStagesFromBinary(std::vector& completeStages, bool& needOptionsCheck)
- {
- acl_error errorCode;
- size_t secSize = 0;
- completeStages.clear();
- aclType from = ACL_TYPE_DEFAULT;
- needOptionsCheck = true;
- size_t boolSize = sizeof(bool);
- //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT?
- // Checking llvmir in .llvmir section
- bool containsLlvmirText = true;
- errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_LLVMIR, NULL, &containsLlvmirText, &boolSize);
- if (errorCode != ACL_SUCCESS) {
- containsLlvmirText = false;
- }
- // Checking compile & link options in .comment section
- bool containsOpts = true;
- errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_OPTIONS, NULL, &containsOpts, &boolSize);
- if (errorCode != ACL_SUCCESS) {
- containsOpts = false;
- }
- if (containsLlvmirText && containsOpts) {
- completeStages.push_back(from);
- from = ACL_TYPE_LLVMIR_BINARY;
- }
- // Checking HSAIL in .cg section
- bool containsHsailText = true;
- errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_HSAIL, NULL, &containsHsailText, &boolSize);
- if (errorCode != ACL_SUCCESS) {
- containsHsailText = false;
- }
- // Checking BRIG sections
- bool containsBrig = true;
- errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_BRIG, NULL, &containsBrig, &boolSize);
- if (errorCode != ACL_SUCCESS) {
- containsBrig = false;
- }
- if (containsBrig) {
- completeStages.push_back(from);
- from = ACL_TYPE_HSAIL_BINARY;
- // Here we should check that CG stage was done.
- // Right now there are 2 criterions to check it (besides BRIG itself):
- // 1. matadata symbols symOpenclKernel for every kernel.
- // 2. HSAIL text in aclCODEGEN section.
- // Unfortunately there is no appropriate way in Compiler Lib to check 1.
- // because kernel names are unknown here, therefore only 2.
- if (containsHsailText) {
- completeStages.push_back(from);
- from = ACL_TYPE_CG;
- }
- }
- else if (containsHsailText) {
- completeStages.push_back(from);
- from = ACL_TYPE_HSAIL_TEXT;
- }
- // Checking ISA in .text section
- bool containsShaderIsa = true;
- errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_CONTAINS_ISA, NULL, &containsShaderIsa, &boolSize);
- if (errorCode != ACL_SUCCESS) {
- containsShaderIsa = false;
- }
- if (containsShaderIsa) {
- completeStages.push_back(from);
- from = ACL_TYPE_ISA;
- }
- std::string sCurOptions = compileOptions_ + linkOptions_;
- amd::option::Options curOptions;
- if (!amd::option::parseAllOptions(sCurOptions, curOptions)) {
- buildLog_ += curOptions.optionsLog();
- LogError("Parsing compile options failed.");
- return ACL_TYPE_DEFAULT;
- }
- switch (from) {
- // compile from HSAIL text, no matter prev. stages and options
- case ACL_TYPE_HSAIL_TEXT:
- needOptionsCheck = false;
- break;
- case ACL_TYPE_HSAIL_BINARY:
- case ACL_TYPE_CG:
- // do not check options, if LLVMIR is absent or might be absent or options are absent
- if (curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) {
- needOptionsCheck = false;
- }
- break;
- case ACL_TYPE_ISA:
- // do not check options, if LLVMIR is absent or might be absent or options are absent
- if (curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) {
- needOptionsCheck = false;
- }
- if (containsBrig && containsHsailText && curOptions.oVariables->BinHSAIL) {
- needOptionsCheck = false;
- // recompile from prev. stage, if BRIG || HSAIL are absent
- } else {
- from = completeStages.back();
- completeStages.pop_back();
- needOptionsCheck = true;
- }
- break;
- // recompilation might be needed
- case ACL_TYPE_LLVMIR_BINARY:
- case ACL_TYPE_DEFAULT:
- default:
- break;
- }
- return from;
- }
-
- aclType HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) {
- aclType continueCompileFrom = ACL_TYPE_DEFAULT;
- binary_t binary = this->binary();
- // If the binary already exists
- if ((binary.first != NULL) && (binary.second > 0)) {
- void *mem = const_cast(binary.first);
- acl_error errorCode;
- binaryElf_ = g_complibApi._aclReadFromMem(mem, binary.second, &errorCode);
- if (errorCode != ACL_SUCCESS) {
- buildLog_ += "Error while BRIG Codegen phase: aclReadFromMem failure \n" ;
- LogWarning("aclReadFromMem failed");
- return continueCompileFrom;
- }
- // Calculate the next stage to compile from, based on sections in binaryElf_;
- // No any validity checks here
- std::vector completeStages;
- bool needOptionsCheck = true;
- continueCompileFrom = getCompilationStagesFromBinary(completeStages, needOptionsCheck);
- // Saving binary in the interface class,
- // which also load compile & link options from binary
- setBinary(static_cast(mem), binary.second);
- if (!options || !needOptionsCheck) {
- return continueCompileFrom;
- }
- bool recompile = false;
- //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT?
- switch (continueCompileFrom) {
- case ACL_TYPE_HSAIL_BINARY:
- case ACL_TYPE_CG:
- case ACL_TYPE_ISA: {
- // Compare options loaded from binary with current ones, recompile if differ;
- // If compile options are absent in binary, do not compare and recompile
- if (compileOptions_.empty())
- break;
- const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions);
- assert(symbol && "symbol not found");
- std::string symName = std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]);
- size_t symSize = 0;
- const void *opts = g_complibApi._aclExtractSymbol(device().compiler(),
- binaryElf_, &symSize, aclCOMMENT, symName.c_str(), &errorCode);
- if (errorCode != ACL_SUCCESS) {
- recompile = true;
- break;
- }
- std::string sBinOptions = std::string((char*)opts, symSize);
- std::string sCurOptions = compileOptions_ + linkOptions_;
- amd::option::Options curOptions, binOptions;
- if (!amd::option::parseAllOptions(sBinOptions, binOptions)) {
- buildLog_ += binOptions.optionsLog();
- LogError("Parsing compile options from binary failed.");
- return ACL_TYPE_DEFAULT;
- }
- if (!amd::option::parseAllOptions(sCurOptions, curOptions)) {
- buildLog_ += curOptions.optionsLog();
- LogError("Parsing compile options failed.");
- return ACL_TYPE_DEFAULT;
- }
- if (!curOptions.equals(binOptions)) {
- recompile = true;
- }
- break;
- }
- default:
- break;
- }
- if (recompile) {
- while (!completeStages.empty()) {
- continueCompileFrom = completeStages.back();
- if (continueCompileFrom == ACL_TYPE_LLVMIR_BINARY ||
- continueCompileFrom == ACL_TYPE_DEFAULT) {
- break;
- }
- completeStages.pop_back();
- }
- }
- }
- return continueCompileFrom;
- }
-
- bool HSAILProgram::saveBinaryAndSetType(type_t type) {
- //Write binary to memory
- void *rawBinary = NULL;
- size_t size;
- if (g_complibApi._aclWriteToMem(binaryElf_, &rawBinary, &size)
- != ACL_SUCCESS) {
- buildLog_ += "Failed to write binary to memory \n";
- return false;
- }
- clBinary()->saveBIFBinary((char*)rawBinary, size);
- //Set the type of binary
- setType(type);
- //Free memory containing rawBinary
- binaryElf_->binOpts.dealloc(rawBinary);
- return true;
- }
-
- bool HSAILProgram::linkImpl(const std::vector &inputPrograms,
- amd::option::Options *options,
- bool createLibrary) {
- std::vector::const_iterator it
- = inputPrograms.begin();
- std::vector::const_iterator itEnd
- = inputPrograms.end();
- acl_error errorCode;
-
- // For each program we need to extract the LLVMIR and create
- // aclBinary for each
- std::vector binaries_to_link;
-
- for (size_t i = 0; it != itEnd; ++it, ++i) {
- HSAILProgram *program = (HSAILProgram *)*it;
- // Check if the program was created with clCreateProgramWIthBinary
- binary_t binary = program->binary();
- if ((binary.first != NULL) && (binary.second > 0)) {
- // Binary already exists -- we can also check if there is no
- // opencl source code
- // Need to check if LLVMIR exists in the binary
- // If LLVMIR does not exist then is it valid
- // We need to pull out all the compiled kernels
- // We cannot do this at present because we need at least
- // Hsail text to pull the kernels oout
- void *mem = const_cast(binary.first);
- binaryElf_ = g_complibApi._aclReadFromMem(mem,
- binary.second,
- &errorCode);
-
- if (errorCode != ACL_SUCCESS) {
- LogWarning("Error while linking : Could not read from raw binary");
- return false;
- }
- }
- // At this stage each HSAILProgram contains a valid binary_elf
- // Check if LLVMIR is in the binary
- size_t boolSize = sizeof(bool);
- bool containsLLLVMIR = false;
- errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_,
- RT_CONTAINS_LLVMIR, NULL, &containsLLLVMIR, &boolSize);
- if (errorCode != ACL_SUCCESS || !containsLLLVMIR) {
- buildLog_ +="Error while linking : Invalid binary (Missing LLVMIR section)";
- return false;
- }
- // Create a new aclBinary for each LLVMIR and save it in a list
- aclBIFVersion ver = g_complibApi._aclBinaryVersion(binaryElf_);
- aclBinary *bin = g_complibApi._aclCreateFromBinary(binaryElf_, ver);
- binaries_to_link.push_back(bin);
- }
-
- // At this stage each HSAILProgram in the list has an aclBinary initialized
- // and contains LLVMIR
- // We can now go ahead and link them.
- if (binaries_to_link.size() > 1) {
- errorCode = g_complibApi._aclLink(device().compiler(),
- binaries_to_link[0],
- binaries_to_link.size() - 1,
- &binaries_to_link[1],
- ACL_TYPE_LLVMIR_BINARY,
- "-create-library",
- NULL);
- }
- else {
- errorCode = g_complibApi._aclLink(device().compiler(),
- binaries_to_link[0],
- 0,
- NULL,
- ACL_TYPE_LLVMIR_BINARY,
- "-create-library",
- NULL);
- }
- if (errorCode != ACL_SUCCESS) {
- buildLog_ += "Failed to link programs";
- return false;
- }
- // Store the newly linked aclBinary for this program.
- binaryElf_ = binaries_to_link[0];
- // Free all the other aclBinaries
- for (size_t i = 1; i < binaries_to_link.size(); i++) {
- g_complibApi._aclBinaryFini(binaries_to_link[i]);
- }
- if (createLibrary) {
- saveBinaryAndSetType(TYPE_LIBRARY);
- return true;
- }
-
- // Now call linkImpl with the new options
- return linkImpl(options);
- }
-
- bool HSAILProgram::initBrigModule() {
- const char *symbol_name = "__BRIG__";
- BrigModuleHeader* brig;
- acl_error error_code;
- size_t size;
- const void* symbol_data = g_complibApi._aclExtractSymbol(
- device().compiler(),
- binaryElf_,
- &size,
- aclBRIG,
- symbol_name,
- &error_code);
- if (error_code != ACL_SUCCESS) {
- std::string error = "Could not find Brig in BIF: ";
- error += symbol_name;
- LogError(error.c_str());
- buildLog_ += error;
- return false;
- }
- brig = (BrigModuleHeader*)malloc(size);
- memcpy(brig, symbol_data, size);
- brigModule_ = brig;
- return true;
- }
- void HSAILProgram::destroyBrigModule() {
- if (brigModule_ != NULL) {
- free(brigModule_);
- }
- }
- bool HSAILProgram::initBrigContainer() {
- assert(brigModule_ != NULL);
-
- //Create a BRIG container
- hsaBrigContainer_ = new BrigContainer(brigModule_);
- if (!hsaBrigContainer_) {
- return false;
- }
- return true;
- }
-
- void HSAILProgram::destroyBrigContainer() {
- delete (hsaBrigContainer_);
- }
-
-
- void HSAILProgram::hsaError(const char *msg, hsa_status_t status) {
- std::string fmsg;
- fmsg += msg;
- if (status != HSA_STATUS_SUCCESS) {
- const char *hmsg = 0;
- hsa_status_string(status, &hmsg);
- if (hmsg) {
- fmsg += ": ";
- fmsg += hmsg;
- }
- }
- LogError(fmsg.c_str());
- buildLog_ += fmsg;
- }
-
- bool HSAILProgram::linkImpl(amd::option::Options *options) {
- acl_error errorCode;
- aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY;
- bool finalize = true;
- // If !binaryElf_ then program must have been created using clCreateProgramWithBinary
- if (!binaryElf_) {
- continueCompileFrom = getNextCompilationStageFromBinary(options);
- }
- switch (continueCompileFrom) {
- // Compilation from ACL_TYPE_LLVMIR_BINARY to ACL_TYPE_CG in cases:
- // 1. if the program is not created with binary;
- // 2. if the program is created with binary and contains only .llvmir & .comment
- // 3. if the program is created with binary, contains .llvmir, .comment, brig sections,
- // but the binary's compile & link options differ from current ones (recompilation);
- case ACL_TYPE_LLVMIR_BINARY:
- // Compilation from ACL_TYPE_HSAIL_BINARY to ACL_TYPE_CG in cases:
- // 1. if the program is created with binary and contains only brig sections
- case ACL_TYPE_HSAIL_BINARY:
- // Compilation from ACL_TYPE_HSAIL_TEXT to ACL_TYPE_CG in cases:
- // 1. if the program is created with binary and contains only hsail text
- case ACL_TYPE_HSAIL_TEXT: {
- std::string curOptions = options->origOptionStr + hsailOptions();
- errorCode = g_complibApi._aclCompile(device().compiler(), binaryElf_,
- curOptions.c_str(), continueCompileFrom, ACL_TYPE_CG, logFunction);
- buildLog_ += g_complibApi._aclGetCompilerLog(device().compiler());
- if (errorCode != ACL_SUCCESS) {
- buildLog_ += "Error while BRIG Codegen phase: compilation error \n" ;
- return false;
- }
- break;
- }
- case ACL_TYPE_CG:
- break;
- case ACL_TYPE_ISA:
- finalize = false;
- break;
- default:
- buildLog_ += "Error while BRIG Codegen phase: the binary is incomplete \n" ;
- return false;
- }
- //Stop compilation if it is an offline device - HSA runtime does not
- //support ISA compiled offline
- if (!dev().isOnline()) {
- return true;
- }
-
- hsa_agent_t hsaDevice = dev().getBackendDevice();
- if (!initBrigModule()) {
- hsaError("Failed to create Brig Module");
- return false;
- }
-
- // Create a BrigContainer.
- if (!initBrigContainer()) {
- hsaError("Failed to create Brig Container");
- return false;
- }
- // Create a program.
- hsa_status_t status = hsa_ext_program_create(
- HSA_MACHINE_MODEL_LARGE,
- HSA_PROFILE_FULL,
- HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO,
- NULL,
- &hsaProgramHandle_
- );
- if (status != HSA_STATUS_SUCCESS) {
- hsaError("Failed to create hsail program", status);
- return false;
- }
-
- // Add module to a program.
- hsa_ext_module_t programModule =
- reinterpret_cast(brigModule_);
- status = hsa_ext_program_add_module(
- hsaProgramHandle_, programModule
- );
- if (status != HSA_STATUS_SUCCESS) {
- hsaError("Failed to add a module to the program", status);
- return false;
- }
-
- // Obtain agent's Isa.
- hsa_isa_t hsaDeviceIsa;
- status = hsa_agent_get_info(
- hsaDevice, HSA_AGENT_INFO_ISA, &hsaDeviceIsa
- );
- if (status != HSA_STATUS_SUCCESS) {
- hsaError("Failed to create hsail program", status);
- return false;
- }
-
- // Finalize a program.
- hsa_ext_control_directives_t hsaControlDirectives;
- memset(&hsaControlDirectives, 0, sizeof(hsa_ext_control_directives_t));
- status = hsa_ext_program_finalize(
- hsaProgramHandle_,
- hsaDeviceIsa,
- 0,
- hsaControlDirectives,
- NULL,
- HSA_CODE_OBJECT_TYPE_PROGRAM,
- &hsaProgramCodeObject_
- );
- if (status != HSA_STATUS_SUCCESS) {
- hsaError("Failed to finalize hsail program", status);
- return false;
- }
-
- // HLC always generates full profile
- hsa_profile_t profile = HSA_PROFILE_FULL;
-
- // Create an executable.
- status = hsa_executable_create(
- profile,
- HSA_EXECUTABLE_STATE_UNFROZEN,
- "",
- &hsaExecutable_
- );
- if (status != HSA_STATUS_SUCCESS) {
- hsaError("Failed to create executable", status);
- return false;
- }
-
- // Load the code object.
- status = hsa_executable_load_code_object(
- hsaExecutable_, hsaDevice, hsaProgramCodeObject_, NULL
- );
- if (status != HSA_STATUS_SUCCESS) {
- hsaError("Failed to load code object", status);
- return false;
- }
-
- // Freeze the executable.
- status = hsa_executable_freeze(hsaExecutable_, NULL);
- if (status != HSA_STATUS_SUCCESS) {
- hsaError("Failed to freeze executable", status);
- return false;
- }
-
- Code first_d = hsaBrigContainer_->code().begin();
- Code last_d = hsaBrigContainer_->code().end();
- //Iterate through the symbols using brig assembler
- for (;first_d != last_d;first_d = first_d.next()) {
- if (DirectiveExecutable de = first_d) {
- // Disable function compilation unconditionally.
- // TODO: May remove this after the finalizer supports function compilation.
- if (DirectiveFunction df = first_d) {
- continue;
- }
-
- std::string kernelName = (SRef)de.name();
- if (de.linkage() != BRIG_LINKAGE_PROGRAM) {
- kernelName.insert(0, "am::");
- }
- // Query symbol handle for this symbol.
- hsa_executable_symbol_t kernelSymbol;
- status = hsa_executable_get_symbol(
- hsaExecutable_, NULL, kernelName.c_str(), hsaDevice, 0, &kernelSymbol
- );
- if (status != HSA_STATUS_SUCCESS) {
- hsaError("Failed to get executable symbol", status);
- return false;
- }
-
- // Query code handle for this symbol.
- uint64_t kernelCodeHandle;
- status = hsa_executable_symbol_get_info(
- kernelSymbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernelCodeHandle
- );
- if (status != HSA_STATUS_SUCCESS) {
- hsaError("Failed to get executable symbol info", status);
- return false;
- }
-
- std::string openclKernelName = kernelName;
- // Strip the opencl and kernel name
- kernelName = kernelName.substr(strlen("&__OpenCL_"), kernelName.size());
- kernelName = kernelName.substr(0,kernelName.size() - strlen("_kernel"));
- aclMetadata md;
- md.numHiddenKernelArgs = 0;
- size_t sizeOfnumHiddenKernelArgs = sizeof(md.numHiddenKernelArgs);
- errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_NUM_KERNEL_HIDDEN_ARGS,
- openclKernelName.c_str(), &md.numHiddenKernelArgs, &sizeOfnumHiddenKernelArgs);
- if (errorCode != ACL_SUCCESS) {
- buildLog_ += "Error while Finalization phase: Kernel extra arguments count querying from the ELF failed\n";
- return false;
- }
-
- uint32_t workgroupGroupSegmentByteSize;
- status = hsa_executable_symbol_get_info(
- kernelSymbol,
- HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
- &workgroupGroupSegmentByteSize);
- if (status != HSA_STATUS_SUCCESS) {
- hsaError("Failed to get group segment size info", status);
- return false;
- }
-
- uint32_t workitemPrivateSegmentByteSize;
- status = hsa_executable_symbol_get_info(
- kernelSymbol,
- HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
- &workitemPrivateSegmentByteSize);
- if (status != HSA_STATUS_SUCCESS) {
- hsaError("Failed to get private segment size info", status);
- return false;
- }
-
- uint32_t kernargSegmentByteSize;
- status = hsa_executable_symbol_get_info(
- kernelSymbol,
- HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
- &kernargSegmentByteSize);
- if (status != HSA_STATUS_SUCCESS) {
- hsaError("Failed to get kernarg segment size info", status);
- return false;
- }
-
- uint32_t kernargSegmentAlignment;
- status = hsa_executable_symbol_get_info(
- kernelSymbol,
- HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
- &kernargSegmentAlignment);
- if (status != HSA_STATUS_SUCCESS) {
- hsaError("Failed to get kernarg segment alignment info", status);
- return false;
- }
-
- Kernel *aKernel = new roc::Kernel(
- kernelName,
- this,
- kernelCodeHandle,
- workgroupGroupSegmentByteSize,
- workitemPrivateSegmentByteSize,
- kernargSegmentByteSize,
- kernargSegmentAlignment,
- md.numHiddenKernelArgs
- );
- if (!aKernel->init()) {
- return false;
- }
- aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
- kernels()[kernelName] = aKernel;
- }
- }
- saveBinaryAndSetType(TYPE_EXECUTABLE);
- buildLog_ += g_complibApi._aclGetCompilerLog(device().compiler());
- return true;
- }
-
- bool HSAILProgram::createBinary(amd::option::Options *options) {
- return false;
- }
-
- bool HSAILProgram::initClBinary() {
- if (clBinary_ == NULL) {
- clBinary_ = new ClBinary(static_cast(device()));
- if (clBinary_ == NULL) {
- return false;
- }
- }
- return true;
- }
-
- void HSAILProgram::releaseClBinary() {
- if (clBinary_ != NULL) {
- delete clBinary_;
- clBinary_ = NULL;
- }
- }
-
- std::string HSAILProgram::hsailOptions() {
- std::string hsailOptions;
- //Set options for the standard device specific options
- //This is just for legacy compiler code
- // All our devices support these options now
- hsailOptions.append(" -DFP_FAST_FMAF=1");
- hsailOptions.append(" -DFP_FAST_FMA=1");
- //TODO: this is a quick fix to restore original f32 denorm flushing
- //Make this target/option dependent
- hsailOptions.append(" -cl-denorms-are-zero");
- //TODO(sramalin) : Query the device for opencl version
- // and only set if -cl-std wasn't specified in
- // original build options (app)
- //hsailOptions.append(" -cl-std=CL1.2");
- //check if the host is 64 bit or 32 bit
- LP64_ONLY(hsailOptions.append(" -m64"));
- //Now append each extension supported by the device
- // one by one
- std::string token;
- std::istringstream iss("");
- iss.str(device().info().extensions_);
- while (getline(iss, token, ' ')) {
- if (!token.empty()) {
- hsailOptions.append(" -D");
- hsailOptions.append(token);
- hsailOptions.append("=1");
- }
- }
- return hsailOptions;
- }
-
-#endif // WITHOUT_HSA_BACKEND
-} // namespace hsa
-
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocprogram.hpp b/projects/clr/rocclr/runtime/device/rocm/rocprogram.hpp
deleted file mode 100644
index 877c64b240..0000000000
--- a/projects/clr/rocclr/runtime/device/rocm/rocprogram.hpp
+++ /dev/null
@@ -1,156 +0,0 @@
-//
-// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
-//
-#pragma once
-
-#ifndef WITHOUT_HSA_BACKEND
-
-#include "rocbinary.hpp"
-#include "roccompilerlib.hpp"
-#include "acl.h"
-#include
-#include
-#include
-#include
-#include "rocdevice.hpp"
-#include "HSAILItems.h"
-
-using namespace HSAIL_ASM;
-//! \namespace roc HSA Device Implementation
-namespace roc {
-
- //! \class empty program
- class HSAILProgram : public device::Program
- {
- friend class ClBinary;
- public:
- //! Default constructor
- HSAILProgram(roc::NullDevice& device);
- //! Default destructor
- ~HSAILProgram();
-
- // Initialize Binary for GPU (used only for clCreateProgramWithBinary()).
- virtual bool initClBinary(char *binaryIn, size_t size);
-
- //! Returns the aclBinary associated with the progrm
- const aclBinary* binaryElf() const {
- return static_cast(binaryElf_); }
-
- const std::string& HsailText() {
- return hsailProgram_;
- }
-
- const NullDevice& dev() const { return device_; }
- //! Returns the hsaBinary associated with the progrm
- hsa_agent_t hsaDevice() const {
- return dev().getBackendDevice();
- }
-
- protected:
- //! log and append to build log an error from runtime
- void hsaError(const char *msg, hsa_status_t status = HSA_STATUS_SUCCESS);
-
- //! pre-compile setup for GPU
- virtual bool initBuild(amd::option::Options* options);
-
- //! post-compile setup for GPU
- virtual bool finiBuild(bool isBuildGood);
-
- /*! \brief Compiles GPU CL program to LLVM binary (compiler frontend)
- *
- * \return True if we successefully compiled a GPU program
- */
- virtual bool compileImpl(
- const std::string& sourceCode, //!< the program's source code
- const std::vector& headers,
- const char** headerIncludeNames,
- amd::option::Options* options //!< compile options's object
- );
-
- /*! \brief Compiles LLVM binary to HSAIL code (compiler backend: link+opt+codegen)
- *
- * \return The build error code
- */
- int compileBinaryToHSAIL(
- amd::option::Options* options //!< options for compilation
- );
-
-
- virtual bool linkImpl(amd::option::Options* options);
-
- //! Link the device programs.
- virtual bool linkImpl (const std::vector& inputPrograms,
- amd::option::Options* options,
- bool createLibrary);
-
- virtual bool createBinary(amd::option::Options* options);
-
- //! Initialize Binary
- virtual bool initClBinary();
-
- //! Release the Binary
- virtual void releaseClBinary();
-
- virtual const aclTargetInfo & info(const char * str = ""){
- return info_;
- }
-
- virtual bool isElf(const char* bin) const {
- return amd::isElfMagic(bin);
- //return false;
- }
-
- //! Returns the binary
- // This should ensure that the binary is updated with all the kernels
- // ClBinary& clBinary() { return binary_; }
- ClBinary* clBinary() {
- return static_cast(device::Program::clBinary());
- }
- const ClBinary* clBinary() const {
- return static_cast(device::Program::clBinary());
- }
- private:
- /* \brief Returns the next stage to compile from, based on sections in binary,
- * also returns completeStages in a vector, which contains at least ACL_TYPE_DEFAULT,
- * sets needOptionsCheck to true if options check is needed to decide whether or not to recompile
- */
- aclType getCompilationStagesFromBinary(std::vector& completeStages, bool& needOptionsCheck);
-
- /* \brief Returns the next stage to compile from, based on sections and options in binary
- */
- aclType getNextCompilationStageFromBinary(amd::option::Options* options);
- bool saveBinaryAndSetType(type_t type);
- bool initBrigContainer();
- void destroyBrigContainer();
- //Initializes BRIG module
- bool initBrigModule();
- void destroyBrigModule();
- //! Disable default copy constructor
- HSAILProgram(const HSAILProgram&);
-
- //! Disable operator=
- HSAILProgram& operator=(const HSAILProgram&);
-
- //! Returns all the options to be appended while passing to the
- //compiler library
- std::string hsailOptions();
-
- std::string openCLSource_; //!< Original OpenCL source
- std::string hsailProgram_; //!< HSAIL program after compilation.
- std::string llvmBinary_; //!< LLVM IR binary code
- //!< aclBinary and aclCompiler - for the compiler libray
- aclBinary* binaryElf_; //!
-#include
-#include
-
-/**
-* HSA image object size in bytes (see HSAIL spec)
-*/
-#define HSA_IMAGE_OBJECT_SIZE 48
-
-/**
-* HSA image object alignment in bytes (see HSAIL spec)
-*/
-#define HSA_IMAGE_OBJECT_ALIGNMENT 16
-
-/**
-* HSA sampler object size in bytes (see HSAIL spec)
-*/
-#define HSA_SAMPLER_OBJECT_SIZE 32
-
-/**
-* HSA sampler object alignment in bytes (see HSAIL spec)
-*/
-#define HSA_SAMPLER_OBJECT_ALIGNMENT 16
-
-namespace roc {
-// (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) invalidates I, K and L1
-// (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE) invalidates L1, L2 and flushes L2
-
-static const uint16_t kDispatchPacketHeaderNoSync =
- (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
- (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
- (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
-
-static const uint16_t kDispatchPacketHeader =
- (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
- (1 << HSA_PACKET_HEADER_BARRIER) |
- (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
- (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
-
-static const uint16_t kBarrierPacketHeader =
- (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
- (1 << HSA_PACKET_HEADER_BARRIER) |
- (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
- (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
-
-static const uint16_t kBarrierPacketAcquireHeader =
- (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
- (1 << HSA_PACKET_HEADER_BARRIER) |
- (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
- (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
-
-static const uint16_t kBarrierPacketReleaseHeader =
- (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
- (1 << HSA_PACKET_HEADER_BARRIER) |
- (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
- (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
-
-static const hsa_barrier_and_packet_t kBarrierAcquirePacket =
- {kBarrierPacketAcquireHeader,0,0,0,0,0,0,0,0,0};
-
-static const hsa_barrier_and_packet_t kBarrierReleasePacket =
- {kBarrierPacketReleaseHeader,0,0,0,0,0,0,0,0,0};
-
-double Timestamp::ticksToTime_=0;
-
-/**
-* Set the ocl correlation handle (essentially the cl_event handle)
-* to correlate the cl kernel launch and HSA kernel dispatch
-*/
-typedef hsa_status_t
- (*hsa_ext_tools_set_correlation_handle)(const hsa_agent_t agent,
- void *correlation_handle);
-static void SetOclCorrelationHandle(void *tools_lib, const hsa_agent_t agent, void *handle) {
- hsa_ext_tools_set_correlation_handle func =
- (hsa_ext_tools_set_correlation_handle)Os::getSymbol(tools_lib, "hsa_ext_tools_set_correlation_handler");
- if (func) {
- func(agent, handle);
- }
-
- return;
-}
-
-bool
-VirtualGPU::MemoryDependency::create(size_t numMemObj)
-{
- if (numMemObj > 0) {
- // Allocate the array of memory objects for dependency tracking
- memObjectsInQueue_ = new MemoryState[numMemObj];
- if (NULL == memObjectsInQueue_) {
- return false;
- }
- memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj);
- maxMemObjectsInQueue_ = numMemObj;
- }
-
- return true;
-}
-
-void
-VirtualGPU::MemoryDependency::validate(
- VirtualGPU& gpu,
- const Memory* memory,
- bool readOnly)
-{
- bool flushL1Cache = false;
-
- if (maxMemObjectsInQueue_ == 0) {
- // Sync AQL packets
- gpu.setAqlHeader(kDispatchPacketHeader);
- return;
- }
-
- uint64_t curStart = reinterpret_cast(memory->getDeviceMemory());
- uint64_t curEnd = curStart + memory->size();
-
- // Loop through all memory objects in the queue and find dependency
- // @note don't include objects from the current kernel
- for (size_t j = 0; j < endMemObjectsInQueue_; ++j) {
- // Check if the queue already contains this mem object and
- // GPU operations aren't readonly
- uint64_t busyStart = memObjectsInQueue_[j].start_;
- uint64_t busyEnd = memObjectsInQueue_[j].end_;
-
- // Check if the start inside the busy region
- if ((((curStart >= busyStart) && (curStart < busyEnd)) ||
- // Check if the end inside the busy region
- ((curEnd > busyStart) && (curEnd <= busyEnd)) ||
- // Check if the start/end cover the busy region
- ((curStart <= busyStart) && (curEnd >= busyEnd))) &&
- // If the buys region was written or the current one is for write
- (!memObjectsInQueue_[j].readOnly_ || !readOnly)) {
- flushL1Cache = true;
- break;
- }
- }
-
- // Did we reach the limit?
- if (maxMemObjectsInQueue_ <= (numMemObjectsInQueue_ + 1)) {
- flushL1Cache = true;
- }
-
- if (flushL1Cache) {
- // Sync AQL packets
- gpu.setAqlHeader(kDispatchPacketHeader);
-
- // Clear memory dependency state
- const static bool All = true;
- clear(!All);
- }
-
- // Insert current memory object into the queue always,
- // since runtime calls flush before kernel execution and it has to keep
- // current kernel in tracking
- memObjectsInQueue_
- [numMemObjectsInQueue_].start_ = curStart;
- memObjectsInQueue_
- [numMemObjectsInQueue_].end_ = curEnd;
- memObjectsInQueue_
- [numMemObjectsInQueue_].readOnly_ = readOnly;
- numMemObjectsInQueue_++;
-}
-
-void
-VirtualGPU::MemoryDependency::clear(bool all)
-{
- if (numMemObjectsInQueue_ > 0) {
- size_t i, j;
- if (all) {
- endMemObjectsInQueue_ = numMemObjectsInQueue_;
- }
-
- // Preserve all objects from the current kernel
- for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) {
- memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_;
- memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_;
- memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_;
- }
- // Clear all objects except current kernel
- memset(&memObjectsInQueue_[i], 0, sizeof(amd::Memory*) * numMemObjectsInQueue_);
- numMemObjectsInQueue_ -= endMemObjectsInQueue_;
- endMemObjectsInQueue_ = 0;
- }
-}
-
-bool
-VirtualGPU::processMemObjects(
- const amd::Kernel& kernel,
- const_address params)
-{
- static const bool NoAlias = true;
- const Kernel& hsaKernel = static_cast
- (*(kernel.getDeviceKernel(dev(), NoAlias)));
- const amd::KernelSignature& signature = kernel.signature();
- const amd::KernelParameters& kernelParams = kernel.parameters();
-
- // AQL packets
- setAqlHeader(kDispatchPacketHeaderNoSync);
-
- // Mark the tracker with a new kernel,
- // so we can avoid checks of the aliased objects
- memoryDependency().newKernel();
-
- bool deviceSupportFGS = 0 != dev().isFineGrainedSystem(true);
- bool supportFineGrainedSystem = deviceSupportFGS;
- FGSStatus status = kernelParams.getSvmSystemPointersSupport();
- switch (status) {
- case FGS_YES:
- if (!deviceSupportFGS) {
- return false;
- }
- supportFineGrainedSystem = true;
- break;
- case FGS_NO:
- supportFineGrainedSystem = false;
- break;
- case FGS_DEFAULT:
- default:
- break;
- }
-
- size_t count = kernelParams.getNumberOfSvmPtr();
- size_t execInfoOffset = kernelParams.getExecInfoOffset();
- bool sync = true;
-
- amd::Memory* memory = NULL;
- //get svm non arugment information
- void* const* svmPtrArray =
- reinterpret_cast(params + execInfoOffset);
- for (size_t i = 0; i < count; i++) {
- memory = amd::SvmManager::FindSvmBuffer(svmPtrArray[i]);
- if (NULL == memory) {
- if (!supportFineGrainedSystem) {
- return false;
- }
- else if (sync) {
- // Sync AQL packets
- setAqlHeader(kDispatchPacketHeader);
- // Clear memory dependency state
- const static bool All = true;
- memoryDependency().clear(!All);
- continue;
- }
- }
- else {
- Memory* gpuMemory = static_cast(memory->getDeviceMemory(dev()));
- if (NULL != gpuMemory) {
- const static bool IsReadOnly = false;
- // Validate SVM passed in the non argument list
- memoryDependency().validate(*this, gpuMemory, IsReadOnly);
- }
- else {
- return false;
- }
- }
- }
-
- // Check all parameters for the current kernel
- for (size_t i = 0; i < signature.numParameters(); ++i) {
- const amd::KernelParameterDescriptor& desc = signature.at(i);
- const HsailKernelArg* arg = hsaKernel.hsailArgAt(i);
- Memory* memory = NULL;
- bool readOnly = false;
- amd::Memory* svmMem = NULL;
-
- // Find if current argument is a buffer
- if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) {
- if (kernelParams.boundToSvmPointer(dev(), params, i)) {
- svmMem = amd::SvmManager::FindSvmBuffer(
- *reinterpret_cast(params + desc.offset_));
- if (!svmMem) {
- // Sync AQL packets
- setAqlHeader(kDispatchPacketHeader);
- // Clear memory dependency state
- const static bool All = true;
- memoryDependency().clear(!All);
- continue;
- }
- }
-
- if (*reinterpret_cast
- (params + desc.offset_) != NULL) {
- if (NULL == svmMem) {
- memory = static_cast((*reinterpret_cast
- (params + desc.offset_))->getDeviceMemory(dev()));
- }
- else {
- memory = static_cast(svmMem->getDeviceMemory(dev()));
- }
- }
-
- if (memory != NULL) {
- // Check image
- readOnly = (desc.accessQualifier_ ==
- CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false;
- // Check buffer
- readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
- // Validate memory for a dependency in the queue
- memoryDependency().validate(*this, memory, readOnly);
- }
- }
- }
-
- return true;
-}
-
-template
-bool
-VirtualGPU::dispatchGenericAqlPacket(
- AqlPacket* packet,
- bool blocking)
-{
- const uint32_t queueSize = gpu_queue_->size;
- const uint32_t queueMask = queueSize - 1;
-
- //Check for queue full and wait if needed.
- uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_);
- uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
- hsa_signal_t signal;
-
- // TODO: placeholder to setup the kernel to populate start and end timestamp.
- if (timestamp_ != nullptr) {
- // Find signal slot
- ProfilingSignal* profilingSignal = &signal_pool_[index & queueMask];
- // Make sure we save the old results in the TS structure
- if (profilingSignal->ts_ != nullptr) {
- profilingSignal->ts_->checkGpuTime();
- }
- // Update the new TS with the signal info
- timestamp_->setProfilingSignal(profilingSignal);
- packet->completion_signal = profilingSignal->signal_;
- profilingSignal->ts_ = timestamp_;
- timestamp_->setAgent(gpu_device_);
- }
-
- if ((index - read) == queueMask) {
- if (packet->completion_signal.handle == 0) {
- packet->completion_signal = barrier_signal_;
- }
- signal = packet->completion_signal;
- // Initialize signal for a wait
- hsa_signal_store_relaxed(signal, InitSignalValue);
- blocking = true;
- }
-
- //Insert packet
- ((AqlPacket*)(
- gpu_queue_->base_address))[index & queueMask] = *packet;
- hsa_queue_store_write_index_release(gpu_queue_, index + 1);
- hsa_signal_store_relaxed(gpu_queue_->doorbell_signal, index);
-
- //Wait on signal ?
- if (blocking) {
- if (hsa_signal_wait_acquire(
- signal, HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1),
- HSA_WAIT_STATE_BLOCKED) != 0) {
- LogPrintfError("Failed signal [0x%lx] wait", signal.handle);
- return false;
- }
-
- // Release the pool, since runtime just drained the entire queue
- resetKernArgPool();
- }
-
- return true;
-}
-
-bool
-VirtualGPU::dispatchAqlPacket(
- hsa_kernel_dispatch_packet_t* packet,
- bool blocking)
-{
- return dispatchGenericAqlPacket(packet, blocking);
-}
-
-bool
-VirtualGPU::dispatchAqlPacket(
- hsa_barrier_and_packet_t* packet,
- bool blocking)
-{
- return dispatchGenericAqlPacket(packet, blocking);
-}
-
-void
-VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet)
-{
- assert(packet->completion_signal.handle != 0);
- const uint32_t queueSize = gpu_queue_->size;
- const uint32_t queueMask = queueSize - 1;
-
- uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_);
- ((hsa_barrier_and_packet_t*)(
- gpu_queue_->base_address))[index&queueMask] = *packet;
-
- hsa_queue_store_write_index_relaxed(gpu_queue_, index + 1);
-
- hsa_signal_store_relaxed(gpu_queue_->doorbell_signal, index);
-}
-
-/**
- * @brief Waits on an outstanding kernel without regard to how
- * it was dispatched - with or without a signal
- *
- * @return bool true if Wait returned successfully, false
- * otherwise
- */
-bool VirtualGPU::releaseGpuMemoryFence() {
- // Return if there is no pending dispatch
- if (!hasPendingDispatch_) {
- return false;
- }
-
- // Initialize signal for the barrier packet.
- hsa_signal_store_relaxed(barrier_signal_, InitSignalValue);
-
- // Dispatch barrier packet into the queue and wait till it finishes.
- dispatchBarrierPacket(&barrier_packet_);
- if (hsa_signal_wait_acquire(
- barrier_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1),
- HSA_WAIT_STATE_BLOCKED) != 0) {
- LogError("Barrier packet submission failed");
- return false;
- }
-
- hasPendingDispatch_ = false;
-
- // Release all memory dependencies
- memoryDependency().clear();
-
- // Release the pool, since runtime just completed a barrier
- resetKernArgPool();
-
- return true;
-}
-
-VirtualGPU::VirtualGPU(Device &device)
- : device::VirtualDevice(device)
- , roc_device_(device)
-{
- gpu_device_ = device.getBackendDevice();
- // Initialize the last signal and dispatch flags
- timestamp_ = NULL;
- hasPendingDispatch_ = false;
- tools_lib_ = NULL;
-
- kernarg_pool_base_ = NULL;
- kernarg_pool_size_ = 0;
- kernarg_pool_cur_offset_ = 0;
- aqlHeader_ = kDispatchPacketHeaderNoSync;
- barrier_signal_.handle = 0;
-}
-
-VirtualGPU::~VirtualGPU()
-{
- if (timestamp_ != NULL) {
- delete timestamp_;
- timestamp_ = NULL;
- LogError("There was a timestamp that was not used; deleting.");
- }
- if (printfdbg_ != NULL){
- delete printfdbg_;
- printfdbg_ = NULL;
- }
-
- tools_lib_ = NULL;
-}
-
-bool
-VirtualGPU::create(bool profilingEna)
-{
- // Set the event handle to the tools lib if the env var
- // Load the library using its advertised "soname"
- std::string lib_name = Os::getEnvironment("HSA_TOOLS_LIB");
- if (lib_name != "") {
-#if defined(_WIN32) || defined(__CYGWIN__)
- const char *tools_lib_name = "hsa-runtime-tools" LP64_SWITCH("", "64") ".dll";
-#else
- const char *tools_lib_name = "libhsa-runtime-tools" LP64_SWITCH("", "64") ".so.1";
-#endif
- tools_lib_ = Os::loadLibrary(tools_lib_name);
- }
-
- uint32_t queue_max_packets = 0;
- if (HSA_STATUS_SUCCESS !=
- hsa_agent_get_info(
- gpu_device_, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_max_packets)) {
- return false;
- }
-
- //Pick a reasonable queue size
- uint32_t queue_size = 1024;
- queue_size = (queue_max_packets < queue_size) ? queue_max_packets : queue_size;
- while (hsa_queue_create(gpu_device_,
- queue_size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT_MAX, UINT_MAX,
- &gpu_queue_) != HSA_STATUS_SUCCESS) {
- queue_size >>= 1;
- if (queue_size < 64) {
- return false;
- }
- }
-
- if (!initPool(dev().settings().kernargPoolSize_, (profilingEna) ? queue_size : 0)) {
- LogError("Couldn't allocate arguments/signals for the queue");
- return false;
- }
-
- device::BlitManager::Setup blitSetup;
- blitMgr_ = new KernelBlitManager(*this, blitSetup);
- if ((NULL == blitMgr_) || !blitMgr_->create(roc_device_)) {
- LogError("Could not create BlitManager!");
- return false;
- }
-
- // Create signal for the barrier packet.
- hsa_signal_t signal = { 0 };
- if (HSA_STATUS_SUCCESS !=
- hsa_signal_create(InitSignalValue, 0, NULL, &signal)) {
- return false;
- }
- barrier_signal_ = signal;
-
- // Initialize barrier packet.
- memset(&barrier_packet_, 0, sizeof(barrier_packet_));
- barrier_packet_.header = kBarrierPacketHeader;
- barrier_packet_.completion_signal = barrier_signal_;
-
- // Create a object of PrintfDbg
- printfdbg_ = new PrintfDbg(roc_device_);
- if (NULL == printfdbg_) {
- LogError("\nCould not create printfDbg Object!");
- return false;
- }
-
- // Initialize timestamp conversion factor
- if (Timestamp::getGpuTicksToTime() == 0) {
- uint64_t frequency;
- hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &frequency);
- Timestamp::setGpuTicksToTime(1e9/double(frequency));
- }
-
- if (!memoryDependency().create(GPU_NUM_MEM_DEPENDENCY)) {
- LogError("Could not create the array of memory objects!");
- return false;
- }
-
- return true;
-}
-
-bool
-VirtualGPU::terminate()
-{
- delete blitMgr_;
-
- // Release the resources of signal
- releaseGpuMemoryFence();
- hsa_status_t err = hsa_queue_destroy(gpu_queue_);
- if (err != HSA_STATUS_SUCCESS) {
- return false;
- }
-
- if (barrier_signal_.handle != 0) {
- hsa_signal_destroy(barrier_signal_);
- }
-
- if (tools_lib_) {
- Os::unloadLibrary(tools_lib_);
- tools_lib_ = NULL;
- }
-
- destroyPool();
-
- return true;
-}
-
-bool
-VirtualGPU::initPool(size_t kernarg_pool_size, uint signal_pool_count)
-{
- kernarg_pool_size_ = kernarg_pool_size;
- kernarg_pool_base_ = reinterpret_cast(
- roc_device_.hostAlloc(kernarg_pool_size_, 1, true));
- if (kernarg_pool_base_ == nullptr) {
- return false;
- }
-
- if (signal_pool_count != 0) {
- signal_pool_.resize(signal_pool_count);
- for (uint i = 0; i < signal_pool_count; ++i) {
- ProfilingSignal profilingSignal;
- if (HSA_STATUS_SUCCESS != hsa_signal_create(
- 0, 0, nullptr, &profilingSignal.signal_)) {
- return false;
- }
- signal_pool_[i] = profilingSignal;
- }
- }
-
- return true;
-}
-
-void
-VirtualGPU::destroyPool() {
- if (kernarg_pool_base_ != nullptr) {
- roc_device_.hostFree(kernarg_pool_base_, kernarg_pool_size_);
- }
-
- if (signal_pool_.size() > 0) {
- for (uint i = 0; i < signal_pool_.size(); ++i) {
- hsa_signal_destroy(signal_pool_[i].signal_);
- }
- }
-}
-
-void*
-VirtualGPU::allocKernArg(size_t size, size_t alignment)
-{
- char* result = nullptr;
- do {
- result = alignUp(kernarg_pool_base_ + kernarg_pool_cur_offset_, alignment);
- const size_t pool_new_usage = (result + size) - kernarg_pool_base_;
- if (pool_new_usage <= kernarg_pool_size_) {
- kernarg_pool_cur_offset_ = pool_new_usage;
- return result;
- }
- else {
- //! We run out of the arguments space!
- //! That means the app didn't call clFlush/clFinish for very long time.
- //! We can issue a barrier to avoid expensive extra memory allocations.
-
- // Initialize signal for the barrier packet.
- hsa_signal_store_relaxed(barrier_signal_, InitSignalValue);
-
- // Dispatch barrier packet into the queue and wait till it finishes.
- dispatchBarrierPacket(&barrier_packet_);
- if (hsa_signal_wait_acquire(
- barrier_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1),
- HSA_WAIT_STATE_BLOCKED) != 0) {
- LogError("Kernel arguments reset failed");
- }
-
- resetKernArgPool();
- }
- } while (true);
-
- return result;
-}
-
-/* profilingBegin, when profiling is enabled, creates a timestamp to save in
-* virtualgpu's timestamp_, and calls start() to get the current host
-* timestamp.
-*/
-void VirtualGPU::profilingBegin(amd::Command &command, bool drmProfiling)
-{
- if (command.profilingInfo().enabled_) {
- if (timestamp_ != NULL) {
- LogWarning("Trying to create a second timestamp in VirtualGPU. \
- This could have unintended consequences.");
- return;
- }
- timestamp_ = new Timestamp;
- timestamp_->start();
- }
-}
-
-/* profilingEnd, when profiling is enabled, checks to see if a signal was
-* created for whatever command we are running and calls end() to get the
-* current host timestamp if no signal is available. It then saves the pointer
-* timestamp_ to the command's data.
-*/
-void VirtualGPU::profilingEnd(amd::Command &command)
-{
- if (command.profilingInfo().enabled_) {
- if (timestamp_->getProfilingSignal() == nullptr) {
- timestamp_->end();
- }
- command.setData(reinterpret_cast(timestamp_));
- timestamp_ = NULL;
- }
-}
-
-struct DestroySampler : public std::binary_function {
- bool operator() (hsa_ext_sampler_t &sampler,
- hsa_agent_t agent) const {
- hsa_status_t status = hsa_ext_sampler_destroy(agent, sampler);
- return status == HSA_STATUS_SUCCESS;
- }
-};
-
-void VirtualGPU::updateCommandsState(amd::Command *list)
-{
- Timestamp *ts = NULL;
-
- amd::Command* current = list;
- amd::Command* next = NULL;
-
- if (current == NULL) {
- return;
- }
-
- uint64_t endTimeStamp = 0;
- uint64_t startTimeStamp = endTimeStamp;
-
- if (current->profilingInfo().enabled_) {
- // TODO: use GPU timestamp when available.
- endTimeStamp = amd::Os::timeNanos();
- startTimeStamp = endTimeStamp;
-
- // This block gets the first valid timestamp from the first command
- // that has one. This timestamp is used below to mark any command that
- // came before it to start and end with this first valid start time.
- current = list;
- while (current != NULL) {
- if (current->data() != NULL) {
- ts = reinterpret_cast(current->data());
- startTimeStamp = ts->getStart();
- endTimeStamp = ts->getStart();
- break;
- }
- current = current->getNext();
- }
- }
-
- // Iterate through the list of commands, and set timestamps as appropriate
- // Note, if a command does not have a timestamp, it does one of two things:
- // - if the command (without a timestamp), A, precedes another command, C,
- // that _does_ contain a valid timestamp, command A will set RUNNING and
- // COMPLETE with the RUNNING (start) timestamp from command C. This would
- // also be true for command B, which is between A and C. These timestamps
- // are actually retrieved in the block above (startTimeStamp, endTimeStamp).
- // - if the command (without a timestamp), C, follows another command, A,
- // that has a valid timestamp, command C will be set RUNNING and COMPLETE
- // with the COMPLETE (end) timestamp of the previous command, A. This is
- // also true for any command B, which falls between A and C.
- current = list;
- while (current != NULL) {
- if (current->profilingInfo().enabled_) {
- if (current->data() != NULL) {
- // Since this is a valid command to get a timestamp, we use the
- // timestamp provided by the runtime (saved in the data())
- ts = reinterpret_cast(current->data());
- startTimeStamp = ts->getStart();
- endTimeStamp = ts->getEnd();
- delete ts;
- current->setData(NULL);
- }
- else {
- // If we don't have a command that contains a valid timestamp,
- // we simply use the end timestamp of the previous command.
- // Note, if this is a command before the first valid timestamp,
- // this will be equal to the start timestamp of the first valid
- // timestamp at this point.
- startTimeStamp = endTimeStamp;
- }
- }
-
- if (current->status() == CL_SUBMITTED) {
- current->setStatus(CL_RUNNING, startTimeStamp);
- current->setStatus(CL_COMPLETE, endTimeStamp);
- }
- else if (current->status() != CL_COMPLETE) {
- LogPrintfError("Unexpected command status - %d.", current->status());
- }
-
- next = current->getNext();
- current->release();
- current = next;
- }
-
- // Release the sampler handles allocated for the various
- // on one or more kernel submissions
- std::for_each(samplerList_.begin(),
- samplerList_.end(),
- std::bind2nd(DestroySampler(), gpu_device_));
- samplerList_.clear();
-
- return;
-}
-
-void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand &cmd)
-{
- // Wait on a kernel if one is outstanding
- releaseGpuMemoryFence();
-
- profilingBegin(cmd);
-
- size_t offset = 0;
- // Find if virtual address is a CL allocation
- device::Memory* hostMemory = dev().findMemoryFromVA(cmd.destination(), &offset);
-
- device::Memory *devMem = cmd.source().getDeviceMemory(dev());
- void *dst = cmd.destination();
- amd::Coord3D size = cmd.size();
-
- //! @todo: add multi-devices synchronization when supported.
-
- cl_command_type type = cmd.type();
- bool result = false;
- bool imageBuffer = false;
-
- // Force buffer read for IMAGE1D_BUFFER
- if ((type == CL_COMMAND_READ_IMAGE) &&
- (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
- type = CL_COMMAND_READ_BUFFER;
- imageBuffer = true;
- }
-
- switch (type) {
- case CL_COMMAND_READ_BUFFER: {
- amd::Coord3D origin(cmd.origin()[0]);
- if (imageBuffer) {
- size_t elemSize =
- cmd.source().asImage()->getImageFormat().getElementSize();
- origin.c[0] *= elemSize;
- size.c[0] *= elemSize;
- }
- if (hostMemory != nullptr) {
- // Accelerated transfer without pinning
- amd::Coord3D dstOrigin(offset);
- result = blitMgr().copyBuffer(*devMem, *hostMemory,
- origin, dstOrigin, size, cmd.isEntireMemory());
- }
- else {
- result = blitMgr().readBuffer(
- *devMem, dst, origin, size,
- cmd.isEntireMemory());
- }
- break;
- }
- case CL_COMMAND_READ_BUFFER_RECT: {
- result = blitMgr().readBufferRect(
- *devMem, dst, cmd.bufRect(), cmd.hostRect(), size,
- cmd.isEntireMemory());
- break;
- }
- case CL_COMMAND_READ_IMAGE: {
- result = blitMgr().readImage(
- *devMem, dst, cmd.origin(), size, cmd.rowPitch(),
- cmd.slicePitch(), cmd.isEntireMemory());
- break;
- }
- default:
- ShouldNotReachHere();
- break;
- }
-
- if (!result) {
- LogError("submitReadMemory failed!");
- cmd.setStatus(CL_OUT_OF_RESOURCES);
- }
-
- profilingEnd(cmd);
-}
-
-void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand &cmd)
-{
- // Wait on a kernel if one is outstanding
- releaseGpuMemoryFence();
-
- profilingBegin(cmd);
-
- size_t offset = 0;
- // Find if virtual address is a CL allocation
- device::Memory* hostMemory = dev().findMemoryFromVA(cmd.source(), &offset);
-
- device::Memory *devMem = cmd.destination().getDeviceMemory(dev());
- const char *src = static_cast(cmd.source());
- amd::Coord3D size = cmd.size();
-
- //! @todo add multi-devices synchronization when supported.
-
- cl_command_type type = cmd.type();
- bool result = false;
- bool imageBuffer = false;
-
- // Force buffer write for IMAGE1D_BUFFER
- if ((type == CL_COMMAND_WRITE_IMAGE) &&
- (cmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
- type = CL_COMMAND_WRITE_BUFFER;
- imageBuffer = true;
- }
-
- switch (type) {
- case CL_COMMAND_WRITE_BUFFER: {
- amd::Coord3D origin(cmd.origin()[0]);
- if (imageBuffer) {
- size_t elemSize =
- cmd.destination().asImage()->getImageFormat().getElementSize();
- origin.c[0] *= elemSize;
- size.c[0] *= elemSize;
- }
- if (hostMemory != nullptr) {
- // Accelerated transfer without pinning
- amd::Coord3D srcOrigin(offset);
- result = blitMgr().copyBuffer(*hostMemory, *devMem,
- srcOrigin, origin, size, cmd.isEntireMemory());
- }
- else {
- result = blitMgr().writeBuffer(
- src, *devMem , origin, size,
- cmd.isEntireMemory());
- }
- break;
- }
- case CL_COMMAND_WRITE_BUFFER_RECT: {
- result = blitMgr().writeBufferRect(
- src, *devMem, cmd.hostRect(), cmd.bufRect(), size,
- cmd.isEntireMemory());
- break;
- }
- case CL_COMMAND_WRITE_IMAGE: {
- result = blitMgr().writeImage(
- src, *devMem, cmd.origin(), size, cmd.rowPitch(),
- cmd.slicePitch(), cmd.isEntireMemory());
- break;
- }
- default:
- ShouldNotReachHere();
- break;
- }
-
- if (!result) {
- LogError("submitWriteMemory failed!");
- cmd.setStatus(CL_OUT_OF_RESOURCES);
- }
- else {
- cmd.destination().signalWrite(&dev());
- }
-
- profilingEnd(cmd);
-}
-
-void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd)
-{
- // in-order semantics: previous commands need to be done before we start
- releaseGpuMemoryFence();
-
- profilingBegin(cmd);
- const std::vector& svmPointers = cmd.svmPointers();
- if (cmd.pfnFreeFunc() == NULL) {
- // pointers allocated using clSVMAlloc
- for (cl_uint i = 0; i < svmPointers.size(); i++) {
- amd::SvmBuffer::free(cmd.context(), svmPointers[i]);
- }
- }
- else {
- cmd.pfnFreeFunc()(as_cl(cmd.queue()->asCommandQueue()), svmPointers.size(),
- (void**) (&(svmPointers[0])), cmd.userData());
- }
- profilingEnd(cmd);
-}
-
-void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd)
-{
- // in-order semantics: previous commands need to be done before we start
- releaseGpuMemoryFence();
- profilingBegin(cmd);
- amd::SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1);
- profilingEnd(cmd);
-}
-
-void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd)
-{
- // in-order semantics: previous commands need to be done before we start
- releaseGpuMemoryFence();
- profilingBegin(cmd);
- SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times());
- profilingEnd(cmd);
-}
-
-void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand &cmd)
-{
- // Wait on a kernel if one is outstanding
- releaseGpuMemoryFence();
-
- profilingBegin(cmd);
-
- device::Memory *srcDevMem = cmd.source().getDeviceMemory(dev());
- device::Memory *destDevMem = cmd.destination().getDeviceMemory(dev());
- amd::Coord3D size = cmd.size();
-
- //! @todo add multi-devices synchronization when supported.
-
- cl_command_type type = cmd.type();
- bool result = false;
- bool srcImageBuffer = false;
- bool dstImageBuffer = false;
-
- // Force buffer copy for IMAGE1D_BUFFER
- if (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
- srcImageBuffer = true;
- type = CL_COMMAND_COPY_BUFFER;
- }
- if (cmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
- dstImageBuffer = true;
- type = CL_COMMAND_COPY_BUFFER;
- }
-
- switch (cmd.type()) {
- case CL_COMMAND_COPY_BUFFER: {
- amd::Coord3D srcOrigin(cmd.srcOrigin()[0]);
- amd::Coord3D dstOrigin(cmd.dstOrigin()[0]);
-
- if (srcImageBuffer) {
- const size_t elemSize =
- cmd.source().asImage()->getImageFormat().getElementSize();
- srcOrigin.c[0] *= elemSize;
- if (dstImageBuffer) {
- dstOrigin.c[0] *= elemSize;
- }
- size.c[0] *= elemSize;
- }
- else if (dstImageBuffer) {
- const size_t elemSize =
- cmd.destination().asImage()->getImageFormat().getElementSize();
- dstOrigin.c[0] *= elemSize;
- size.c[0] *= elemSize;
- }
-
- result = blitMgr().copyBuffer(
- *srcDevMem, *destDevMem, srcOrigin,
- dstOrigin, size, cmd.isEntireMemory());
- break;
- }
- case CL_COMMAND_COPY_BUFFER_RECT: {
- result = blitMgr().copyBufferRect(
- *srcDevMem, *destDevMem, cmd.srcRect(),
- cmd.dstRect(), size, cmd.isEntireMemory());
- break;
- }
- case CL_COMMAND_COPY_IMAGE: {
- result = blitMgr().copyImage(
- *srcDevMem, *destDevMem, cmd.srcOrigin(),
- cmd.dstOrigin(), size, cmd.isEntireMemory());
- break;
- }
- case CL_COMMAND_COPY_IMAGE_TO_BUFFER: {
- result = blitMgr().copyImageToBuffer(
- *srcDevMem, *destDevMem, cmd.srcOrigin(),
- cmd.dstOrigin(), size, cmd.isEntireMemory());
- break;
- }
- case CL_COMMAND_COPY_BUFFER_TO_IMAGE: {
- result = blitMgr().copyBufferToImage(
- *srcDevMem, *destDevMem, cmd.srcOrigin(),
- cmd.dstOrigin(), size, cmd.isEntireMemory());
- break;
- }
- default:
- ShouldNotReachHere();
- break;
- }
-
- if (!result) {
- LogError("submitCopyMemory failed!");
- cmd.setStatus(CL_OUT_OF_RESOURCES);
- }
-
- cmd.destination().signalWrite(&dev());
-
- profilingEnd(cmd);
-}
-
-void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd)
-{
- // No fence is needed since this is a no-op: the
- // command will be completed only after all the
- // previous commands are complete
- profilingBegin(cmd);
- profilingEnd(cmd);
-}
-
-void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd)
-{
- // No fence is needed since this is a no-op: the
- // command will be completed only after all the
- // previous commands are complete
- profilingBegin(cmd);
- profilingEnd(cmd);
-}
-
-void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd)
-{
- // Wait on a kernel if one is outstanding
- releaseGpuMemoryFence();
-
- profilingBegin(cmd);
-
- //! @todo add multi-devices synchronization when supported.
-
- roc::Memory *devMemory = reinterpret_cast(
- cmd.memory().getDeviceMemory(dev(), false));
-
- cl_command_type type = cmd.type();
- bool imageBuffer = false;
-
- // Save map requirement.
- cl_map_flags mapFlag = cmd.mapFlags();
-
- // Treat no map flag as read-write.
- if (mapFlag == 0) {
- mapFlag = CL_MAP_READ | CL_MAP_WRITE;
- }
-
- devMemory->saveMapInfo(cmd.mapPtr(), cmd.origin(), cmd.size(),
- mapFlag, cmd.isEntireMemory());
-
- // Sync to the map target.
- if (devMemory->isHostMemDirectAccess()) {
- // Add memory to VA cache, so rutnime can detect direct access to VA
- dev().addVACache(devMemory);
- }
- if ((!devMemory->isHostMemDirectAccess()) &&
- (mapFlag & (CL_MAP_READ | CL_MAP_WRITE))) {
- bool result = false;
- roc::Memory *hsaMemory = static_cast(devMemory);
-
- amd::Memory* mapMemory = hsaMemory->mapMemory();
- void *hostPtr = mapMemory == NULL ?
- hsaMemory->owner()->getHostMem() :
- mapMemory->getHostMem();
-
- if (type == CL_COMMAND_MAP_BUFFER) {
- amd::Coord3D origin(cmd.origin()[0]);
- amd::Coord3D size(cmd.size()[0]);
- amd::Coord3D dstOrigin(cmd.origin()[0], 0, 0);
- if (imageBuffer) {
- size_t elemSize =
- cmd.memory().asImage()->getImageFormat().getElementSize();
- origin.c[0] *= elemSize;
- size.c[0] *= elemSize;
- }
-
- if (mapMemory != nullptr) {
- roc::Memory *hsaMapMemory = static_cast(
- mapMemory->getDeviceMemory(dev(), false));
- result = blitMgr().copyBuffer(*hsaMemory,
- *hsaMapMemory, origin, dstOrigin,
- size, cmd.isEntireMemory());
- }
- else {
- result = blitMgr().readBuffer(
- *hsaMemory, static_cast(hostPtr)+origin[0],
- origin, size, cmd.isEntireMemory());
- }
-
- }
- else if (type == CL_COMMAND_MAP_IMAGE) {
- amd::Image* image = cmd.memory().asImage();
- result = blitMgr().readImage(
- *hsaMemory, hostPtr, amd::Coord3D(0),
- image->getRegion(), image->getRowPitch(),
- image->getSlicePitch(), true);
- }
- else {
- ShouldNotReachHere();
- }
-
- if (!result) {
- LogError("submitMapMemory failed!");
- cmd.setStatus(CL_OUT_OF_RESOURCES);
- }
- }
-
- profilingEnd(cmd);
-}
-
-void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand &cmd)
-{
- roc::Memory* devMemory = static_cast(
- cmd.memory().getDeviceMemory(dev(), false));
-
- const device::Memory::WriteMapInfo* mapInfo =
- devMemory->writeMapInfo(cmd.mapPtr());
- if (nullptr == mapInfo) {
- LogError("Unmap without map call");
- return;
- }
- // Wait on a kernel if one is outstanding
- releaseGpuMemoryFence();
- profilingBegin(cmd);
-
- // Force buffer write for IMAGE1D_BUFFER
- bool imageBuffer = (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER);
-
- if (devMemory->isHostMemDirectAccess()) {
- // Remove memory from VA cache
- dev().removeVACache(devMemory);
- }
- if (mapInfo->isUnmapWrite()) {
- // Commit the changes made by the user.
- if (!devMemory->isHostMemDirectAccess()) {
- bool result = false;
-
- if (cmd.memory().asImage() && !imageBuffer) {
- amd::Image *image = cmd.memory().asImage();
- result = blitMgr().writeImage(
- cmd.mapPtr(), *devMemory,
- mapInfo->origin_,
- mapInfo->region_,
- image->getRowPitch(), image->getSlicePitch());
- }
- else {
- amd::Coord3D origin(mapInfo->origin_[0]);
- amd::Coord3D size(mapInfo->region_[0]);
- if (imageBuffer) {
- size_t elemSize =
- cmd.memory().asImage()->getImageFormat().getElementSize();
- origin.c[0] *= elemSize;
- size.c[0] *= elemSize;
- }
- if (devMemory->mapMemory() != nullptr) {
- roc::Memory *mapMemory = static_cast(
- devMemory->mapMemory()->getDeviceMemory(dev(), false));
-
- result = blitMgr().copyBuffer(
- *mapMemory, *devMemory,
- mapInfo->origin_,
- mapInfo->origin_,
- mapInfo->region_,
- mapInfo->isEntire());
- }
- else {
- result = blitMgr().writeBuffer(
- cmd.mapPtr(), *devMemory, origin, size);
- }
- }
- if (!result) {
- LogError("submitMapMemory failed!");
- cmd.setStatus(CL_OUT_OF_RESOURCES);
- }
- }
-
- cmd.memory().signalWrite(&dev());
- }
-
- devMemory->clearUnmapInfo(cmd.mapPtr());
-
- profilingEnd(cmd);
-}
-
-void VirtualGPU::submitFillMemory(amd::FillMemoryCommand &cmd)
-{
- // Wait on a kernel if one is outstanding
- releaseGpuMemoryFence();
-
- profilingBegin(cmd);
-
- device::Memory *devMemory = cmd.memory().getDeviceMemory(dev(), false);
-
- //! @todo add multi-devices synchronization when supported.
-
- cl_command_type type = cmd.type();
- bool result = false;
- bool imageBuffer = false;
- float fillValue[4];
-
- // Force fill buffer for IMAGE1D_BUFFER
- if ((type == CL_COMMAND_FILL_IMAGE) &&
- (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
- type = CL_COMMAND_FILL_BUFFER;
- imageBuffer = true;
- }
-
- // Find the the right fill operation
- switch (type) {
- case CL_COMMAND_FILL_BUFFER: {
- const void* pattern = cmd.pattern();
- size_t patternSize = cmd.patternSize();
- amd::Coord3D origin(cmd.origin()[0]);
- amd::Coord3D size(cmd.size()[0]);
- // Reprogram fill parameters if it's an IMAGE1D_BUFFER object
- if (imageBuffer) {
- size_t elemSize =
- cmd.memory().asImage()->getImageFormat().getElementSize();
- origin.c[0] *= elemSize;
- size.c[0] *= elemSize;
- memset(fillValue, 0, sizeof(fillValue));
- cmd.memory().asImage()->getImageFormat().formatColor(pattern,
- fillValue);
- pattern = fillValue;
- patternSize = elemSize;
- }
- result = blitMgr().fillBuffer(
- *devMemory, pattern, patternSize, origin, size,
- cmd.isEntireMemory());
- break;
- }
- case CL_COMMAND_FILL_IMAGE: {
- result = blitMgr().fillImage(
- *devMemory, cmd.pattern(), cmd.origin(), cmd.size(),
- cmd.isEntireMemory());
- break;
- }
- default:
- ShouldNotReachHere();
- break;
- }
-
- if (!result) {
- LogError("submitFillMemory failed!");
- cmd.setStatus(CL_OUT_OF_RESOURCES);
- }
-
- cmd.memory().signalWrite(&dev());
-
- profilingEnd(cmd);
-}
-
-void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand &vcmd)
-{
- // Wait on a kernel if one is outstanding
- releaseGpuMemoryFence();
-
- profilingBegin(vcmd);
-
- std::vector::const_iterator itr;
-
- for (itr = vcmd.memObjects().begin();
- itr != vcmd.memObjects().end();
- itr++) {
- // Find device memory
- device::Memory *m = (*itr)->getDeviceMemory(dev());
- roc::Memory *memory = static_cast(m);
-
- if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) {
- //! @todo revisit this when multi devices is supported.
- } else if (vcmd.migrationFlags() &
- CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) {
- //! @todo revisit this when multi devices is supported.
- } else {
- LogWarning("Unknown operation for memory migration!");
- }
- }
-
- profilingEnd(vcmd);
-}
-
-/*! \brief Writes to the buffer and incrememts the write pointer to the
- * buffer. Also, ensures that the argument is written to an
- * aligned memory as specified
- *
- * @param dst The write pointer to the buffer
- * @param src The source pointer
- * @param size The size in bytes to copy
- * @param alignment The alignment to follow while writing to the buffer
- */
-static void
-addArg(unsigned char** dst, const void* src,
- size_t size, uint32_t alignment)
-{
- *dst = amd::alignUp(*dst, alignment);
- memcpy(*dst, src, size);
- *dst += size;
-}
-
-static inline void
-addArg(unsigned char** dst, const void* src, size_t size)
-{
- assert(size < UINT32_MAX);
- addArg(dst, src, size, size);
-}
-
- //Over rides the workgroup size fields in the packet with runtime/compiler set sizes
- void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
- amd::NDRangeContainer sizes,
- const size_t* compile_size,
- const roc::Device &dev){
- //Todo (sramalin) need to check if compile_size is set to 0 if dimension is not valid
- // else this error check is incorrect
- if (compile_size[0] || compile_size[1] || compile_size[2]) {
- dispatchPacket.workgroup_size_x = sizes.dimensions()>0 ? compile_size[0] : 1;
- dispatchPacket.workgroup_size_y = sizes.dimensions()>1 ? compile_size[1] : 1;
- dispatchPacket.workgroup_size_z = sizes.dimensions()>2 ? compile_size[2] : 1;
- }
- else {
- //Runtime must set the group size
- dispatchPacket.workgroup_size_x = 1;
- dispatchPacket.workgroup_size_y = 1;
- dispatchPacket.workgroup_size_z = 1;
-
- if (sizes.dimensions() == 1) {
- dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize_;
- }
- else if (sizes.dimensions() == 2) {
- dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize2DX_;
- dispatchPacket.workgroup_size_y = dev.settings().maxWorkGroupSize2DY_;
- }
- else if (sizes.dimensions() == 3) {
- dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize3DX_;
- dispatchPacket.workgroup_size_y = dev.settings().maxWorkGroupSize3DY_;
- dispatchPacket.workgroup_size_z = dev.settings().maxWorkGroupSize3DZ_;
- }
- }
-}
-
- static void
-fillSampleDescriptor(
- hsa_ext_sampler_descriptor_t& samplerDescriptor,
- const amd::Sampler& sampler)
- {
- samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST ?
- HSA_EXT_SAMPLER_FILTER_MODE_NEAREST : HSA_EXT_SAMPLER_FILTER_MODE_LINEAR;
- samplerDescriptor.coordinate_mode = sampler.normalizedCoords() ?
- HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED :
- HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
- switch (sampler.addressingMode()) {
- case CL_ADDRESS_CLAMP_TO_EDGE:
- samplerDescriptor.address_mode =
- HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
- break;
- case CL_ADDRESS_REPEAT:
- samplerDescriptor.address_mode =
- HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT;
- break;
- case CL_ADDRESS_CLAMP:
- samplerDescriptor.address_mode =
- HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER;
- break;
- case CL_ADDRESS_MIRRORED_REPEAT:
- samplerDescriptor.address_mode =
- HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT;
- break;
- case CL_ADDRESS_NONE:
- samplerDescriptor.address_mode =
- HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED;
- break;
- default:
- return;
- }
- }
-
-bool
-VirtualGPU::submitKernelInternal(
- const amd::NDRangeContainer& sizes,
- const amd::Kernel& kernel,
- const_address parameters,
- void *eventHandle)
-{
- if (tools_lib_) {
- SetOclCorrelationHandle(tools_lib_, this->gpu_device_, eventHandle);
- }
-
- device::Kernel *devKernel = const_cast
- (kernel.getDeviceKernel(dev()));
- Kernel &gpuKernel = static_cast(*devKernel);
- const size_t compilerLdsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
- size_t ldsUsage = compilerLdsUsage;
-
- // Check memory dependency and SVM objects
- if (!processMemObjects(kernel, parameters)) {
- LogError("Wrong memory objects!");
- return false;
- }
-
- // Init PrintfDbg object if printf is enabled.
- bool printfEnabled = (gpuKernel.printfInfo().size() > 0) ? true : false;
- if (!printfDbg()->init(printfEnabled)){
- LogError("\nPrintfDbg object initialization failed!");
- return false;
- }
-
- // Allocate buffer to hold kernel arguments
- address argBuffer =
- (address)allocKernArg(gpuKernel.KernargSegmentByteSize(),
- gpuKernel.KernargSegmentAlignment());
-
- if (argBuffer == NULL) {
- LogError("Out of memory");
- return false;
- }
-
- address argPtr = argBuffer;
-
- // The HLC generates Kernenv arguments, first 3 are global offsets.
- const uint extraAargs = ((roc::Kernel*)devKernel)->extraArgumentsNum();
- for (uint j = 0; j < extraAargs; ++j) {
- // The 4th parameter is the pointer to print buffer
- if (3 == j) {
- address bufferPtr = printfDbg()->dbgBuffer();
- addArg(&argPtr, &bufferPtr, sizeof(void*));
- }else {
- const size_t offset = j < sizes.dimensions() ? sizes.offset()[j] : 0;
- addArg(&argPtr, &offset, sizeof(void*)); //Should be uint32_t for small model and uint64_t for large!
- }
- }
-
- const amd::KernelSignature& signature = kernel.signature();
- const amd::KernelParameters& kernelParams = kernel.parameters();
-
- // Find all parameters for the current kernel
- for (uint i = 0; i != signature.numParameters(); ++i) {
- const HsailKernelArg* arg = gpuKernel.hsailArgAt(i);
- const_address srcArgPtr = parameters + signature.at(i).offset_;
-
- if (arg->type_ == HSAIL_ARGTYPE_POINTER ) {
- const size_t size = sizeof(void*);
- if (arg->addrQual_ == HSAIL_ADDRESS_LOCAL) {
- ldsUsage = amd::alignUp(ldsUsage, arg->alignment_); //!< do we need this?
- addArg(&argPtr, &ldsUsage, size);
- ldsUsage += *reinterpret_cast(srcArgPtr);
- continue;
- }
- assert((arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) &&
- "Unsupported address qualifier");
- if (kernelParams.boundToSvmPointer(dev(), parameters, i)) {
- addArg(&argPtr, srcArgPtr, size);
- continue;
- }
- amd::Memory* mem = *reinterpret_cast(srcArgPtr);
- if (mem == NULL) {
- addArg(&argPtr, srcArgPtr, size);
- continue;
- }
-
- Memory *devMem = static_cast(mem->getDeviceMemory(dev()));
- //! @todo add multi-devices synchronization when supported.
- void* globalAddress = devMem->getDeviceMemory();
- addArg(&argPtr, &globalAddress, size);
-
- //! @todo Compiler has to return read/write attributes
- const cl_mem_flags flags = mem->getMemFlags();
- if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
- mem->signalWrite(&dev());
- }
- }
- else if (arg->type_ == HSAIL_ARGTYPE_VALUE) {
- if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) {
- void *mem = allocKernArg(arg->size_, arg->alignment_);
- if (mem == NULL) {
- LogError("Out of memory");
- return false;
- }
- memcpy(mem, srcArgPtr, arg->size_);
- addArg(&argPtr, &mem, sizeof(void*));
- continue;
- }
- for (uint e = 0; e < arg->numElem_; ++e) {
- addArg(&argPtr, srcArgPtr, arg->size_);
- srcArgPtr += arg->size_;
- }
- }
- else if (arg->type_ == HSAIL_ARGTYPE_IMAGE) {
- amd::Memory* mem = *reinterpret_cast(srcArgPtr);
- Image* image = static_cast(mem->getDeviceMemory(dev()));
- if (image == NULL) {
- LogError("Kernel image argument is not an image object");
- return false;
- }
-
- if (dev().settings().enableImageHandle_) {
- const uint64_t image_srd = image->getHsaImageObject().handle;
- assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
- addArg(&argPtr, &image_srd, sizeof(image_srd));
- }
- else {
- // Image arguments are of size 48 bytes and are aligned to 16 bytes
- addArg(&argPtr, (void *)image->getHsaImageObject().handle,
- HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
- }
-
- //! @todo Compiler has to return read/write attributes
- const cl_mem_flags flags = mem->getMemFlags();
- if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
- mem->signalWrite(&dev());
- }
- }
- else if (arg->type_ == HSAIL_ARGTYPE_SAMPLER) {
- amd::Sampler* sampler = *reinterpret_cast(srcArgPtr);
- if (sampler == NULL) {
- LogError("Kernel sampler argument is not an sampler object");
- return false;
- }
-
- hsa_ext_sampler_descriptor_t samplerDescriptor;
- fillSampleDescriptor(samplerDescriptor, *sampler);
-
- hsa_ext_sampler_t hsa_sampler;
- hsa_status_t status = hsa_ext_sampler_create(dev().getBackendDevice(),
- &samplerDescriptor, &hsa_sampler);
- if (status != HSA_STATUS_SUCCESS) {
- LogError("Error creating device sampler object!");
- return false;
- }
-
- if (dev().settings().enableImageHandle_) {
- uint64_t sampler_srd = hsa_sampler.handle;
- addArg(&argPtr, &sampler_srd, sizeof(sampler_srd));
- samplerList_.push_back(hsa_sampler);
- // TODO: destroy sampler.
- }
- else {
- argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT);
-
- memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE);
- argPtr += HSA_SAMPLER_OBJECT_SIZE;
- hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler);
- }
- }
- }
-
- // Check there is no arguments' buffer overflow
- assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize());
-
- // Check for group memory overflow
- //! @todo Check should be in HSA - here we should have at most an assert
- assert(roc_device_.info().localMemSizePerCU_ > 0);
- if (ldsUsage > roc_device_.info().localMemSizePerCU_) {
- LogError("No local memory available\n");
- return false;
- }
-
- //Initialize the dispatch Packet
- hsa_kernel_dispatch_packet_t dispatchPacket;
- memset(&dispatchPacket, 0, sizeof(dispatchPacket));
-
- dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle();
-
- dispatchPacket.header = aqlHeader_;
- dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
- dispatchPacket.grid_size_x = sizes.dimensions()>0 ? sizes.global()[0] : 1;
- dispatchPacket.grid_size_y = sizes.dimensions()>1 ? sizes.global()[1] : 1;
- dispatchPacket.grid_size_z = sizes.dimensions()>2 ? sizes.global()[2] : 1;
-
- const size_t* compile_size = devKernel->workGroupInfo()->compileSize_;
- if (sizes.local().product() != 0) {
- dispatchPacket.workgroup_size_x = sizes.dimensions()>0 ? sizes.local()[0] : 1;
- dispatchPacket.workgroup_size_y = sizes.dimensions()>1 ? sizes.local()[1] : 1;
- dispatchPacket.workgroup_size_z = sizes.dimensions()>2 ? sizes.local()[2] : 1;
- } else {
- setRuntimeCompilerLocalSize(dispatchPacket, sizes, compile_size, dev());
- }
- dispatchPacket.kernarg_address = argBuffer;
- dispatchPacket.group_segment_size = ldsUsage;
- dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;
-
- //Dispatch the packet
- if (!dispatchAqlPacket(&dispatchPacket, false)){
- return false;
- }
-
- // Mark the flag indicating if a dispatch is outstanding.
- // We are not waiting after every dispatch.
- hasPendingDispatch_ = true;
-
- // Output printf buffer
- if(!printfDbg()->output(*this, printfEnabled, gpuKernel.printfInfo())){
- LogError("\nCould not print data from the printf buffer!");
- return false;
- }
- return true;
-}
-/**
- * @brief Api to dispatch a kernel for execution. The implementation
- * parses the input object, an instance of virtual command to obtain
- * the parameters of global size, work group size, offsets of work
- * items, enable/disable profiling, etc.
- *
- * It also parses the kernel arguments buffer to inject into Hsa Runtime
- * the list of kernel parameters.
- */
-void VirtualGPU::submitKernel(amd::NDRangeKernelCommand &vcmd) {
- profilingBegin(vcmd);
-
- // Submit kernel to HW
- if (!submitKernelInternal(
- vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
- static_cast(as_cl(&vcmd.event())))) {
- LogError("AQL dispatch failed!");
- vcmd.setStatus(CL_INVALID_OPERATION);
- }
-
- profilingEnd(vcmd);
-}
-
-void VirtualGPU::submitNativeFn(amd::NativeFnCommand &cmd) {
- // std::cout<<__FUNCTION__<<" not implemented"<<"*********"<signal_, &time);
- start_ = time.start * ticksToTime_;
- end_ = time.end * ticksToTime_;
- profilingSignal_->ts_ = nullptr;
- profilingSignal_ = nullptr;
- }
- }
-
- // Start a timestamp (get timestamp from OS)
- void start() {
- start_ = amd::Os::timeNanos();
- }
-
- // End a timestamp (get timestamp from OS)
- void end() {
- end_ = amd::Os::timeNanos();
- }
-
- static void setGpuTicksToTime(double ticksToTime) { ticksToTime_=ticksToTime; }
- static double getGpuTicksToTime() { return ticksToTime_; }
-};
-
-class VirtualGPU : public device::VirtualDevice {
-public:
- //! Initial signal value
- static const hsa_signal_value_t InitSignalValue = 1;
-
- class MemoryDependency : public amd::EmbeddedObject
- {
- public:
- //! Default constructor
- MemoryDependency()
- : memObjectsInQueue_(NULL)
- , numMemObjectsInQueue_(0)
- , maxMemObjectsInQueue_(0) {}
-
- ~MemoryDependency() { delete [] memObjectsInQueue_; }
-
- //! Creates memory dependecy structure
- bool create(size_t numMemObj);
-
- //! Notify the tracker about new kernel
- void newKernel() { endMemObjectsInQueue_ = numMemObjectsInQueue_; }
-
- //! Validates memory object on dependency
- void validate(VirtualGPU& gpu, const Memory* memory, bool readOnly);
-
- //! Clear memory dependency
- void clear(bool all = true);
-
- private:
- struct MemoryState {
- uint64_t start_; //! Busy memory start address
- uint64_t end_; //! Busy memory end address
- bool readOnly_; //! Current GPU state in the queue
- };
-
- MemoryState* memObjectsInQueue_; //!< Memory object state in the queue
- size_t endMemObjectsInQueue_; //!< End of mem objects in the queue
- size_t numMemObjectsInQueue_; //!< Number of mem objects in the queue
- size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue
- };
-
- VirtualGPU(Device &device);
- ~VirtualGPU();
-
- bool create(bool profilingEna);
- bool terminate();
- const Device& dev() const { return roc_device_; }
-
- void profilingBegin(amd::Command &command, bool drmProfiling = false);
- void profilingEnd(amd::Command &command);
-
- void updateCommandsState(amd::Command* list);
-
- void submitReadMemory(amd::ReadMemoryCommand& cmd);
- void submitWriteMemory(amd::WriteMemoryCommand& cmd);
- void submitCopyMemory(amd::CopyMemoryCommand& cmd);
- void submitMapMemory(amd::MapMemoryCommand& cmd);
- void submitUnmapMemory(amd::UnmapMemoryCommand& cmd);
- void submitKernel(amd::NDRangeKernelCommand& cmd);
- bool submitKernelInternal(
- const amd::NDRangeContainer& sizes, //!< Workload sizes
- const amd::Kernel& kernel, //!< Kernel for execution
- const_address parameters, //!< Parameters for the kernel
- void *event_handle //!< Handle to OCL event for debugging
- );
- void submitNativeFn(amd::NativeFnCommand& cmd);
- void submitMarker(amd::Marker& cmd);
-
- void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& cmd);
- void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& cmd);
- void submitPerfCounter(amd::PerfCounterCommand& cmd){};
-
- void flush(amd::Command* list = NULL, bool wait = false);
- void submitFillMemory(amd::FillMemoryCommand& cmd);
- void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
-
-// { roc OpenCL integration
-// Added these stub (no-ops) implementation of pure virtual methods,
-// when integrating HSA and OpenCL branches.
-// TODO: After inegration, whoever is working on VirtualGPU should write
-// actual implemention.
- virtual void submitSignal(amd::SignalCommand &cmd) {}
- virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand &cmd) {}
-
- virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd);
- virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd);
- virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd);
- virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd);
- virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
-
- void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand &cmd) {}
- void submitThreadTrace(amd::ThreadTraceCommand &vcmd) {}
-
- /**
- * @brief Waits on an outstanding kernel without regard to how
- * it was dispatched - with or without a signal
- *
- * @return bool true if Wait returned successfully, false
- * otherwise
- */
- bool releaseGpuMemoryFence();
-
- hsa_agent_t gpu_device() { return gpu_device_; }
- hsa_queue_t* gpu_queue() { return gpu_queue_; }
-
- // Return pointer to PrintfDbg
- PrintfDbg* printfDbg() const {return printfdbg_;}
-
- //! Returns memory dependency class
- MemoryDependency& memoryDependency() { return memoryDependency_; }
-
- //! Detects memory dependency for HSAIL kernels and uses appropriate AQL header
- bool processMemObjects(
- const amd::Kernel& kernel, //!< AMD kernel object for execution
- const_address params //!< Pointer to the param's store
- );
-
-// } roc OpenCL integration
-private:
- bool dispatchAqlPacket(
- hsa_kernel_dispatch_packet_t* packet, bool blocking = true);
- bool dispatchAqlPacket(
- hsa_barrier_and_packet_t* packet, bool blocking = true);
- template bool dispatchGenericAqlPacket(
- AqlPacket* packet, bool blocking);
- void dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet);
- void initializeDispatchPacket(hsa_kernel_dispatch_packet_t* packet,
- amd::NDRangeContainer& sizes);
-
- bool initPool(size_t kernarg_pool_size, uint signal_pool_count);
- void destroyPool();
-
- void* allocKernArg(size_t size, size_t alignment);
- void resetKernArgPool() { kernarg_pool_cur_offset_ = 0; }
-
- //! Updates AQL header for the upcomming dispatch
- void setAqlHeader(uint16_t header) { aqlHeader_ = header; }
-
- /**
- * @brief Maintains the list of sampler allocated for one or more kernel
- * submissions.
- */
- std::vector samplerList_;
-
- /**
- * @brief Indicates if a kernel dispatch is outstanding. This flag is
- * used to synchronized on kernel outputs.
- */
- bool hasPendingDispatch_;
- Timestamp* timestamp_;
- hsa_agent_t gpu_device_; //!< Physical device
- hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu
- hsa_barrier_and_packet_t barrier_packet_;
- hsa_signal_t barrier_signal_;
- uint32_t dispatch_id_; //!< This variable must be updated atomically.
- Device& roc_device_; //!< roc device object
- void * tools_lib_;
- PrintfDbg* printfdbg_;
- MemoryDependency memoryDependency_; //!< Memory dependency class
- uint16_t aqlHeader_; //!< AQL header for dispatch
-
- char* kernarg_pool_base_;
- size_t kernarg_pool_size_;
- uint kernarg_pool_cur_offset_;
-
- std::vector signal_pool_; //!< Pool of signals for profiling
-
- friend class Timestamp;
-};
-}
-