Support malloc/free for hip-clang

2018-07-24 18:12:32 -04:00
@@ -91,8 +91,22 @@ Setting HCC_UNPINNED_COPY_MODE = 3, forces all unpinned transfer to use direct m
 
 Following environment variables can be used to control the transfer thresholds:

-   HCC_H2D_STAGING_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. By default it is set to 64.
+-   HCC_H2D_STAGING_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. By default it is set to 64.
   
 -   HCC_H2D_PININPLACE_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic. By default it is set to 4096.

 -   HCC_D2H_PININPLACE_THRESHOLD  - Threshold in KB for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic. By default it is set to 1024.
+
+## Device-Side Malloc
+
+hip-hcc and hip-clang supports device-side malloc and free. Users can allocate
+memory dynamically in a kernel. The allocated memory are in global address
+space, however, different threads get different memory allocations for the same
+call of malloc. The allocated memory can be accessed or freed by other threads
+or other kernels. It persists in the life time of the HIP program until it is
+freed.
+
+The memory are allocated in pages. Users can define macro
+`__HIP_SIZE_OF_PAGE` for controlling the page size in bytes and macro
+`__HIP_NUM_PAGES` for controlling the total number of pages that can be
+allocated.
@@ -1029,4 +1029,5 @@ static inline __device__ void* memset(void* ptr, int val, size_t size) {
    unsigned char val8 = static_cast<unsigned char>(val);
    return __hip_hc_memset(ptr, val8, size);
 }
+
 #endif
@@ -0,0 +1,102 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H
+
+// Implementation of malloc and free device functions.
+// HIP heap is implemented as a global array with fixed size. Users may define
+// __HIP_SIZE_OF_PAGE and __HIP_NUM_PAGES to have a larger heap.
+
+// Size of page in bytes.
+#ifndef __HIP_SIZE_OF_PAGE
+#define __HIP_SIZE_OF_PAGE 64
+#endif
+
+// Total number of pages
+#ifndef __HIP_NUM_PAGES
+#define __HIP_NUM_PAGES (16 * 64 * 64)
+#endif
+
+#define __HIP_SIZE_OF_HEAP (__HIP_NUM_PAGES * __HIP_SIZE_OF_PAGE)
+
+__attribute__((weak)) __device__ char __hip_device_heap[__HIP_SIZE_OF_HEAP];
+__attribute__((weak)) __device__
+    uint32_t __hip_device_page_flag[__HIP_NUM_PAGES];
+
+extern "C" inline __device__ void* __hip_malloc(size_t size) {
+    char* heap = (char*)__hip_device_heap;
+    if (size > __HIP_SIZE_OF_HEAP) {
+        return (void*)nullptr;
+    }
+    uint32_t totalThreads =
+        hipBlockDim_x * hipGridDim_x * hipBlockDim_y
+        * hipGridDim_y * hipBlockDim_z * hipGridDim_z;
+    uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x;
+
+    uint32_t numHeapsPerWorkItem = __HIP_NUM_PAGES / totalThreads;
+    uint32_t heapSizePerWorkItem = __HIP_SIZE_OF_HEAP / totalThreads;
+
+    uint32_t stride = size / __HIP_SIZE_OF_PAGE;
+    uint32_t start = numHeapsPerWorkItem * currentWorkItem;
+
+    uint32_t k = 0;
+
+    while (__hip_device_page_flag[k] > 0) {
+        k++;
+    }
+
+    for (uint32_t i = 0; i < stride - 1; i++) {
+        __hip_device_page_flag[i + start + k] = 1;
+    }
+
+    __hip_device_page_flag[start + stride - 1 + k] = 2;
+
+    void* ptr = (void*)(heap
+        + heapSizePerWorkItem * currentWorkItem + k * __HIP_SIZE_OF_PAGE);
+
+    return ptr;
+}
+
+extern "C" inline __device__ void* __hip_free(void* ptr) {
+    if (ptr == nullptr) {
+        return nullptr;
+    }
+
+    uint32_t offsetByte = (uint64_t)ptr - (uint64_t)__hip_device_heap;
+    uint32_t offsetPage = offsetByte / __HIP_SIZE_OF_PAGE;
+
+    while (__hip_device_page_flag[offsetPage] != 0) {
+        if (__hip_device_page_flag[offsetPage] == 2) {
+            __hip_device_page_flag[offsetPage] = 0;
+            offsetPage++;
+            break;
+        } else {
+            __hip_device_page_flag[offsetPage] = 0;
+            offsetPage++;
+        }
+    }
+
+    return nullptr;
+}
+
+#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H
@@ -260,11 +260,11 @@ static constexpr Coordinates<hc_get_workitem_id> threadIdx;

 #endif // defined __HCC__
 #if __HCC_OR_HIP_CLANG__
-extern "C" __device__ void* __hip_hc_malloc(size_t);
-extern "C" __device__ void* __hip_hc_free(void* ptr);
+extern "C" __device__ void* __hip_malloc(size_t);
+extern "C" __device__ void* __hip_free(void* ptr);

-static inline __device__ void* malloc(size_t size) { return __hip_hc_malloc(size); }
-static inline __device__ void* free(void* ptr) { return __hip_hc_free(ptr); }
+static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
+static inline __device__ void* free(void* ptr) { return __hip_free(ptr); }

 #ifdef __HCC_ACCELERATOR__

@@ -438,6 +438,8 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
 #define hipGridDim_y gridDim.y
 #define hipGridDim_z gridDim.z

+#include <hip/hcc_detail/math_functions.h>
+
 // Support std::complex.
 #pragma push_macro("__CUDA__")
 #define __CUDA__
@@ -448,8 +450,9 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
 #undef __CUDA__
 #pragma pop_macro("__CUDA__")

-#include <hip/hcc_detail/math_functions.h>

 #endif

+#include <hip/hcc_detail/hip_memory.h>
+
 #endif  // HIP_HCC_DETAIL_RUNTIME_H
@@ -28,70 +28,6 @@ THE SOFTWARE.
 #include "hip/hip_runtime.h"
 #include <atomic>

-//=================================================================================================
-/*
- Implementation of malloc and free device functions.
-
- This is the best place to put them because the device
- global variables need to be initialized at the start.
-*/
-__device__ char gpuHeap[SIZE_OF_HEAP];
-__device__ uint32_t gpuFlags[NUM_PAGES];
-
-__device__ void* __hip_hc_malloc(size_t size) {
-    char* heap = (char*)gpuHeap;
-    if (size > SIZE_OF_HEAP) {
-        return (void*)nullptr;
-    }
-    uint32_t totalThreads =
-        blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z;
-    uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x;
-
-    uint32_t numHeapsPerWorkItem = NUM_PAGES / totalThreads;
-    uint32_t heapSizePerWorkItem = SIZE_OF_HEAP / totalThreads;
-
-    uint32_t stride = size / SIZE_OF_PAGE;
-    uint32_t start = numHeapsPerWorkItem * currentWorkItem;
-
-    uint32_t k = 0;
-
-    while (gpuFlags[k] > 0) {
-        k++;
-    }
-
-    for (uint32_t i = 0; i < stride - 1; i++) {
-        gpuFlags[i + start + k] = 1;
-    }
-
-    gpuFlags[start + stride - 1 + k] = 2;
-
-    void* ptr = (void*)(heap + heapSizePerWorkItem * currentWorkItem + k * SIZE_OF_PAGE);
-
-    return ptr;
-}
-
-__device__ void* __hip_hc_free(void* ptr) {
-    if (ptr == nullptr) {
-        return nullptr;
-    }
-
-    uint32_t offsetByte = (uint64_t)ptr - (uint64_t)gpuHeap;
-    uint32_t offsetPage = offsetByte / SIZE_OF_PAGE;
-
-    while (gpuFlags[offsetPage] != 0) {
-        if (gpuFlags[offsetPage] == 2) {
-            gpuFlags[offsetPage] = 0;
-            offsetPage++;
-            break;
-        } else {
-            gpuFlags[offsetPage] = 0;
-            offsetPage++;
-        }
-    }
-
-    return nullptr;
-}
-
 // abort
 __device__ void abort() { return hc::abort(); }

@@ -29,14 +29,6 @@ THE SOFTWARE.
 Heap size computation for malloc and free device functions.
 */

-#define NUM_PAGES_PER_THREAD 16
-#define SIZE_OF_PAGE 64
-#define NUM_THREADS_PER_CU 64
-#define NUM_CUS_PER_GPU 64  // Specific for r9 Nano
-#define NUM_PAGES NUM_PAGES_PER_THREAD* NUM_THREADS_PER_CU* NUM_CUS_PER_GPU
-#define SIZE_MALLOC NUM_PAGES* SIZE_OF_PAGE
-#define SIZE_OF_HEAP SIZE_MALLOC
-
 #define HIP_SQRT_2 1.41421356237
 #define HIP_SQRT_PI 1.77245385091

@@ -62,9 +54,6 @@ THE SOFTWARE.

 #define HIP_PI 3.14159265358979323846

-__device__ void* __hip_hc_malloc(size_t size);
-__device__ void* __hip_hc_free(void* ptr);
-
 __device__ float __hip_erfinvf(float x);
 __device__ double __hip_erfinv(double x);

@@ -99,7 +99,7 @@ hipError_t hipDeviceGetLimit(size_t* pValue, hipLimit_t limit) {
        return ihipLogStatus(hipErrorInvalidValue);
    }
    if (limit == hipLimitMallocHeapSize) {
-        *pValue = (size_t)SIZE_OF_HEAP;
+        *pValue = (size_t)__HIP_SIZE_OF_HEAP;
        return ihipLogStatus(hipSuccess);
    } else {
        return ihipLogStatus(hipErrorUnsupportedLimit);
@@ -0,0 +1,190 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+/* HIT_START
+ * BUILD: %t %s NVCC_OPTIONS -std=c++11
+ * RUN: %t EXCLUDE_HIP_PLATFORM nvcc
+ * HIT_END
+ */
+#include "test_common.h"
+#include <iostream>
+#include <complex>
+
+// Tolerance for error
+const double tolerance = 1e-6;
+const bool verbose = false;
+
+#define LEN 64
+
+#define ALL_FUN \
+  OP(add) \
+  OP(sub) \
+  OP(mul) \
+  OP(div)
+
+#define OP(x) CK_##x,
+enum CalcKind {
+  ALL_FUN
+};
+#undef OP
+
+#define OP(x) case CK_##x: return #x;
+std::string getName(enum CalcKind CK) {
+  switch(CK){
+  ALL_FUN
+  }
+}
+#undef OP
+
+// Calculates function.
+// If the function has one argument, B is ignored.
+// If the function returns real number, converts it to a complex number.
+#define ONE_ARG(func) \
+  case CK_##func: \
+    return std::complex<FloatT>(std::func(A));
+
+template<typename FloatT>
+__device__ __host__ std::complex<FloatT> calc(std::complex<FloatT> A,
+                                              std::complex<FloatT> B,
+                                              enum CalcKind CK) {
+  switch(CK) {
+  case CK_add:
+    return A + B;
+  case CK_sub:
+    return A - B;
+  case CK_mul:
+    return A * B;
+  case CK_div:
+    return A / B;
+    
+  }
+}
+
+// Allocate memory in kernel and save the address to pA and pB.
+// Copy value from A, B to allocated memory.
+template<typename FloatT>
+__global__ void kernel_alloc(std::complex<FloatT>* A,
+                             std::complex<FloatT>* B,
+                             std::complex<FloatT>** pA,
+                             std::complex<FloatT>** pB) {
+    typedef std::complex<FloatT> CFloatT;
+    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (tx == 0) {
+        *pA = (CFloatT*)malloc(sizeof(CFloatT)*LEN);
+        *pB = (CFloatT*)malloc(sizeof(CFloatT)*LEN);
+        for (int i = 0; i < LEN; i++) {
+            (*pA)[i] = A[i];
+            (*pB)[i] = B[i];
+      }
+    }
+}
+
+// Do calculation using values saved in allocated memmory. pA, pB are buffers
+// containing the address of the device-side allocated array.
+template<typename FloatT>
+__global__ void kernel_free(std::complex<FloatT>** pA,
+                            std::complex<FloatT>** pB, std::complex<FloatT>* C,
+                            enum CalcKind CK) {
+    typedef std::complex<FloatT> CFloatT;
+    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    C[tx] = calc<FloatT>((*pA)[tx], (*pB)[tx], CK);
+    if (tx == 0) {
+        free(*pA);
+        free(*pB);
+    }
+}
+
+template<typename FloatT>
+void test() {
+    typedef std::complex<FloatT> ComplexT;
+    ComplexT *A, *Ad, *B, *Bd, *C, *Cd, *D;
+    A = new ComplexT[LEN];
+    B = new ComplexT[LEN];
+    C = new ComplexT[LEN];
+    D = new ComplexT[LEN];
+    hipMalloc((void**)&Ad, sizeof(ComplexT)*LEN);
+    hipMalloc((void**)&Bd, sizeof(ComplexT)*LEN);
+    hipMalloc((void**)&Cd, sizeof(ComplexT)*LEN);
+
+    for (uint32_t i = 0; i < LEN; i++) {
+        A[i] = ComplexT((i + 1) * 1.0f, (i + 2) * 1.0f);
+        B[i] = A[i];
+        C[i] = A[i];
+    }
+    hipMemcpy(Ad, A, sizeof(ComplexT)*LEN, hipMemcpyHostToDevice);
+    hipMemcpy(Bd, B, sizeof(ComplexT)*LEN, hipMemcpyHostToDevice);
+
+    // Run kernel for a calculation kind and verify by comparing with host
+    // calculation result. Returns false if fails.
+    auto test_fun = [&](enum CalcKind CK) {
+      // kernel_alloc allocates memory on device side and initialize it.
+      // kernel_free uses allocated memory from kernel_alloc and does the
+      // calculation then free the memory.
+      // pA and pB are buffers to pass the device-side allocated memory address
+      // from kernel_alloc to kernel_free.
+      ComplexT **pA, **pB;
+      hipMalloc((ComplexT***)&pA, sizeof(ComplexT*));
+      hipMalloc((ComplexT***)&pB, sizeof(ComplexT*));
+      hipLaunchKernelGGL(kernel_alloc<FloatT>, dim3(1), dim3(LEN), 0, 0,
+          Ad, Bd, pA, pB);
+      hipDeviceSynchronize();
+      hipLaunchKernelGGL(kernel_free<FloatT>, dim3(1), dim3(LEN), 0, 0,
+          pA, pB, Cd, CK);
+      hipMemcpy(C, Cd, sizeof(ComplexT)*LEN, hipMemcpyDeviceToHost);
+      hipFree(pA);
+      hipFree(pB);
+      for (int i = 0; i < LEN; i++) {
+        ComplexT Expected = calc(A[i], B[i], CK);
+        FloatT error = std::abs(C[i] - Expected);
+        if (std::abs(Expected) > tolerance)
+          error /= std::abs(Expected);
+        bool pass = error < tolerance;
+        if (verbose || !pass) {
+          std::cout << "Function: " << getName(CK)
+                    << " Operands: " << A[i] << " " << B[i]
+                    << " Result: " << C[i]
+                    << " Expected: " << Expected
+                    << " Error: " << error
+                    << " Pass: " << pass
+                    << std::endl;
+        }
+        if (!pass)
+          return false;
+      }
+      return true;
+    };
+
+#define OP(x) assert(test_fun(CK_##x));
+    ALL_FUN
+#undef OP
+
+    hipFree(Ad);
+    hipFree(Bd);
+    hipFree(Cd);
+    delete[] A;
+    delete[] B;
+    delete[] C;
+    delete[] D;
+}
+
+int main() {
+  test<float>();
+  test<double>();
+  passed();
+  return 0;
+}