From d17dc21c9cd46240b4ff7695723095c3deec55f9 Mon Sep 17 00:00:00 2001 From: Yaxun Sam Liu Date: Tue, 24 Jul 2018 18:12:32 -0400 Subject: [PATCH] Support malloc/free for hip-clang --- hipamd/docs/markdown/hip_programming_guide.md | 16 +- .../include/hip/hcc_detail/device_functions.h | 1 + hipamd/include/hip/hcc_detail/hip_memory.h | 102 ++++++++++ hipamd/include/hip/hcc_detail/hip_runtime.h | 13 +- hipamd/src/device_util.cpp | 64 ------ hipamd/src/device_util.h | 11 - hipamd/src/hip_device.cpp | 2 +- .../tests/src/deviceLib/hipDeviceMalloc.cpp | 190 ++++++++++++++++++ 8 files changed, 317 insertions(+), 82 deletions(-) create mode 100644 hipamd/include/hip/hcc_detail/hip_memory.h create mode 100644 hipamd/tests/src/deviceLib/hipDeviceMalloc.cpp diff --git a/hipamd/docs/markdown/hip_programming_guide.md b/hipamd/docs/markdown/hip_programming_guide.md index 9313eb22e1..52d250cab5 100644 --- a/hipamd/docs/markdown/hip_programming_guide.md +++ b/hipamd/docs/markdown/hip_programming_guide.md @@ -91,8 +91,22 @@ Setting HCC_UNPINNED_COPY_MODE = 3, forces all unpinned transfer to use direct m Following environment variables can be used to control the transfer thresholds: -- HCC_H2D_STAGING_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. By default it is set to 64. +- HCC_H2D_STAGING_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. By default it is set to 64. - HCC_H2D_PININPLACE_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic. By default it is set to 4096. - HCC_D2H_PININPLACE_THRESHOLD - Threshold in KB for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic. By default it is set to 1024. + +## Device-Side Malloc + +hip-hcc and hip-clang supports device-side malloc and free. Users can allocate +memory dynamically in a kernel. The allocated memory are in global address +space, however, different threads get different memory allocations for the same +call of malloc. The allocated memory can be accessed or freed by other threads +or other kernels. It persists in the life time of the HIP program until it is +freed. + +The memory are allocated in pages. Users can define macro +`__HIP_SIZE_OF_PAGE` for controlling the page size in bytes and macro +`__HIP_NUM_PAGES` for controlling the total number of pages that can be +allocated. \ No newline at end of file diff --git a/hipamd/include/hip/hcc_detail/device_functions.h b/hipamd/include/hip/hcc_detail/device_functions.h index 6455fa6cd1..34a9a194e2 100644 --- a/hipamd/include/hip/hcc_detail/device_functions.h +++ b/hipamd/include/hip/hcc_detail/device_functions.h @@ -1029,4 +1029,5 @@ static inline __device__ void* memset(void* ptr, int val, size_t size) { unsigned char val8 = static_cast(val); return __hip_hc_memset(ptr, val8, size); } + #endif diff --git a/hipamd/include/hip/hcc_detail/hip_memory.h b/hipamd/include/hip/hcc_detail/hip_memory.h new file mode 100644 index 0000000000..9167baba38 --- /dev/null +++ b/hipamd/include/hip/hcc_detail/hip_memory.h @@ -0,0 +1,102 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H +#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H + +// Implementation of malloc and free device functions. +// HIP heap is implemented as a global array with fixed size. Users may define +// __HIP_SIZE_OF_PAGE and __HIP_NUM_PAGES to have a larger heap. + +// Size of page in bytes. +#ifndef __HIP_SIZE_OF_PAGE +#define __HIP_SIZE_OF_PAGE 64 +#endif + +// Total number of pages +#ifndef __HIP_NUM_PAGES +#define __HIP_NUM_PAGES (16 * 64 * 64) +#endif + +#define __HIP_SIZE_OF_HEAP (__HIP_NUM_PAGES * __HIP_SIZE_OF_PAGE) + +__attribute__((weak)) __device__ char __hip_device_heap[__HIP_SIZE_OF_HEAP]; +__attribute__((weak)) __device__ + uint32_t __hip_device_page_flag[__HIP_NUM_PAGES]; + +extern "C" inline __device__ void* __hip_malloc(size_t size) { + char* heap = (char*)__hip_device_heap; + if (size > __HIP_SIZE_OF_HEAP) { + return (void*)nullptr; + } + uint32_t totalThreads = + hipBlockDim_x * hipGridDim_x * hipBlockDim_y + * hipGridDim_y * hipBlockDim_z * hipGridDim_z; + uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x; + + uint32_t numHeapsPerWorkItem = __HIP_NUM_PAGES / totalThreads; + uint32_t heapSizePerWorkItem = __HIP_SIZE_OF_HEAP / totalThreads; + + uint32_t stride = size / __HIP_SIZE_OF_PAGE; + uint32_t start = numHeapsPerWorkItem * currentWorkItem; + + uint32_t k = 0; + + while (__hip_device_page_flag[k] > 0) { + k++; + } + + for (uint32_t i = 0; i < stride - 1; i++) { + __hip_device_page_flag[i + start + k] = 1; + } + + __hip_device_page_flag[start + stride - 1 + k] = 2; + + void* ptr = (void*)(heap + + heapSizePerWorkItem * currentWorkItem + k * __HIP_SIZE_OF_PAGE); + + return ptr; +} + +extern "C" inline __device__ void* __hip_free(void* ptr) { + if (ptr == nullptr) { + return nullptr; + } + + uint32_t offsetByte = (uint64_t)ptr - (uint64_t)__hip_device_heap; + uint32_t offsetPage = offsetByte / __HIP_SIZE_OF_PAGE; + + while (__hip_device_page_flag[offsetPage] != 0) { + if (__hip_device_page_flag[offsetPage] == 2) { + __hip_device_page_flag[offsetPage] = 0; + offsetPage++; + break; + } else { + __hip_device_page_flag[offsetPage] = 0; + offsetPage++; + } + } + + return nullptr; +} + +#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H diff --git a/hipamd/include/hip/hcc_detail/hip_runtime.h b/hipamd/include/hip/hcc_detail/hip_runtime.h index c2ae6e8e4f..8734feaf5d 100644 --- a/hipamd/include/hip/hcc_detail/hip_runtime.h +++ b/hipamd/include/hip/hcc_detail/hip_runtime.h @@ -260,11 +260,11 @@ static constexpr Coordinates threadIdx; #endif // defined __HCC__ #if __HCC_OR_HIP_CLANG__ -extern "C" __device__ void* __hip_hc_malloc(size_t); -extern "C" __device__ void* __hip_hc_free(void* ptr); +extern "C" __device__ void* __hip_malloc(size_t); +extern "C" __device__ void* __hip_free(void* ptr); -static inline __device__ void* malloc(size_t size) { return __hip_hc_malloc(size); } -static inline __device__ void* free(void* ptr) { return __hip_hc_free(ptr); } +static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); } +static inline __device__ void* free(void* ptr) { return __hip_free(ptr); } #ifdef __HCC_ACCELERATOR__ @@ -438,6 +438,8 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim; #define hipGridDim_y gridDim.y #define hipGridDim_z gridDim.z +#include + // Support std::complex. #pragma push_macro("__CUDA__") #define __CUDA__ @@ -448,8 +450,9 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim; #undef __CUDA__ #pragma pop_macro("__CUDA__") -#include #endif +#include + #endif // HIP_HCC_DETAIL_RUNTIME_H diff --git a/hipamd/src/device_util.cpp b/hipamd/src/device_util.cpp index 87fbe0fcbc..7fa77dc5fe 100644 --- a/hipamd/src/device_util.cpp +++ b/hipamd/src/device_util.cpp @@ -28,70 +28,6 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include -//================================================================================================= -/* - Implementation of malloc and free device functions. - - This is the best place to put them because the device - global variables need to be initialized at the start. -*/ -__device__ char gpuHeap[SIZE_OF_HEAP]; -__device__ uint32_t gpuFlags[NUM_PAGES]; - -__device__ void* __hip_hc_malloc(size_t size) { - char* heap = (char*)gpuHeap; - if (size > SIZE_OF_HEAP) { - return (void*)nullptr; - } - uint32_t totalThreads = - blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z; - uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x; - - uint32_t numHeapsPerWorkItem = NUM_PAGES / totalThreads; - uint32_t heapSizePerWorkItem = SIZE_OF_HEAP / totalThreads; - - uint32_t stride = size / SIZE_OF_PAGE; - uint32_t start = numHeapsPerWorkItem * currentWorkItem; - - uint32_t k = 0; - - while (gpuFlags[k] > 0) { - k++; - } - - for (uint32_t i = 0; i < stride - 1; i++) { - gpuFlags[i + start + k] = 1; - } - - gpuFlags[start + stride - 1 + k] = 2; - - void* ptr = (void*)(heap + heapSizePerWorkItem * currentWorkItem + k * SIZE_OF_PAGE); - - return ptr; -} - -__device__ void* __hip_hc_free(void* ptr) { - if (ptr == nullptr) { - return nullptr; - } - - uint32_t offsetByte = (uint64_t)ptr - (uint64_t)gpuHeap; - uint32_t offsetPage = offsetByte / SIZE_OF_PAGE; - - while (gpuFlags[offsetPage] != 0) { - if (gpuFlags[offsetPage] == 2) { - gpuFlags[offsetPage] = 0; - offsetPage++; - break; - } else { - gpuFlags[offsetPage] = 0; - offsetPage++; - } - } - - return nullptr; -} - // abort __device__ void abort() { return hc::abort(); } diff --git a/hipamd/src/device_util.h b/hipamd/src/device_util.h index 8fa96da9d9..84dbbf71ed 100644 --- a/hipamd/src/device_util.h +++ b/hipamd/src/device_util.h @@ -29,14 +29,6 @@ THE SOFTWARE. Heap size computation for malloc and free device functions. */ -#define NUM_PAGES_PER_THREAD 16 -#define SIZE_OF_PAGE 64 -#define NUM_THREADS_PER_CU 64 -#define NUM_CUS_PER_GPU 64 // Specific for r9 Nano -#define NUM_PAGES NUM_PAGES_PER_THREAD* NUM_THREADS_PER_CU* NUM_CUS_PER_GPU -#define SIZE_MALLOC NUM_PAGES* SIZE_OF_PAGE -#define SIZE_OF_HEAP SIZE_MALLOC - #define HIP_SQRT_2 1.41421356237 #define HIP_SQRT_PI 1.77245385091 @@ -62,9 +54,6 @@ THE SOFTWARE. #define HIP_PI 3.14159265358979323846 -__device__ void* __hip_hc_malloc(size_t size); -__device__ void* __hip_hc_free(void* ptr); - __device__ float __hip_erfinvf(float x); __device__ double __hip_erfinv(double x); diff --git a/hipamd/src/hip_device.cpp b/hipamd/src/hip_device.cpp index 72150c3f54..2aae7cf2a8 100644 --- a/hipamd/src/hip_device.cpp +++ b/hipamd/src/hip_device.cpp @@ -99,7 +99,7 @@ hipError_t hipDeviceGetLimit(size_t* pValue, hipLimit_t limit) { return ihipLogStatus(hipErrorInvalidValue); } if (limit == hipLimitMallocHeapSize) { - *pValue = (size_t)SIZE_OF_HEAP; + *pValue = (size_t)__HIP_SIZE_OF_HEAP; return ihipLogStatus(hipSuccess); } else { return ihipLogStatus(hipErrorUnsupportedLimit); diff --git a/hipamd/tests/src/deviceLib/hipDeviceMalloc.cpp b/hipamd/tests/src/deviceLib/hipDeviceMalloc.cpp new file mode 100644 index 0000000000..8eb8cdcc3c --- /dev/null +++ b/hipamd/tests/src/deviceLib/hipDeviceMalloc.cpp @@ -0,0 +1,190 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +/* HIT_START + * BUILD: %t %s NVCC_OPTIONS -std=c++11 + * RUN: %t EXCLUDE_HIP_PLATFORM nvcc + * HIT_END + */ +#include "test_common.h" +#include +#include + +// Tolerance for error +const double tolerance = 1e-6; +const bool verbose = false; + +#define LEN 64 + +#define ALL_FUN \ + OP(add) \ + OP(sub) \ + OP(mul) \ + OP(div) + +#define OP(x) CK_##x, +enum CalcKind { + ALL_FUN +}; +#undef OP + +#define OP(x) case CK_##x: return #x; +std::string getName(enum CalcKind CK) { + switch(CK){ + ALL_FUN + } +} +#undef OP + +// Calculates function. +// If the function has one argument, B is ignored. +// If the function returns real number, converts it to a complex number. +#define ONE_ARG(func) \ + case CK_##func: \ + return std::complex(std::func(A)); + +template +__device__ __host__ std::complex calc(std::complex A, + std::complex B, + enum CalcKind CK) { + switch(CK) { + case CK_add: + return A + B; + case CK_sub: + return A - B; + case CK_mul: + return A * B; + case CK_div: + return A / B; + + } +} + +// Allocate memory in kernel and save the address to pA and pB. +// Copy value from A, B to allocated memory. +template +__global__ void kernel_alloc(std::complex* A, + std::complex* B, + std::complex** pA, + std::complex** pB) { + typedef std::complex CFloatT; + int tx = threadIdx.x + blockIdx.x * blockDim.x; + if (tx == 0) { + *pA = (CFloatT*)malloc(sizeof(CFloatT)*LEN); + *pB = (CFloatT*)malloc(sizeof(CFloatT)*LEN); + for (int i = 0; i < LEN; i++) { + (*pA)[i] = A[i]; + (*pB)[i] = B[i]; + } + } +} + +// Do calculation using values saved in allocated memmory. pA, pB are buffers +// containing the address of the device-side allocated array. +template +__global__ void kernel_free(std::complex** pA, + std::complex** pB, std::complex* C, + enum CalcKind CK) { + typedef std::complex CFloatT; + int tx = threadIdx.x + blockIdx.x * blockDim.x; + C[tx] = calc((*pA)[tx], (*pB)[tx], CK); + if (tx == 0) { + free(*pA); + free(*pB); + } +} + +template +void test() { + typedef std::complex ComplexT; + ComplexT *A, *Ad, *B, *Bd, *C, *Cd, *D; + A = new ComplexT[LEN]; + B = new ComplexT[LEN]; + C = new ComplexT[LEN]; + D = new ComplexT[LEN]; + hipMalloc((void**)&Ad, sizeof(ComplexT)*LEN); + hipMalloc((void**)&Bd, sizeof(ComplexT)*LEN); + hipMalloc((void**)&Cd, sizeof(ComplexT)*LEN); + + for (uint32_t i = 0; i < LEN; i++) { + A[i] = ComplexT((i + 1) * 1.0f, (i + 2) * 1.0f); + B[i] = A[i]; + C[i] = A[i]; + } + hipMemcpy(Ad, A, sizeof(ComplexT)*LEN, hipMemcpyHostToDevice); + hipMemcpy(Bd, B, sizeof(ComplexT)*LEN, hipMemcpyHostToDevice); + + // Run kernel for a calculation kind and verify by comparing with host + // calculation result. Returns false if fails. + auto test_fun = [&](enum CalcKind CK) { + // kernel_alloc allocates memory on device side and initialize it. + // kernel_free uses allocated memory from kernel_alloc and does the + // calculation then free the memory. + // pA and pB are buffers to pass the device-side allocated memory address + // from kernel_alloc to kernel_free. + ComplexT **pA, **pB; + hipMalloc((ComplexT***)&pA, sizeof(ComplexT*)); + hipMalloc((ComplexT***)&pB, sizeof(ComplexT*)); + hipLaunchKernelGGL(kernel_alloc, dim3(1), dim3(LEN), 0, 0, + Ad, Bd, pA, pB); + hipDeviceSynchronize(); + hipLaunchKernelGGL(kernel_free, dim3(1), dim3(LEN), 0, 0, + pA, pB, Cd, CK); + hipMemcpy(C, Cd, sizeof(ComplexT)*LEN, hipMemcpyDeviceToHost); + hipFree(pA); + hipFree(pB); + for (int i = 0; i < LEN; i++) { + ComplexT Expected = calc(A[i], B[i], CK); + FloatT error = std::abs(C[i] - Expected); + if (std::abs(Expected) > tolerance) + error /= std::abs(Expected); + bool pass = error < tolerance; + if (verbose || !pass) { + std::cout << "Function: " << getName(CK) + << " Operands: " << A[i] << " " << B[i] + << " Result: " << C[i] + << " Expected: " << Expected + << " Error: " << error + << " Pass: " << pass + << std::endl; + } + if (!pass) + return false; + } + return true; + }; + +#define OP(x) assert(test_fun(CK_##x)); + ALL_FUN +#undef OP + + hipFree(Ad); + hipFree(Bd); + hipFree(Cd); + delete[] A; + delete[] B; + delete[] C; + delete[] D; +} + +int main() { + test(); + test(); + passed(); + return 0; +}