Support malloc/free for hip-clang

Bu işleme şunda yer alıyor:
Yaxun Sam Liu
2018-07-24 18:12:32 -04:00
ebeveyn 17f612def1
işleme d17dc21c9c
8 değiştirilmiş dosya ile 317 ekleme ve 82 silme
+15 -1
Dosyayı Görüntüle
@@ -91,8 +91,22 @@ Setting HCC_UNPINNED_COPY_MODE = 3, forces all unpinned transfer to use direct m
Following environment variables can be used to control the transfer thresholds:
- HCC_H2D_STAGING_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. By default it is set to 64.
- HCC_H2D_STAGING_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. By default it is set to 64.
- HCC_H2D_PININPLACE_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic. By default it is set to 4096.
- HCC_D2H_PININPLACE_THRESHOLD - Threshold in KB for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic. By default it is set to 1024.
## Device-Side Malloc
hip-hcc and hip-clang support device-side malloc and free. Users can allocate
memory dynamically in a kernel. The allocated memory is in the global address
space; however, different threads get different memory allocations for the same
call of malloc. The allocated memory can be accessed or freed by other threads
or other kernels. It persists for the lifetime of the HIP program until it is
freed.
The memory is allocated in pages. Users can define the macro
`__HIP_SIZE_OF_PAGE` to control the page size in bytes and the macro
`__HIP_NUM_PAGES` to control the total number of pages that can be
allocated.
+1
Dosyayı Görüntüle
@@ -1029,4 +1029,5 @@ static inline __device__ void* memset(void* ptr, int val, size_t size) {
unsigned char val8 = static_cast<unsigned char>(val);
return __hip_hc_memset(ptr, val8, size);
}
#endif
+102
Dosyayı Görüntüle
@@ -0,0 +1,102 @@
/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H
// Implementation of malloc and free device functions.
// HIP heap is implemented as a global array with fixed size. Users may define
// __HIP_SIZE_OF_PAGE and __HIP_NUM_PAGES to have a larger heap.

// Size of page in bytes. Only defined here when the user has not already
// defined it.
#ifndef __HIP_SIZE_OF_PAGE
#define __HIP_SIZE_OF_PAGE 64
#endif

// Total number of pages. Only defined here when the user has not already
// defined it.
#ifndef __HIP_NUM_PAGES
#define __HIP_NUM_PAGES (16 * 64 * 64)
#endif

// Total heap size in bytes. Each operand is parenthesized so that a
// user-supplied definition written as an unparenthesized expression
// (for example 1024 + 1024) still yields the intended product.
#define __HIP_SIZE_OF_HEAP ((__HIP_NUM_PAGES) * (__HIP_SIZE_OF_PAGE))

// Backing storage handed out by __hip_malloc.
// NOTE(review): declared weak, presumably so that the copies emitted by every
// translation unit including this header are merged into one - confirm.
__attribute__((weak)) __device__ char __hip_device_heap[__HIP_SIZE_OF_HEAP];

// One flag per heap page, as used by __hip_malloc/__hip_free:
//   0 - page is free,
//   1 - page belongs to an allocation that continues on the next page,
//   2 - page is the last page of an allocation.
__attribute__((weak)) __device__
uint32_t __hip_device_page_flag[__HIP_NUM_PAGES];
// Device-side malloc backed by the fixed-size __hip_device_heap.
//
// The page table (__hip_device_page_flag) is divided evenly between the work
// items of the current launch. A work item only searches and marks pages
// inside its own slice, so calls from work items with distinct indices never
// touch the same flags. Within the slice the first run of free pages that is
// large enough is used (first fit).
//
// size    Requested size in bytes; rounded up to a whole number of pages.
// returns A page-aligned pointer into the heap, or nullptr when size is 0,
//         when it exceeds the heap or the calling work item's slice, or when
//         the slice has no sufficiently long run of free pages.
//
// NOTE(review): the work-item index is derived from the x dimension only while
// the work-item count uses all three dimensions, so work items of a 2-D/3-D
// launch that share an x index also share a slice - confirm whether such
// launches need to be supported.
extern "C" inline __device__ void* __hip_malloc(size_t size) {
    if (size == 0 || size > __HIP_SIZE_OF_HEAP) {
        return (void*)nullptr;
    }

    uint32_t totalThreads =
        hipBlockDim_x * hipGridDim_x * hipBlockDim_y
        * hipGridDim_y * hipBlockDim_z * hipGridDim_z;
    uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x;

    // Number of pages each work item may use; 0 when there are more work
    // items than pages, in which case every request is rejected below.
    uint32_t pagesPerWorkItem = (__HIP_NUM_PAGES) / totalThreads;

    // Round up so that a request smaller than one page, or one that is not a
    // multiple of the page size, still gets enough whole pages.
    uint32_t numPages =
        (uint32_t)((size + (__HIP_SIZE_OF_PAGE) - 1) / (__HIP_SIZE_OF_PAGE));
    if (numPages > pagesPerWorkItem) {
        return (void*)nullptr;
    }

    uint32_t sliceBegin = pagesPerWorkItem * currentWorkItem;
    uint32_t sliceEnd = sliceBegin + pagesPerWorkItem;

    uint32_t first = sliceBegin;
    while (first + numPages <= sliceEnd) {
        // Count how many consecutive pages starting at `first` are free.
        uint32_t run = 0;
        while (run < numPages && __hip_device_page_flag[first + run] == 0) {
            run++;
        }

        if (run == numPages) {
            // Mark every page but the last with 1 and the last with 2 so that
            // __hip_free can find the end of the allocation.
            for (uint32_t i = 0; i + 1 < numPages; i++) {
                __hip_device_page_flag[first + i] = 1;
            }
            __hip_device_page_flag[first + numPages - 1] = 2;

            // The byte offset is derived from the page index so that it always
            // matches the flag position __hip_free computes from the pointer.
            return (void*)((char*)__hip_device_heap
                           + (size_t)first * (__HIP_SIZE_OF_PAGE));
        }

        // Page first + run is occupied; resume the search just after it.
        first += run + 1;
    }

    return (void*)nullptr;
}
// Device-side free for memory obtained from __hip_malloc.
//
// Clears the page flags starting at the page that contains ptr and continuing
// up to and including the page marked 2 (last page of an allocation), or until
// a free page or the end of the page table is reached.
//
// ptr     Pointer to release. nullptr and pointers that do not lie inside
//         __hip_device_heap are ignored.
// returns Always nullptr.
extern "C" inline __device__ void* __hip_free(void* ptr) {
    if (ptr == nullptr) {
        return nullptr;
    }

    uint64_t heapBase = (uint64_t)__hip_device_heap;
    uint64_t address = (uint64_t)ptr;

    // Reject pointers outside the heap; indexing the flag array with their
    // offset would read and write out of bounds.
    if (address < heapBase
        || address - heapBase >= (uint64_t)(__HIP_SIZE_OF_HEAP)) {
        return nullptr;
    }

    uint32_t offsetPage =
        (uint32_t)((address - heapBase) / (__HIP_SIZE_OF_PAGE));

    // The index bound guards against walking past the end of the table when
    // no terminating flag (2) or free page (0) is found.
    while (offsetPage < (__HIP_NUM_PAGES)
           && __hip_device_page_flag[offsetPage] != 0) {
        bool lastPage = (__hip_device_page_flag[offsetPage] == 2);
        __hip_device_page_flag[offsetPage] = 0;
        offsetPage++;
        if (lastPage) {
            break;
        }
    }
    return nullptr;
}
#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H
+8 -5
Dosyayı Görüntüle
@@ -260,11 +260,11 @@ static constexpr Coordinates<hc_get_workitem_id> threadIdx;
#endif // defined __HCC__
#if __HCC_OR_HIP_CLANG__
// Device-side allocator entry points.
// NOTE(review): this listing contains two sets of declarations and wrappers -
// one forwarding to __hip_hc_malloc/__hip_hc_free and one forwarding to
// __hip_malloc/__hip_free. Presumably the __hip_hc_* lines are the pre-change
// side of the diff and only the __hip_* lines remain in the header - confirm
// against the actual file, since both wrapper pairs cannot coexist.
extern "C" __device__ void* __hip_hc_malloc(size_t);
extern "C" __device__ void* __hip_hc_free(void* ptr);
extern "C" __device__ void* __hip_malloc(size_t);
extern "C" __device__ void* __hip_free(void* ptr);
// malloc/free callable from device code; each wrapper only forwards its
// argument to the allocator function declared above and returns its result.
// Note that free returns void* here (the value returned by the forwarded
// call) rather than void.
static inline __device__ void* malloc(size_t size) { return __hip_hc_malloc(size); }
static inline __device__ void* free(void* ptr) { return __hip_hc_free(ptr); }
static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
static inline __device__ void* free(void* ptr) { return __hip_free(ptr); }
#ifdef __HCC_ACCELERATOR__
@@ -438,6 +438,8 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
#define hipGridDim_y gridDim.y
#define hipGridDim_z gridDim.z
#include <hip/hcc_detail/math_functions.h>
// Support std::complex.
#pragma push_macro("__CUDA__")
#define __CUDA__
@@ -448,8 +450,9 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
#undef __CUDA__
#pragma pop_macro("__CUDA__")
#include <hip/hcc_detail/math_functions.h>
#endif
#include <hip/hcc_detail/hip_memory.h>
#endif // HIP_HCC_DETAIL_RUNTIME_H
-64
Dosyayı Görüntüle
@@ -28,70 +28,6 @@ THE SOFTWARE.
#include "hip/hip_runtime.h"
#include <atomic>
//=================================================================================================
/*
Implementation of malloc and free device functions.
This is the best place to put them because the device
global variables need to be initialized at the start.
*/
// Backing storage for the device heap and one flag per heap page
// (0 = free, 1 = allocated with more pages following, 2 = last page of an
// allocation).
__device__ char gpuHeap[SIZE_OF_HEAP];
__device__ uint32_t gpuFlags[NUM_PAGES];

// Device-side malloc backed by gpuHeap.
//
// The page table is divided evenly between the work items of the current
// launch; a work item only searches and marks pages inside its own slice and
// takes the first run of free pages that is large enough.
//
// size    Requested size in bytes; rounded up to a whole number of pages.
// returns A page-aligned pointer into gpuHeap, or nullptr when size is 0,
//         when it exceeds the heap or the calling work item's slice, or when
//         the slice has no sufficiently long run of free pages.
//
// NOTE(review): the work-item index uses the x dimension only while the
// work-item count uses all three dimensions - confirm whether 2-D/3-D
// launches need to be supported.
__device__ void* __hip_hc_malloc(size_t size) {
    if (size == 0 || size > (SIZE_OF_HEAP)) {
        return (void*)nullptr;
    }

    uint32_t totalThreads =
        blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z;
    uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x;

    // 0 when there are more work items than pages; every request is then
    // rejected by the size check below.
    uint32_t pagesPerWorkItem = (NUM_PAGES) / totalThreads;

    // Round up so that sizes below one page, or not a multiple of the page
    // size, still get enough whole pages.
    uint32_t numPages = (uint32_t)((size + (SIZE_OF_PAGE) - 1) / (SIZE_OF_PAGE));
    if (numPages > pagesPerWorkItem) {
        return (void*)nullptr;
    }

    uint32_t sliceBegin = pagesPerWorkItem * currentWorkItem;
    uint32_t sliceEnd = sliceBegin + pagesPerWorkItem;

    uint32_t first = sliceBegin;
    while (first + numPages <= sliceEnd) {
        // Count how many consecutive pages starting at `first` are free.
        uint32_t run = 0;
        while (run < numPages && gpuFlags[first + run] == 0) {
            run++;
        }

        if (run == numPages) {
            for (uint32_t i = 0; i + 1 < numPages; i++) {
                gpuFlags[first + i] = 1;
            }
            gpuFlags[first + numPages - 1] = 2;

            // Offset derived from the page index so that it matches the flag
            // position __hip_hc_free computes from the pointer.
            return (void*)((char*)gpuHeap + (size_t)first * (SIZE_OF_PAGE));
        }

        // Page first + run is occupied; resume the search just after it.
        first += run + 1;
    }

    return (void*)nullptr;
}
// Device-side free for memory obtained from __hip_hc_malloc.
//
// Clears the page flags from the page containing ptr up to and including the
// page marked 2, or until a free page or the end of the page table is reached.
//
// ptr     Pointer to release. nullptr and pointers outside gpuHeap are ignored.
// returns Always nullptr.
__device__ void* __hip_hc_free(void* ptr) {
    if (ptr == nullptr) {
        return nullptr;
    }

    uint64_t heapBase = (uint64_t)gpuHeap;
    uint64_t address = (uint64_t)ptr;

    // Reject pointers outside the heap; their offset would index the flag
    // array out of bounds.
    if (address < heapBase || address - heapBase >= (uint64_t)(SIZE_OF_HEAP)) {
        return nullptr;
    }

    uint32_t offsetPage = (uint32_t)((address - heapBase) / (SIZE_OF_PAGE));

    // The index bound stops the walk at the end of the table when no
    // terminating flag (2) or free page (0) is found.
    while (offsetPage < (NUM_PAGES) && gpuFlags[offsetPage] != 0) {
        bool lastPage = (gpuFlags[offsetPage] == 2);
        gpuFlags[offsetPage] = 0;
        offsetPage++;
        if (lastPage) {
            break;
        }
    }
    return nullptr;
}
// Device-side abort: forwards to hc::abort().
__device__ void abort() {
    hc::abort();
}
-11
Dosyayı Görüntüle
@@ -29,14 +29,6 @@ THE SOFTWARE.
Heap size computation for malloc and free device functions.
*/
// Heap geometry for device-side malloc/free.
#define NUM_PAGES_PER_THREAD 16
#define SIZE_OF_PAGE 64
#define NUM_THREADS_PER_CU 64
#define NUM_CUS_PER_GPU 64  // Specific for r9 Nano
// Derived values are parenthesized so that they behave as a single operand
// wherever they are expanded (e.g. x % NUM_PAGES, y / SIZE_OF_HEAP).
#define NUM_PAGES (NUM_PAGES_PER_THREAD * NUM_THREADS_PER_CU * NUM_CUS_PER_GPU)
#define SIZE_MALLOC (NUM_PAGES * SIZE_OF_PAGE)
#define SIZE_OF_HEAP SIZE_MALLOC
#define HIP_SQRT_2 1.41421356237
#define HIP_SQRT_PI 1.77245385091
@@ -62,9 +54,6 @@ THE SOFTWARE.
#define HIP_PI 3.14159265358979323846
__device__ void* __hip_hc_malloc(size_t size);
__device__ void* __hip_hc_free(void* ptr);
__device__ float __hip_erfinvf(float x);
__device__ double __hip_erfinv(double x);
+1 -1
Dosyayı Görüntüle
@@ -99,7 +99,7 @@ hipError_t hipDeviceGetLimit(size_t* pValue, hipLimit_t limit) {
return ihipLogStatus(hipErrorInvalidValue);
}
if (limit == hipLimitMallocHeapSize) {
*pValue = (size_t)SIZE_OF_HEAP;
*pValue = (size_t)__HIP_SIZE_OF_HEAP;
return ihipLogStatus(hipSuccess);
} else {
return ihipLogStatus(hipErrorUnsupportedLimit);
+190
Dosyayı Görüntüle
@@ -0,0 +1,190 @@
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s NVCC_OPTIONS -std=c++11
* RUN: %t EXCLUDE_HIP_PLATFORM nvcc
* HIT_END
*/
#include "test_common.h"
#include <complex>
#include <cstdlib>
#include <iostream>
// Tolerance for error. The check in test() compares the absolute error against
// it, or the relative error when the magnitude of the expected value exceeds
// this tolerance.
const double tolerance = 1e-6;
// When true, test() prints every comparison instead of only the failing ones.
const bool verbose = false;
// Number of complex elements in each array; test() also uses it as the number
// of threads in each kernel launch.
#define LEN 64
// X-macro listing the tested operations. Each user defines OP(x) before
// expanding ALL_FUN and undefines it afterwards.
#define ALL_FUN \
OP(add) \
OP(sub) \
OP(mul) \
OP(div)
// Expands ALL_FUN into the enumerators CK_add, CK_sub, CK_mul and CK_div.
#define OP(x) CK_##x,
enum CalcKind {
ALL_FUN
};
#undef OP
// Returns the operation name for a calculation kind ("add" for CK_add, and so
// on for every entry of ALL_FUN), or "unknown" for a value that is not one of
// the enumerators.
#define OP(x) case CK_##x: return #x;
std::string getName(enum CalcKind CK) {
    switch (CK) {
        ALL_FUN
    }
    // Only reached for an out-of-range value; without this return, control
    // would fall off the end of a non-void function.
    return "unknown";
}
#undef OP
// Helper for one-argument functions: expands to a case that applies std::func
// to A and converts the result to a complex number.
// NOTE(review): not used in the visible part of this file.
#define ONE_ARG(func) \
    case CK_##func: \
        return std::complex<FloatT>(std::func(A));

// Applies the binary operation selected by CK to A and B.
//
// A, B    Operands.
// CK      Operation to perform (CK_add, CK_sub, CK_mul or CK_div).
// returns The result of the operation, or a value-initialized complex number
//         when CK is not one of the handled kinds.
template<typename FloatT>
__device__ __host__ std::complex<FloatT> calc(std::complex<FloatT> A,
                                              std::complex<FloatT> B,
                                              enum CalcKind CK) {
    switch (CK) {
        case CK_add:
            return A + B;
        case CK_sub:
            return A - B;
        case CK_mul:
            return A * B;
        case CK_div:
            return A / B;
    }
    // Only reached for an out-of-range CK; without this return, control would
    // fall off the end of a non-void function.
    return std::complex<FloatT>();
}
// Device-side allocation kernel.
// The thread whose global x index is 0 allocates two arrays of LEN complex
// values with malloc, publishes their addresses through pA and pB, and fills
// them with the contents of A and B. All other threads do nothing.
template<typename FloatT>
__global__ void kernel_alloc(std::complex<FloatT>* A,
                             std::complex<FloatT>* B,
                             std::complex<FloatT>** pA,
                             std::complex<FloatT>** pB) {
    typedef std::complex<FloatT> CFloatT;

    const int globalIdx = threadIdx.x + blockIdx.x * blockDim.x;
    if (globalIdx != 0) {
        return;
    }

    // Allocate and publish both buffers before copying, in the same order as
    // the two malloc calls are issued.
    CFloatT* bufA = (CFloatT*)malloc(sizeof(CFloatT) * LEN);
    *pA = bufA;
    CFloatT* bufB = (CFloatT*)malloc(sizeof(CFloatT) * LEN);
    *pB = bufB;

    for (int idx = 0; idx < LEN; ++idx) {
        bufA[idx] = A[idx];
        bufB[idx] = B[idx];
    }
}
// Do calculation using values saved in allocated memory. pA, pB are buffers
// containing the address of the device-side allocated array.
//
// Every thread computes one element of C from the corresponding elements of
// the device-allocated arrays; afterwards the thread with global x index 0
// frees both arrays.
//
// Launch requirement: a single block. The barrier before the free is
// block-wide, so it only orders the reads of all threads against the free
// when the whole grid is one block.
template<typename FloatT>
__global__ void kernel_free(std::complex<FloatT>** pA,
                            std::complex<FloatT>** pB, std::complex<FloatT>* C,
                            enum CalcKind CK) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    C[tx] = calc<FloatT>((*pA)[tx], (*pB)[tx], CK);

    // Make sure every thread of the block has finished reading the buffers
    // before thread 0 releases them. Placed outside the branch so that all
    // threads reach the barrier.
    __syncthreads();

    if (tx == 0) {
        free(*pA);
        free(*pB);
    }
}
// Runs the device-side malloc/free test for one floating-point type.
//
// For every operation in ALL_FUN: kernel_alloc allocates two arrays on the
// device and fills them from Ad/Bd, kernel_free computes the operation from
// those arrays into Cd and frees them, and the result is compared with the
// same calculation done on the host.
//
// All operations are run even if one fails; after the buffers have been
// released the process exits with EXIT_FAILURE if any comparison failed. The
// checks do not depend on assert, so they are still executed when NDEBUG is
// defined.
//
// NOTE(review): the return values of the hip* calls are not checked - confirm
// whether test_common.h offers a helper that should be used for this.
template<typename FloatT>
void test() {
    typedef std::complex<FloatT> ComplexT;
    ComplexT *A, *Ad, *B, *Bd, *C, *Cd;
    A = new ComplexT[LEN];
    B = new ComplexT[LEN];
    C = new ComplexT[LEN];
    hipMalloc((void**)&Ad, sizeof(ComplexT)*LEN);
    hipMalloc((void**)&Bd, sizeof(ComplexT)*LEN);
    hipMalloc((void**)&Cd, sizeof(ComplexT)*LEN);

    for (uint32_t i = 0; i < LEN; i++) {
        A[i] = ComplexT((i + 1) * 1.0f, (i + 2) * 1.0f);
        B[i] = A[i];
        C[i] = A[i];
    }
    hipMemcpy(Ad, A, sizeof(ComplexT)*LEN, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, sizeof(ComplexT)*LEN, hipMemcpyHostToDevice);

    // Run kernel for a calculation kind and verify by comparing with host
    // calculation result. Returns false if fails.
    auto test_fun = [&](enum CalcKind CK) {
        // kernel_alloc allocates memory on device side and initializes it.
        // kernel_free uses allocated memory from kernel_alloc and does the
        // calculation then frees the memory.
        // pA and pB are buffers to pass the device-side allocated memory
        // address from kernel_alloc to kernel_free.
        ComplexT **pA, **pB;
        hipMalloc((ComplexT***)&pA, sizeof(ComplexT*));
        hipMalloc((ComplexT***)&pB, sizeof(ComplexT*));
        hipLaunchKernelGGL(kernel_alloc<FloatT>, dim3(1), dim3(LEN), 0, 0,
                           Ad, Bd, pA, pB);
        hipDeviceSynchronize();
        hipLaunchKernelGGL(kernel_free<FloatT>, dim3(1), dim3(LEN), 0, 0,
                           pA, pB, Cd, CK);
        hipMemcpy(C, Cd, sizeof(ComplexT)*LEN, hipMemcpyDeviceToHost);
        hipFree(pA);
        hipFree(pB);

        for (int i = 0; i < LEN; i++) {
            ComplexT Expected = calc(A[i], B[i], CK);
            FloatT error = std::abs(C[i] - Expected);
            // Use the relative error unless the expected value is too close
            // to zero to divide by.
            if (std::abs(Expected) > tolerance)
                error /= std::abs(Expected);
            bool pass = error < tolerance;
            if (verbose || !pass) {
                std::cout << "Function: " << getName(CK)
                          << " Operands: " << A[i] << " " << B[i]
                          << " Result: " << C[i]
                          << " Expected: " << Expected
                          << " Error: " << error
                          << " Pass: " << pass
                          << std::endl;
            }
            if (!pass)
                return false;
        }
        return true;
    };

    // The call is evaluated unconditionally (not inside assert) so that the
    // kernels still run when NDEBUG is defined.
    bool allPassed = true;
#define OP(x) allPassed = test_fun(CK_##x) && allPassed;
    ALL_FUN
#undef OP

    hipFree(Ad);
    hipFree(Bd);
    hipFree(Cd);
    delete[] A;
    delete[] B;
    delete[] C;

    if (!allPassed) {
        std::cerr << "Device-side malloc/free test failed" << std::endl;
        std::exit(EXIT_FAILURE);
    }
}
// Test entry point: runs the device-side malloc/free test for single and
// double precision, then reports success.
int main() {
test<float>();
test<double>();
// NOTE(review): passed() is not defined in this file; presumably it comes from
// test_common.h and reports the test as passed - confirm.
passed();
return 0;
}