Support malloc/free for hip-clang
Bu işleme şunda yer alıyor:
@@ -91,8 +91,22 @@ Setting HCC_UNPINNED_COPY_MODE = 3, forces all unpinned transfer to use direct m
|
||||
|
||||
Following environment variables can be used to control the transfer thresholds:
|
||||
|
||||
- HCC_H2D_STAGING_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. By default it is set to 64.
|
||||
- HCC_H2D_STAGING_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. By default it is set to 64.
|
||||
|
||||
- HCC_H2D_PININPLACE_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic. By default it is set to 4096.
|
||||
|
||||
- HCC_D2H_PININPLACE_THRESHOLD - Threshold in KB for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic. By default it is set to 1024.
|
||||
|
||||
## Device-Side Malloc
|
||||
|
||||
hip-hcc and hip-clang supports device-side malloc and free. Users can allocate
|
||||
memory dynamically in a kernel. The allocated memory are in global address
|
||||
space, however, different threads get different memory allocations for the same
|
||||
call of malloc. The allocated memory can be accessed or freed by other threads
|
||||
or other kernels. It persists in the life time of the HIP program until it is
|
||||
freed.
|
||||
|
||||
The memory are allocated in pages. Users can define macro
|
||||
`__HIP_SIZE_OF_PAGE` for controlling the page size in bytes and macro
|
||||
`__HIP_NUM_PAGES` for controlling the total number of pages that can be
|
||||
allocated.
|
||||
@@ -1029,4 +1029,5 @@ static inline __device__ void* memset(void* ptr, int val, size_t size) {
|
||||
unsigned char val8 = static_cast<unsigned char>(val);
|
||||
return __hip_hc_memset(ptr, val8, size);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,102 @@
|
||||
/*
|
||||
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H
|
||||
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H
|
||||
|
||||
// Implementation of malloc and free device functions.
|
||||
// HIP heap is implemented as a global array with fixed size. Users may define
|
||||
// __HIP_SIZE_OF_PAGE and __HIP_NUM_PAGES to have a larger heap.
|
||||
|
||||
// Size of page in bytes.
|
||||
#ifndef __HIP_SIZE_OF_PAGE
|
||||
#define __HIP_SIZE_OF_PAGE 64
|
||||
#endif
|
||||
|
||||
// Total number of pages
|
||||
#ifndef __HIP_NUM_PAGES
|
||||
#define __HIP_NUM_PAGES (16 * 64 * 64)
|
||||
#endif
|
||||
|
||||
#define __HIP_SIZE_OF_HEAP (__HIP_NUM_PAGES * __HIP_SIZE_OF_PAGE)
|
||||
|
||||
__attribute__((weak)) __device__ char __hip_device_heap[__HIP_SIZE_OF_HEAP];
|
||||
__attribute__((weak)) __device__
|
||||
uint32_t __hip_device_page_flag[__HIP_NUM_PAGES];
|
||||
|
||||
extern "C" inline __device__ void* __hip_malloc(size_t size) {
|
||||
char* heap = (char*)__hip_device_heap;
|
||||
if (size > __HIP_SIZE_OF_HEAP) {
|
||||
return (void*)nullptr;
|
||||
}
|
||||
uint32_t totalThreads =
|
||||
hipBlockDim_x * hipGridDim_x * hipBlockDim_y
|
||||
* hipGridDim_y * hipBlockDim_z * hipGridDim_z;
|
||||
uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x;
|
||||
|
||||
uint32_t numHeapsPerWorkItem = __HIP_NUM_PAGES / totalThreads;
|
||||
uint32_t heapSizePerWorkItem = __HIP_SIZE_OF_HEAP / totalThreads;
|
||||
|
||||
uint32_t stride = size / __HIP_SIZE_OF_PAGE;
|
||||
uint32_t start = numHeapsPerWorkItem * currentWorkItem;
|
||||
|
||||
uint32_t k = 0;
|
||||
|
||||
while (__hip_device_page_flag[k] > 0) {
|
||||
k++;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < stride - 1; i++) {
|
||||
__hip_device_page_flag[i + start + k] = 1;
|
||||
}
|
||||
|
||||
__hip_device_page_flag[start + stride - 1 + k] = 2;
|
||||
|
||||
void* ptr = (void*)(heap
|
||||
+ heapSizePerWorkItem * currentWorkItem + k * __HIP_SIZE_OF_PAGE);
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
extern "C" inline __device__ void* __hip_free(void* ptr) {
|
||||
if (ptr == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
uint32_t offsetByte = (uint64_t)ptr - (uint64_t)__hip_device_heap;
|
||||
uint32_t offsetPage = offsetByte / __HIP_SIZE_OF_PAGE;
|
||||
|
||||
while (__hip_device_page_flag[offsetPage] != 0) {
|
||||
if (__hip_device_page_flag[offsetPage] == 2) {
|
||||
__hip_device_page_flag[offsetPage] = 0;
|
||||
offsetPage++;
|
||||
break;
|
||||
} else {
|
||||
__hip_device_page_flag[offsetPage] = 0;
|
||||
offsetPage++;
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H
|
||||
@@ -260,11 +260,11 @@ static constexpr Coordinates<hc_get_workitem_id> threadIdx;
|
||||
|
||||
#endif // defined __HCC__
|
||||
#if __HCC_OR_HIP_CLANG__
|
||||
extern "C" __device__ void* __hip_hc_malloc(size_t);
|
||||
extern "C" __device__ void* __hip_hc_free(void* ptr);
|
||||
extern "C" __device__ void* __hip_malloc(size_t);
|
||||
extern "C" __device__ void* __hip_free(void* ptr);
|
||||
|
||||
static inline __device__ void* malloc(size_t size) { return __hip_hc_malloc(size); }
|
||||
static inline __device__ void* free(void* ptr) { return __hip_hc_free(ptr); }
|
||||
static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
|
||||
static inline __device__ void* free(void* ptr) { return __hip_free(ptr); }
|
||||
|
||||
#ifdef __HCC_ACCELERATOR__
|
||||
|
||||
@@ -438,6 +438,8 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
|
||||
#define hipGridDim_y gridDim.y
|
||||
#define hipGridDim_z gridDim.z
|
||||
|
||||
#include <hip/hcc_detail/math_functions.h>
|
||||
|
||||
// Support std::complex.
|
||||
#pragma push_macro("__CUDA__")
|
||||
#define __CUDA__
|
||||
@@ -448,8 +450,9 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
|
||||
#undef __CUDA__
|
||||
#pragma pop_macro("__CUDA__")
|
||||
|
||||
#include <hip/hcc_detail/math_functions.h>
|
||||
|
||||
#endif
|
||||
|
||||
#include <hip/hcc_detail/hip_memory.h>
|
||||
|
||||
#endif // HIP_HCC_DETAIL_RUNTIME_H
|
||||
|
||||
@@ -28,70 +28,6 @@ THE SOFTWARE.
|
||||
#include "hip/hip_runtime.h"
|
||||
#include <atomic>
|
||||
|
||||
//=================================================================================================
|
||||
/*
|
||||
Implementation of malloc and free device functions.
|
||||
|
||||
This is the best place to put them because the device
|
||||
global variables need to be initialized at the start.
|
||||
*/
|
||||
__device__ char gpuHeap[SIZE_OF_HEAP];
|
||||
__device__ uint32_t gpuFlags[NUM_PAGES];
|
||||
|
||||
__device__ void* __hip_hc_malloc(size_t size) {
|
||||
char* heap = (char*)gpuHeap;
|
||||
if (size > SIZE_OF_HEAP) {
|
||||
return (void*)nullptr;
|
||||
}
|
||||
uint32_t totalThreads =
|
||||
blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z;
|
||||
uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
|
||||
uint32_t numHeapsPerWorkItem = NUM_PAGES / totalThreads;
|
||||
uint32_t heapSizePerWorkItem = SIZE_OF_HEAP / totalThreads;
|
||||
|
||||
uint32_t stride = size / SIZE_OF_PAGE;
|
||||
uint32_t start = numHeapsPerWorkItem * currentWorkItem;
|
||||
|
||||
uint32_t k = 0;
|
||||
|
||||
while (gpuFlags[k] > 0) {
|
||||
k++;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < stride - 1; i++) {
|
||||
gpuFlags[i + start + k] = 1;
|
||||
}
|
||||
|
||||
gpuFlags[start + stride - 1 + k] = 2;
|
||||
|
||||
void* ptr = (void*)(heap + heapSizePerWorkItem * currentWorkItem + k * SIZE_OF_PAGE);
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
__device__ void* __hip_hc_free(void* ptr) {
|
||||
if (ptr == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
uint32_t offsetByte = (uint64_t)ptr - (uint64_t)gpuHeap;
|
||||
uint32_t offsetPage = offsetByte / SIZE_OF_PAGE;
|
||||
|
||||
while (gpuFlags[offsetPage] != 0) {
|
||||
if (gpuFlags[offsetPage] == 2) {
|
||||
gpuFlags[offsetPage] = 0;
|
||||
offsetPage++;
|
||||
break;
|
||||
} else {
|
||||
gpuFlags[offsetPage] = 0;
|
||||
offsetPage++;
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// abort
|
||||
__device__ void abort() { return hc::abort(); }
|
||||
|
||||
|
||||
@@ -29,14 +29,6 @@ THE SOFTWARE.
|
||||
Heap size computation for malloc and free device functions.
|
||||
*/
|
||||
|
||||
#define NUM_PAGES_PER_THREAD 16
|
||||
#define SIZE_OF_PAGE 64
|
||||
#define NUM_THREADS_PER_CU 64
|
||||
#define NUM_CUS_PER_GPU 64 // Specific for r9 Nano
|
||||
#define NUM_PAGES NUM_PAGES_PER_THREAD* NUM_THREADS_PER_CU* NUM_CUS_PER_GPU
|
||||
#define SIZE_MALLOC NUM_PAGES* SIZE_OF_PAGE
|
||||
#define SIZE_OF_HEAP SIZE_MALLOC
|
||||
|
||||
#define HIP_SQRT_2 1.41421356237
|
||||
#define HIP_SQRT_PI 1.77245385091
|
||||
|
||||
@@ -62,9 +54,6 @@ THE SOFTWARE.
|
||||
|
||||
#define HIP_PI 3.14159265358979323846
|
||||
|
||||
__device__ void* __hip_hc_malloc(size_t size);
|
||||
__device__ void* __hip_hc_free(void* ptr);
|
||||
|
||||
__device__ float __hip_erfinvf(float x);
|
||||
__device__ double __hip_erfinv(double x);
|
||||
|
||||
|
||||
@@ -99,7 +99,7 @@ hipError_t hipDeviceGetLimit(size_t* pValue, hipLimit_t limit) {
|
||||
return ihipLogStatus(hipErrorInvalidValue);
|
||||
}
|
||||
if (limit == hipLimitMallocHeapSize) {
|
||||
*pValue = (size_t)SIZE_OF_HEAP;
|
||||
*pValue = (size_t)__HIP_SIZE_OF_HEAP;
|
||||
return ihipLogStatus(hipSuccess);
|
||||
} else {
|
||||
return ihipLogStatus(hipErrorUnsupportedLimit);
|
||||
|
||||
@@ -0,0 +1,190 @@
|
||||
/*
|
||||
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
/* HIT_START
|
||||
* BUILD: %t %s NVCC_OPTIONS -std=c++11
|
||||
* RUN: %t EXCLUDE_HIP_PLATFORM nvcc
|
||||
* HIT_END
|
||||
*/
|
||||
#include "test_common.h"
|
||||
#include <iostream>
|
||||
#include <complex>
|
||||
|
||||
// Tolerance for error
|
||||
const double tolerance = 1e-6;
|
||||
const bool verbose = false;
|
||||
|
||||
#define LEN 64
|
||||
|
||||
#define ALL_FUN \
|
||||
OP(add) \
|
||||
OP(sub) \
|
||||
OP(mul) \
|
||||
OP(div)
|
||||
|
||||
#define OP(x) CK_##x,
|
||||
enum CalcKind {
|
||||
ALL_FUN
|
||||
};
|
||||
#undef OP
|
||||
|
||||
#define OP(x) case CK_##x: return #x;
|
||||
std::string getName(enum CalcKind CK) {
|
||||
switch(CK){
|
||||
ALL_FUN
|
||||
}
|
||||
}
|
||||
#undef OP
|
||||
|
||||
// Calculates function.
|
||||
// If the function has one argument, B is ignored.
|
||||
// If the function returns real number, converts it to a complex number.
|
||||
#define ONE_ARG(func) \
|
||||
case CK_##func: \
|
||||
return std::complex<FloatT>(std::func(A));
|
||||
|
||||
template<typename FloatT>
|
||||
__device__ __host__ std::complex<FloatT> calc(std::complex<FloatT> A,
|
||||
std::complex<FloatT> B,
|
||||
enum CalcKind CK) {
|
||||
switch(CK) {
|
||||
case CK_add:
|
||||
return A + B;
|
||||
case CK_sub:
|
||||
return A - B;
|
||||
case CK_mul:
|
||||
return A * B;
|
||||
case CK_div:
|
||||
return A / B;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// Allocate memory in kernel and save the address to pA and pB.
|
||||
// Copy value from A, B to allocated memory.
|
||||
template<typename FloatT>
|
||||
__global__ void kernel_alloc(std::complex<FloatT>* A,
|
||||
std::complex<FloatT>* B,
|
||||
std::complex<FloatT>** pA,
|
||||
std::complex<FloatT>** pB) {
|
||||
typedef std::complex<FloatT> CFloatT;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (tx == 0) {
|
||||
*pA = (CFloatT*)malloc(sizeof(CFloatT)*LEN);
|
||||
*pB = (CFloatT*)malloc(sizeof(CFloatT)*LEN);
|
||||
for (int i = 0; i < LEN; i++) {
|
||||
(*pA)[i] = A[i];
|
||||
(*pB)[i] = B[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Do calculation using values saved in allocated memmory. pA, pB are buffers
|
||||
// containing the address of the device-side allocated array.
|
||||
template<typename FloatT>
|
||||
__global__ void kernel_free(std::complex<FloatT>** pA,
|
||||
std::complex<FloatT>** pB, std::complex<FloatT>* C,
|
||||
enum CalcKind CK) {
|
||||
typedef std::complex<FloatT> CFloatT;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
C[tx] = calc<FloatT>((*pA)[tx], (*pB)[tx], CK);
|
||||
if (tx == 0) {
|
||||
free(*pA);
|
||||
free(*pB);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename FloatT>
|
||||
void test() {
|
||||
typedef std::complex<FloatT> ComplexT;
|
||||
ComplexT *A, *Ad, *B, *Bd, *C, *Cd, *D;
|
||||
A = new ComplexT[LEN];
|
||||
B = new ComplexT[LEN];
|
||||
C = new ComplexT[LEN];
|
||||
D = new ComplexT[LEN];
|
||||
hipMalloc((void**)&Ad, sizeof(ComplexT)*LEN);
|
||||
hipMalloc((void**)&Bd, sizeof(ComplexT)*LEN);
|
||||
hipMalloc((void**)&Cd, sizeof(ComplexT)*LEN);
|
||||
|
||||
for (uint32_t i = 0; i < LEN; i++) {
|
||||
A[i] = ComplexT((i + 1) * 1.0f, (i + 2) * 1.0f);
|
||||
B[i] = A[i];
|
||||
C[i] = A[i];
|
||||
}
|
||||
hipMemcpy(Ad, A, sizeof(ComplexT)*LEN, hipMemcpyHostToDevice);
|
||||
hipMemcpy(Bd, B, sizeof(ComplexT)*LEN, hipMemcpyHostToDevice);
|
||||
|
||||
// Run kernel for a calculation kind and verify by comparing with host
|
||||
// calculation result. Returns false if fails.
|
||||
auto test_fun = [&](enum CalcKind CK) {
|
||||
// kernel_alloc allocates memory on device side and initialize it.
|
||||
// kernel_free uses allocated memory from kernel_alloc and does the
|
||||
// calculation then free the memory.
|
||||
// pA and pB are buffers to pass the device-side allocated memory address
|
||||
// from kernel_alloc to kernel_free.
|
||||
ComplexT **pA, **pB;
|
||||
hipMalloc((ComplexT***)&pA, sizeof(ComplexT*));
|
||||
hipMalloc((ComplexT***)&pB, sizeof(ComplexT*));
|
||||
hipLaunchKernelGGL(kernel_alloc<FloatT>, dim3(1), dim3(LEN), 0, 0,
|
||||
Ad, Bd, pA, pB);
|
||||
hipDeviceSynchronize();
|
||||
hipLaunchKernelGGL(kernel_free<FloatT>, dim3(1), dim3(LEN), 0, 0,
|
||||
pA, pB, Cd, CK);
|
||||
hipMemcpy(C, Cd, sizeof(ComplexT)*LEN, hipMemcpyDeviceToHost);
|
||||
hipFree(pA);
|
||||
hipFree(pB);
|
||||
for (int i = 0; i < LEN; i++) {
|
||||
ComplexT Expected = calc(A[i], B[i], CK);
|
||||
FloatT error = std::abs(C[i] - Expected);
|
||||
if (std::abs(Expected) > tolerance)
|
||||
error /= std::abs(Expected);
|
||||
bool pass = error < tolerance;
|
||||
if (verbose || !pass) {
|
||||
std::cout << "Function: " << getName(CK)
|
||||
<< " Operands: " << A[i] << " " << B[i]
|
||||
<< " Result: " << C[i]
|
||||
<< " Expected: " << Expected
|
||||
<< " Error: " << error
|
||||
<< " Pass: " << pass
|
||||
<< std::endl;
|
||||
}
|
||||
if (!pass)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
#define OP(x) assert(test_fun(CK_##x));
|
||||
ALL_FUN
|
||||
#undef OP
|
||||
|
||||
hipFree(Ad);
|
||||
hipFree(Bd);
|
||||
hipFree(Cd);
|
||||
delete[] A;
|
||||
delete[] B;
|
||||
delete[] C;
|
||||
delete[] D;
|
||||
}
|
||||
|
||||
int main() {
|
||||
test<float>();
|
||||
test<double>();
|
||||
passed();
|
||||
return 0;
|
||||
}
|
||||
Yeni konuda referans
Bir kullanıcı engelle