added memset and memcpy device functions
- Added memcpy and memset device functions
- Added test for memcpy and memset
Change-Id: Icd21a8dd964953b86d5e92889bf1664bee647219
[ROCm/clr commit: 5e7c396bcd]
Αυτή η υποβολή περιλαμβάνεται σε:
@@ -568,6 +568,55 @@ __device__ void __threadfence_system(void);
|
||||
#define hipGridDim_y (hc_get_num_groups(1))
|
||||
#define hipGridDim_z (hc_get_num_groups(2))
|
||||
|
||||
// loop unrolling
|
||||
__device__ static inline void* memcpy(void* dst, void* src, size_t size)
|
||||
{
|
||||
uint64_t i = 0;
|
||||
uint64_t totalLength = size/sizeof(uint32_t);
|
||||
for(i=hipThreadIdx_x+hipBlockIdx_x*hipBlockDim_x;
|
||||
i<(totalLength/4);
|
||||
i = i + hipBlockDim_x * hipGridDim_x)
|
||||
{
|
||||
((uint32_t*)dst)[4*i] = ((uint32_t*)src)[4*i];
|
||||
((uint32_t*)dst)[4*i+1] = ((uint32_t*)src)[4*i+1];
|
||||
((uint32_t*)dst)[4*i+2] = ((uint32_t*)src)[4*i+2];
|
||||
((uint32_t*)dst)[4*i+3] = ((uint32_t*)src)[4*i+3];
|
||||
}
|
||||
if(4*i < totalLength){
|
||||
((uint32_t*)dst)[4*i] = ((uint32_t*)src)[4*i];
|
||||
((uint32_t*)dst)[4*i+1] = ((uint32_t*)src)[4*i+1];
|
||||
((uint32_t*)dst)[4*i+2] = ((uint32_t*)src)[4*i+2];
|
||||
((uint32_t*)dst)[4*i+3] = ((uint32_t*)src)[4*i+3];
|
||||
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
__device__ static inline void* memset(void* ptr, uint8_t val, size_t size)
|
||||
{
|
||||
uint32_t _val = 0;
|
||||
_val = (val | val << 8 | val << 16 | val << 24);
|
||||
uint64_t totalLength = size/sizeof(uint32_t);
|
||||
uint64_t i = 0;
|
||||
for(i=hipThreadIdx_x+hipBlockIdx_x*hipBlockDim_x;
|
||||
i<(totalLength/4);
|
||||
i = i + hipBlockDim_x * hipGridDim_x)
|
||||
{
|
||||
((uint32_t*)ptr)[4*i] = _val;
|
||||
((uint32_t*)ptr)[4*i+1] = _val;
|
||||
((uint32_t*)ptr)[4*i+2] = _val;
|
||||
((uint32_t*)ptr)[4*i+3] = _val;
|
||||
}
|
||||
if(4*i < totalLength){
|
||||
((uint32_t*)ptr)[4*i] = _val;
|
||||
((uint32_t*)ptr)[4*i+1] = _val;
|
||||
((uint32_t*)ptr)[4*i+2] = _val;
|
||||
((uint32_t*)ptr)[4*i+3] = _val;
|
||||
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
#define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
|
||||
|
||||
#define HIP_KERNEL_NAME(...) __VA_ARGS__
|
||||
@@ -653,4 +702,6 @@ do {\
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,42 @@
|
||||
#include<iostream>
|
||||
#include<hip/hip_runtime.h>
|
||||
#include<hip/hip_runtime_api.h>
|
||||
|
||||
#define LEN 1030
|
||||
#define SIZE LEN << 2
|
||||
|
||||
__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In, uint32_t *Vald)
|
||||
{
|
||||
memcpy(Out, In, SIZE, Vald);
|
||||
}
|
||||
|
||||
__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
|
||||
{
|
||||
memset(ptr, val, size);
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
uint32_t *A, *Ad, *B, *Bd;
|
||||
uint32_t *Val, *Vald;
|
||||
A = new uint32_t[LEN];
|
||||
B = new uint32_t[LEN];
|
||||
Val = new uint32_t;
|
||||
*Val = 0;
|
||||
for(int i=0;i<LEN;i++){
|
||||
A[i] = i *1.0f;
|
||||
B[i] = 0.0f;
|
||||
}
|
||||
hipMalloc((void**)&Ad, SIZE);
|
||||
hipMalloc((void**)&Bd, SIZE);
|
||||
hipMalloc((void**)&Vald, sizeof(uint32_t));
|
||||
hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(cpy, dim3(1), dim3(LEN/4), 0, 0, Bd, Ad, Vald);
|
||||
hipLaunchKernel(set, dim3(1), dim3(LEN/4), 0, 0, Bd, 0x1, SIZE);
|
||||
hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
|
||||
hipMemcpy(Val, Vald, sizeof(uint32_t), hipMemcpyDeviceToHost);
|
||||
for(int i=LEN-16;i<LEN;i++){
|
||||
std::cout<<A[i]<<" "<<B[i]<<std::endl;
|
||||
}
|
||||
std::cout<<*Val<<std::endl;
|
||||
}
|
||||
Αναφορά σε νέο ζήτημα
Block a user