changed memcpy and memset device functions

Change-Id: Ia7f450536a75fad4fe13c7fcf5e9e7a9b5450f52
Cette révision appartient à :
Aditya Atluri
2016-10-11 17:43:15 -05:00
Parent d71c0d10de
révision 288f024d00
2 fichiers modifiés avec 196 ajouts et 36 suppressions
+9 -36
Voir le fichier
@@ -585,48 +585,21 @@ __device__ void __threadfence_system(void) __attribute__((deprecated("Provided
// Device-side memcpy: the calling thread copies `size` bytes from `src` to
// `dst`, one byte at a time, in ascending address order.
//
// The copy is performed entirely by the calling thread and does not depend on
// the thread's position in the grid, so each thread may copy its own region.
//
//   dst   destination buffer, at least `size` bytes
//   src   source buffer, at least `size` bytes
//   size  number of bytes to copy (0 is a no-op)
//
// Returns `dst`, matching the contract of the standard memcpy.
__device__ static inline void* memcpy(void* dst, void* src, size_t size)
{
    uint8_t* dstPtr = (uint8_t*)dst;
    const uint8_t* srcPtr = (const uint8_t*)src;
    // The index is size_t on purpose: a 32-bit counter compared against a
    // size_t bound wraps around and never terminates once size > UINT32_MAX.
    for (size_t i = 0; i < size; i++) {
        dstPtr[i] = srcPtr[i];
    }
    return dst;
}
// Device-side memset: the calling thread writes the byte `val` into each of
// the first `size` bytes starting at `ptr`.
//
// The fill is performed entirely by the calling thread and does not depend on
// the thread's position in the grid, so each thread may fill its own region.
//
//   ptr   buffer to fill, at least `size` bytes
//   val   byte value stored in every position
//   size  number of bytes to write (0 is a no-op)
//
// Returns `ptr`, matching the contract of the standard memset.
__device__ static inline void* memset(void* ptr, uint8_t val, size_t size)
{
    uint8_t* dstPtr = (uint8_t*)ptr;
    // The index is size_t on purpose: a 32-bit counter compared against a
    // size_t bound wraps around and never terminates once size > UINT32_MAX.
    for (size_t i = 0; i < size; i++) {
        dstPtr[i] = val;
    }
    return ptr;
}
+187
Voir le fichier
@@ -0,0 +1,187 @@
#include<hip/hip_runtime_api.h>
#include<hip/hip_runtime.h>
#include<cassert>
#include<cstddef>
#include<cstdint>
#include<cstdlib>
#include<iostream>
// Buffer sizes in bytes for each test round: N bytes per thread times the
// 4 threads that main() launches (2 blocks of 2 threads).
// The expansions are parenthesized so they stay a single value inside larger
// expressions such as `x / LEN8` or `LEN8 % y`.
#define LEN8 (8 * 4)
#define LEN9 (9 * 4)
#define LEN10 (10 * 4)
#define LEN11 (11 * 4)
#define LEN12 (12 * 4)
// Test kernel: every thread copies its own 8-byte slice of In to Out via
// memcpy called from device code.
// The slice starts at byte offset 8 * (1-D global thread index). There is no
// bounds check, so both buffers must hold 8 bytes per launched thread.
__global__ void MemCpy8(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    const int byteOffset = globalIdx * 8;
    memcpy(Out + byteOffset, In + byteOffset, 8);
}
// Test kernel: every thread copies its own 9-byte slice of In to Out via
// memcpy called from device code.
// The slice starts at byte offset 9 * (1-D global thread index). There is no
// bounds check, so both buffers must hold 9 bytes per launched thread.
__global__ void MemCpy9(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    const int byteOffset = globalIdx * 9;
    memcpy(Out + byteOffset, In + byteOffset, 9);
}
// Test kernel: every thread copies its own 10-byte slice of In to Out via
// memcpy called from device code.
// The slice starts at byte offset 10 * (1-D global thread index). There is no
// bounds check, so both buffers must hold 10 bytes per launched thread.
__global__ void MemCpy10(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    const int byteOffset = globalIdx * 10;
    memcpy(Out + byteOffset, In + byteOffset, 10);
}
// Test kernel: every thread copies its own 11-byte slice of In to Out via
// memcpy called from device code.
// The slice starts at byte offset 11 * (1-D global thread index). There is no
// bounds check, so both buffers must hold 11 bytes per launched thread.
__global__ void MemCpy11(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    const int byteOffset = globalIdx * 11;
    memcpy(Out + byteOffset, In + byteOffset, 11);
}
// Test kernel: every thread copies its own 12-byte slice of In to Out via
// memcpy called from device code.
// The slice starts at byte offset 12 * (1-D global thread index). There is no
// bounds check, so both buffers must hold 12 bytes per launched thread.
__global__ void MemCpy12(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    const int byteOffset = globalIdx * 12;
    memcpy(Out + byteOffset, In + byteOffset, 12);
}
// Test kernel: every thread fills its own 8-byte slice of In with the byte
// value 1 via memset called from device code.
// The slice starts at byte offset 8 * (1-D global thread index). There is no
// bounds check, so the buffer must hold 8 bytes per launched thread.
__global__ void MemSet8(hipLaunchParm lp, uint8_t *In) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    uint8_t *slice = In + globalIdx * 8;
    memset(slice, 1, 8);
}
// Test kernel: every thread fills its own 9-byte slice of In with the byte
// value 1 via memset called from device code.
// The slice starts at byte offset 9 * (1-D global thread index). There is no
// bounds check, so the buffer must hold 9 bytes per launched thread.
__global__ void MemSet9(hipLaunchParm lp, uint8_t *In) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    uint8_t *slice = In + globalIdx * 9;
    memset(slice, 1, 9);
}
// Test kernel: every thread fills its own 10-byte slice of In with the byte
// value 1 via memset called from device code.
// The slice starts at byte offset 10 * (1-D global thread index). There is no
// bounds check, so the buffer must hold 10 bytes per launched thread.
__global__ void MemSet10(hipLaunchParm lp, uint8_t *In) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    uint8_t *slice = In + globalIdx * 10;
    memset(slice, 1, 10);
}
// Test kernel: every thread fills its own 11-byte slice of In with the byte
// value 1 via memset called from device code.
// The slice starts at byte offset 11 * (1-D global thread index). There is no
// bounds check, so the buffer must hold 11 bytes per launched thread.
__global__ void MemSet11(hipLaunchParm lp, uint8_t *In) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    uint8_t *slice = In + globalIdx * 11;
    memset(slice, 1, 11);
}
// Test kernel: every thread fills its own 12-byte slice of In with the byte
// value 1 via memset called from device code.
// The slice starts at byte offset 12 * (1-D global thread index). There is no
// bounds check, so the buffer must hold 12 bytes per launched thread.
__global__ void MemSet12(hipLaunchParm lp, uint8_t *In) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    uint8_t *slice = In + globalIdx * 12;
    memset(slice, 1, 12);
}
int main(){
uint8_t *A, *Ad, *B, *Bd, *C, *Cd;
A = new uint8_t[LEN8];
B = new uint8_t[LEN8];
C = new uint8_t[LEN8];
for(uint32_t i=0;i<LEN8;i++) {
A[i] = i;
B[i] = 0;
C[i] = 0;
}
hipMalloc((void**)&Ad, LEN8);
hipMalloc((void**)&Bd, LEN8);
hipMalloc((void**)&Cd, LEN8);
hipMemcpy(Ad, A, LEN8, hipMemcpyHostToDevice);
hipLaunchKernel(MemCpy8, dim3(2,1,1), dim3(2,1,1), 0, 0, Ad, Bd);
hipLaunchKernel(MemSet8, dim3(2,1,1), dim3(2,1,1), 0, 0, Cd);
hipMemcpy(B, Bd, LEN8, hipMemcpyDeviceToHost);
hipMemcpy(C, Cd, LEN8, hipMemcpyDeviceToHost);
for(uint32_t i=0;i<LEN8;i++) {
assert(A[i] == B[i]);
assert(C[i] == 1);
}
delete A;
delete B;
delete C;
A = new uint8_t[LEN9];
B = new uint8_t[LEN9];
C = new uint8_t[LEN9];
for(uint32_t i=0;i<LEN9;i++) {
A[i] = i;
B[i] = 0;
C[i] = 0;
}
hipMalloc((void**)&Ad, LEN9);
hipMalloc((void**)&Bd, LEN9);
hipMalloc((void**)&Cd, LEN9);
hipMemcpy(Ad, A, LEN9, hipMemcpyHostToDevice);
hipLaunchKernel(MemCpy9, dim3(2,1,1), dim3(2,1,1), 0, 0, Ad, Bd);
hipLaunchKernel(MemSet9, dim3(2,1,1), dim3(2,1,1), 0, 0, Cd);
hipMemcpy(B, Bd, LEN9, hipMemcpyDeviceToHost);
hipMemcpy(C, Cd, LEN9, hipMemcpyDeviceToHost);
for(uint32_t i=0;i<LEN9;i++) {
assert(A[i] == B[i]);
assert(C[i] == 1);
}
delete A;
delete B;
delete C;
A = new uint8_t[LEN10];
B = new uint8_t[LEN10];
C = new uint8_t[LEN10];
for(uint32_t i=0;i<LEN10;i++) {
A[i] = i;
B[i] = 0;
C[i] = 0;
}
hipMalloc((void**)&Ad, LEN10);
hipMalloc((void**)&Bd, LEN10);
hipMalloc((void**)&Cd, LEN10);
hipMemcpy(Ad, A, LEN10, hipMemcpyHostToDevice);
hipLaunchKernel(MemCpy10, dim3(2,1,1), dim3(2,1,1), 0, 0, Ad, Bd);
hipLaunchKernel(MemSet10, dim3(2,1,1), dim3(2,1,1), 0, 0, Cd);
hipMemcpy(B, Bd, LEN10, hipMemcpyDeviceToHost);
hipMemcpy(C, Cd, LEN10, hipMemcpyDeviceToHost);
for(uint32_t i=0;i<LEN10;i++) {
assert(A[i] == B[i]);
assert(C[i] == 1);
}
delete A;
delete B;
delete C;
A = new uint8_t[LEN11];
B = new uint8_t[LEN11];
C = new uint8_t[LEN11];
for(uint32_t i=0;i<LEN11;i++) {
A[i] = i;
B[i] = 0;
C[i] = 0;
}
hipMalloc((void**)&Ad, LEN11);
hipMalloc((void**)&Bd, LEN11);
hipMalloc((void**)&Cd, LEN11);
hipMemcpy(Ad, A, LEN11, hipMemcpyHostToDevice);
hipLaunchKernel(MemCpy11, dim3(2,1,1), dim3(2,1,1), 0, 0, Ad, Bd);
hipLaunchKernel(MemSet11, dim3(2,1,1), dim3(2,1,1), 0, 0, Cd);
hipMemcpy(B, Bd, LEN11, hipMemcpyDeviceToHost);
hipMemcpy(C, Cd, LEN11, hipMemcpyDeviceToHost);
for(uint32_t i=0;i<LEN11;i++) {
assert(A[i] == B[i]);
assert(C[i] == 1);
}
delete A;
delete B;
delete C;
A = new uint8_t[LEN12];
B = new uint8_t[LEN12];
C = new uint8_t[LEN12];
for(uint32_t i=0;i<LEN12;i++) {
A[i] = i;
B[i] = 0;
C[i] = 0;
}
hipMalloc((void**)&Ad, LEN12);
hipMalloc((void**)&Bd, LEN12);
hipMalloc((void**)&Cd, LEN12);
hipMemcpy(Ad, A, LEN12, hipMemcpyHostToDevice);
hipLaunchKernel(MemCpy12, dim3(2,1,1), dim3(2,1,1), 0, 0, Ad, Bd);
hipLaunchKernel(MemSet12, dim3(2,1,1), dim3(2,1,1), 0, 0, Cd);
hipMemcpy(B, Bd, LEN12, hipMemcpyDeviceToHost);
hipMemcpy(C, Cd, LEN12, hipMemcpyDeviceToHost);
for(uint32_t i=0;i<LEN12;i++) {
assert(A[i] == B[i]);
assert(C[i] == 1);
}
delete A;
delete B;
delete C;
}