changed memcpy and memset device functions
Change-Id: Ia7f450536a75fad4fe13c7fcf5e9e7a9b5450f52
Cette révision appartient à :
@@ -585,48 +585,21 @@ __device__ void __threadfence_system(void) __attribute__((deprecated("Provided
|
||||
// loop unrolling
|
||||
__device__ static inline void* memcpy(void* dst, void* src, size_t size)
|
||||
{
|
||||
uint64_t i = 0;
|
||||
uint64_t totalLength = size/sizeof(uint32_t);
|
||||
for(i=hipThreadIdx_x+hipBlockIdx_x*hipBlockDim_x;
|
||||
i<(totalLength/4);
|
||||
i = i + hipBlockDim_x * hipGridDim_x)
|
||||
{
|
||||
((uint32_t*)dst)[4*i] = ((uint32_t*)src)[4*i];
|
||||
((uint32_t*)dst)[4*i+1] = ((uint32_t*)src)[4*i+1];
|
||||
((uint32_t*)dst)[4*i+2] = ((uint32_t*)src)[4*i+2];
|
||||
((uint32_t*)dst)[4*i+3] = ((uint32_t*)src)[4*i+3];
|
||||
}
|
||||
if(4*i < totalLength){
|
||||
((uint32_t*)dst)[4*i] = ((uint32_t*)src)[4*i];
|
||||
((uint32_t*)dst)[4*i+1] = ((uint32_t*)src)[4*i+1];
|
||||
((uint32_t*)dst)[4*i+2] = ((uint32_t*)src)[4*i+2];
|
||||
((uint32_t*)dst)[4*i+3] = ((uint32_t*)src)[4*i+3];
|
||||
|
||||
uint8_t *dstPtr, *srcPtr;
|
||||
dstPtr = (uint8_t*)dst;
|
||||
srcPtr = (uint8_t*)src;
|
||||
for(uint32_t i=0;i<size;i++) {
|
||||
dstPtr[i] = srcPtr[i];
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Device-side memset: stores the byte `val` into each of the first `size` bytes at `ptr`.
//
// The whole fill is performed by the calling thread. It must not be split
// across the grid using thread/block indices, because each thread may call
// this with its own pointer. Byte-wise stores mean `ptr` needs no particular
// alignment.
//
// ptr   buffer to fill, at least `size` bytes
// val   byte value to store
// size  number of bytes to fill; 0 writes nothing
//
// Returns `ptr`, matching the contract of the standard C memset.
__device__ static inline void* memset(void* ptr, uint8_t val, size_t size)
{
    uint8_t* bytes = (uint8_t*)ptr;
    // size_t index: a 32-bit counter would wrap and never terminate for sizes above 4 GiB.
    for (size_t i = 0; i < size; i++) {
        bytes[i] = val;
    }
    return ptr;
}
|
||||
|
||||
@@ -0,0 +1,187 @@
|
||||
#include<hip/hip_runtime_api.h>
#include<hip/hip_runtime.h>
#include<cassert>
#include<cstddef>
#include<cstdint>
#include<cstdlib>
#include<iostream>
|
||||
|
||||
// Buffer sizes in bytes for each test round: N bytes per thread times the
// 4 threads launched (2 blocks of 2 threads). The bodies are parenthesized so
// the macros expand safely inside larger expressions (e.g. 64 / LEN8).
#define LEN8 (8 * 4)
#define LEN9 (9 * 4)
#define LEN10 (10 * 4)
#define LEN11 (11 * 4)
#define LEN12 (12 * 4)
|
||||
|
||||
// Copy kernel: every thread copies its own 8-byte slice of In into the
// matching slice of Out. The slice starts at (global thread index * 8).
__global__ void MemCpy8(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    const int byteOffset = globalIdx * 8;
    memcpy(Out + byteOffset, In + byteOffset, 8);
}
|
||||
|
||||
// Copy kernel: every thread copies its own 9-byte slice of In into the
// matching slice of Out. The slice starts at (global thread index * 9).
__global__ void MemCpy9(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    const int byteOffset = globalIdx * 9;
    memcpy(Out + byteOffset, In + byteOffset, 9);
}
|
||||
|
||||
// Copy kernel: every thread copies its own 10-byte slice of In into the
// matching slice of Out. The slice starts at (global thread index * 10).
__global__ void MemCpy10(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    const int byteOffset = globalIdx * 10;
    memcpy(Out + byteOffset, In + byteOffset, 10);
}
|
||||
|
||||
// Copy kernel: every thread copies its own 11-byte slice of In into the
// matching slice of Out. The slice starts at (global thread index * 11).
__global__ void MemCpy11(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    const int byteOffset = globalIdx * 11;
    memcpy(Out + byteOffset, In + byteOffset, 11);
}
|
||||
|
||||
// Copy kernel: every thread copies its own 12-byte slice of In into the
// matching slice of Out. The slice starts at (global thread index * 12).
__global__ void MemCpy12(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    const int byteOffset = globalIdx * 12;
    memcpy(Out + byteOffset, In + byteOffset, 12);
}
|
||||
|
||||
// Fill kernel: every thread writes the byte value 1 into its own 8-byte
// slice of In. The slice starts at (global thread index * 8).
__global__ void MemSet8(hipLaunchParm lp, uint8_t *In) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    uint8_t *slice = In + globalIdx * 8;
    memset(slice, 1, 8);
}
|
||||
|
||||
// Fill kernel: every thread writes the byte value 1 into its own 9-byte
// slice of In. The slice starts at (global thread index * 9).
__global__ void MemSet9(hipLaunchParm lp, uint8_t *In) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    uint8_t *slice = In + globalIdx * 9;
    memset(slice, 1, 9);
}
|
||||
|
||||
// Fill kernel: every thread writes the byte value 1 into its own 10-byte
// slice of In. The slice starts at (global thread index * 10).
__global__ void MemSet10(hipLaunchParm lp, uint8_t *In) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    uint8_t *slice = In + globalIdx * 10;
    memset(slice, 1, 10);
}
|
||||
|
||||
// Fill kernel: every thread writes the byte value 1 into its own 11-byte
// slice of In. The slice starts at (global thread index * 11).
__global__ void MemSet11(hipLaunchParm lp, uint8_t *In) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    uint8_t *slice = In + globalIdx * 11;
    memset(slice, 1, 11);
}
|
||||
|
||||
// Fill kernel: every thread writes the byte value 1 into its own 12-byte
// slice of In. The slice starts at (global thread index * 12).
__global__ void MemSet12(hipLaunchParm lp, uint8_t *In) {
    const int globalIdx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
    uint8_t *slice = In + globalIdx * 12;
    memset(slice, 1, 12);
}
|
||||
|
||||
int main(){
|
||||
uint8_t *A, *Ad, *B, *Bd, *C, *Cd;
|
||||
A = new uint8_t[LEN8];
|
||||
B = new uint8_t[LEN8];
|
||||
C = new uint8_t[LEN8];
|
||||
for(uint32_t i=0;i<LEN8;i++) {
|
||||
A[i] = i;
|
||||
B[i] = 0;
|
||||
C[i] = 0;
|
||||
}
|
||||
hipMalloc((void**)&Ad, LEN8);
|
||||
hipMalloc((void**)&Bd, LEN8);
|
||||
hipMalloc((void**)&Cd, LEN8);
|
||||
hipMemcpy(Ad, A, LEN8, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(MemCpy8, dim3(2,1,1), dim3(2,1,1), 0, 0, Ad, Bd);
|
||||
hipLaunchKernel(MemSet8, dim3(2,1,1), dim3(2,1,1), 0, 0, Cd);
|
||||
hipMemcpy(B, Bd, LEN8, hipMemcpyDeviceToHost);
|
||||
hipMemcpy(C, Cd, LEN8, hipMemcpyDeviceToHost);
|
||||
for(uint32_t i=0;i<LEN8;i++) {
|
||||
assert(A[i] == B[i]);
|
||||
assert(C[i] == 1);
|
||||
}
|
||||
|
||||
delete A;
|
||||
delete B;
|
||||
delete C;
|
||||
|
||||
A = new uint8_t[LEN9];
|
||||
B = new uint8_t[LEN9];
|
||||
C = new uint8_t[LEN9];
|
||||
for(uint32_t i=0;i<LEN9;i++) {
|
||||
A[i] = i;
|
||||
B[i] = 0;
|
||||
C[i] = 0;
|
||||
}
|
||||
hipMalloc((void**)&Ad, LEN9);
|
||||
hipMalloc((void**)&Bd, LEN9);
|
||||
hipMalloc((void**)&Cd, LEN9);
|
||||
hipMemcpy(Ad, A, LEN9, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(MemCpy9, dim3(2,1,1), dim3(2,1,1), 0, 0, Ad, Bd);
|
||||
hipLaunchKernel(MemSet9, dim3(2,1,1), dim3(2,1,1), 0, 0, Cd);
|
||||
hipMemcpy(B, Bd, LEN9, hipMemcpyDeviceToHost);
|
||||
hipMemcpy(C, Cd, LEN9, hipMemcpyDeviceToHost);
|
||||
for(uint32_t i=0;i<LEN9;i++) {
|
||||
assert(A[i] == B[i]);
|
||||
assert(C[i] == 1);
|
||||
}
|
||||
|
||||
delete A;
|
||||
delete B;
|
||||
delete C;
|
||||
|
||||
A = new uint8_t[LEN10];
|
||||
B = new uint8_t[LEN10];
|
||||
C = new uint8_t[LEN10];
|
||||
for(uint32_t i=0;i<LEN10;i++) {
|
||||
A[i] = i;
|
||||
B[i] = 0;
|
||||
C[i] = 0;
|
||||
}
|
||||
hipMalloc((void**)&Ad, LEN10);
|
||||
hipMalloc((void**)&Bd, LEN10);
|
||||
hipMalloc((void**)&Cd, LEN10);
|
||||
hipMemcpy(Ad, A, LEN10, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(MemCpy10, dim3(2,1,1), dim3(2,1,1), 0, 0, Ad, Bd);
|
||||
hipLaunchKernel(MemSet10, dim3(2,1,1), dim3(2,1,1), 0, 0, Cd);
|
||||
hipMemcpy(B, Bd, LEN10, hipMemcpyDeviceToHost);
|
||||
hipMemcpy(C, Cd, LEN10, hipMemcpyDeviceToHost);
|
||||
for(uint32_t i=0;i<LEN10;i++) {
|
||||
assert(A[i] == B[i]);
|
||||
assert(C[i] == 1);
|
||||
}
|
||||
|
||||
delete A;
|
||||
delete B;
|
||||
delete C;
|
||||
|
||||
A = new uint8_t[LEN11];
|
||||
B = new uint8_t[LEN11];
|
||||
C = new uint8_t[LEN11];
|
||||
for(uint32_t i=0;i<LEN11;i++) {
|
||||
A[i] = i;
|
||||
B[i] = 0;
|
||||
C[i] = 0;
|
||||
}
|
||||
hipMalloc((void**)&Ad, LEN11);
|
||||
hipMalloc((void**)&Bd, LEN11);
|
||||
hipMalloc((void**)&Cd, LEN11);
|
||||
hipMemcpy(Ad, A, LEN11, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(MemCpy11, dim3(2,1,1), dim3(2,1,1), 0, 0, Ad, Bd);
|
||||
hipLaunchKernel(MemSet11, dim3(2,1,1), dim3(2,1,1), 0, 0, Cd);
|
||||
hipMemcpy(B, Bd, LEN11, hipMemcpyDeviceToHost);
|
||||
hipMemcpy(C, Cd, LEN11, hipMemcpyDeviceToHost);
|
||||
for(uint32_t i=0;i<LEN11;i++) {
|
||||
assert(A[i] == B[i]);
|
||||
assert(C[i] == 1);
|
||||
}
|
||||
|
||||
delete A;
|
||||
delete B;
|
||||
delete C;
|
||||
|
||||
A = new uint8_t[LEN12];
|
||||
B = new uint8_t[LEN12];
|
||||
C = new uint8_t[LEN12];
|
||||
for(uint32_t i=0;i<LEN12;i++) {
|
||||
A[i] = i;
|
||||
B[i] = 0;
|
||||
C[i] = 0;
|
||||
}
|
||||
hipMalloc((void**)&Ad, LEN12);
|
||||
hipMalloc((void**)&Bd, LEN12);
|
||||
hipMalloc((void**)&Cd, LEN12);
|
||||
hipMemcpy(Ad, A, LEN12, hipMemcpyHostToDevice);
|
||||
hipLaunchKernel(MemCpy12, dim3(2,1,1), dim3(2,1,1), 0, 0, Ad, Bd);
|
||||
hipLaunchKernel(MemSet12, dim3(2,1,1), dim3(2,1,1), 0, 0, Cd);
|
||||
hipMemcpy(B, Bd, LEN12, hipMemcpyDeviceToHost);
|
||||
hipMemcpy(C, Cd, LEN12, hipMemcpyDeviceToHost);
|
||||
for(uint32_t i=0;i<LEN12;i++) {
|
||||
assert(A[i] == B[i]);
|
||||
assert(C[i] == 1);
|
||||
}
|
||||
|
||||
delete A;
|
||||
delete B;
|
||||
delete C;
|
||||
}
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur