diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h index f0f1364997..e4ddd9ccac 100755 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h @@ -585,48 +585,21 @@ __device__ void __threadfence_system(void) __attribute__((deprecated("Provided // loop unrolling __device__ static inline void* memcpy(void* dst, void* src, size_t size) { - uint64_t i = 0; - uint64_t totalLength = size/sizeof(uint32_t); - for(i=hipThreadIdx_x+hipBlockIdx_x*hipBlockDim_x; - i<(totalLength/4); - i = i + hipBlockDim_x * hipGridDim_x) - { - ((uint32_t*)dst)[4*i] = ((uint32_t*)src)[4*i]; - ((uint32_t*)dst)[4*i+1] = ((uint32_t*)src)[4*i+1]; - ((uint32_t*)dst)[4*i+2] = ((uint32_t*)src)[4*i+2]; - ((uint32_t*)dst)[4*i+3] = ((uint32_t*)src)[4*i+3]; - } - if(4*i < totalLength){ - ((uint32_t*)dst)[4*i] = ((uint32_t*)src)[4*i]; - ((uint32_t*)dst)[4*i+1] = ((uint32_t*)src)[4*i+1]; - ((uint32_t*)dst)[4*i+2] = ((uint32_t*)src)[4*i+2]; - ((uint32_t*)dst)[4*i+3] = ((uint32_t*)src)[4*i+3]; - + uint8_t *dstPtr, *srcPtr; + dstPtr = (uint8_t*)dst; + srcPtr = (uint8_t*)src; + for(uint32_t i=0;i +#include +#include + +#define LEN8 8 * 4 +#define LEN9 9 * 4 +#define LEN10 10 * 4 +#define LEN11 11 * 4 +#define LEN12 12 * 4 + +__global__ void MemCpy8(hipLaunchParm lp, uint8_t *In, uint8_t *Out) { + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + memcpy(Out + tid*8, In + tid*8, 8); +} + +__global__ void MemCpy9(hipLaunchParm lp, uint8_t *In, uint8_t *Out) { + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + memcpy(Out + tid*9, In + tid*9, 9); +} + +__global__ void MemCpy10(hipLaunchParm lp, uint8_t *In, uint8_t *Out) { + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + memcpy(Out + tid*10, In + tid*10, 10); +} + +__global__ void MemCpy11(hipLaunchParm lp, uint8_t *In, uint8_t *Out) { + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + memcpy(Out + tid*11, In + tid*11, 11); +} + +__global__ void MemCpy12(hipLaunchParm lp, uint8_t *In, uint8_t *Out) { + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + memcpy(Out + tid*12, In + tid*12, 12); +} + +__global__ void MemSet8(hipLaunchParm lp, uint8_t *In) { + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + memset(In + tid*8, 1, 8); +} + +__global__ void MemSet9(hipLaunchParm lp, uint8_t *In) { + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + memset(In + tid*9, 1, 9); +} + +__global__ void MemSet10(hipLaunchParm lp, uint8_t *In) { + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + memset(In + tid*10, 1, 10); +} + +__global__ void MemSet11(hipLaunchParm lp, uint8_t *In) { + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + memset(In + tid*11, 1, 11); +} + +__global__ void MemSet12(hipLaunchParm lp, uint8_t *In) { + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + memset(In + tid*12, 1, 12); +} + +int main(){ + uint8_t *A, *Ad, *B, *Bd, *C, *Cd; + A = new uint8_t[LEN8]; + B = new uint8_t[LEN8]; + C = new uint8_t[LEN8]; + for(uint32_t i=0;i