Revert "Revert adoption of CUDA indexing in general - this can only work with later versions of the compiler, just like module based dispatch, and thus must be guarded against usage in earlier (e.g. 1.6) versions."
This reverts commit d2fd1f5
This commit is contained in:
@@ -40,8 +40,8 @@ __global__ void testExternSharedKernel(hipLaunchParm lp, const T* A_d, const T*
     T *sdata = reinterpret_cast<T *>(my_sdata);
 #endif
 
-    size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t tid = threadIdx.x;
+    size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t tid = hipThreadIdx_x;
 
     // initialize dynamic shared memory
     if (tid < groupElements) {
@@ -34,7 +34,7 @@ THE SOFTWARE.
 
 __global__ void vectorAdd(hipLaunchParm lp, float *Ad, float *Bd) {
     HIP_DYNAMIC_SHARED(float, sBd);
-    int tx = threadIdx.x;
+    int tx = hipThreadIdx_x;
     for(int i=0;i<LEN/64;i++) {
         sBd[tx + i * 64] = Ad[tx + i * 64] + 1.0f;
         Bd[tx + i * 64] = sBd[tx + i * 64];
@@ -48,8 +48,8 @@ vectorADD2( hipLaunchParm lp,
             T *C_d,
             size_t N)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
 
     for (size_t i=offset; i<N; i+=stride) {
         double foo = __hiloint2double(A_d[i], B_d[i]);
@@ -62,7 +62,7 @@ __global__ void MyKernel (const hipLaunchParm lp, const float *a, const float *b
 {
     //KERNELBEGIN;
 
-    unsigned gid = threadIdx.x;
+    unsigned gid = hipThreadIdx_x;
     if (gid < N) {
         c[gid] = a[gid] + PlusOne(b[gid]);
     }
@@ -96,7 +96,7 @@ vectorADD(const hipLaunchParm lp,
     int zuzu = deviceVar + 1;
 
 
-    int b = threadIdx.x;
+    int b = hipThreadIdx_x;
     int c;
 #ifdef NOT_YET
     int a = __shfl_up(x, 1);
@@ -119,8 +119,8 @@ vectorADD(const hipLaunchParm lp,
     __syncthreads();
 
 
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
 
     for (size_t i=offset; i<N; i+=stride) {
         C_d[i] = A_d[i] + B_d[i];
@@ -38,7 +38,7 @@ __constant__ int Value[LEN];
 
 __global__ void Get(hipLaunchParm lp, int *Ad)
 {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     Ad[tid] = Value[tid];
 }
 
@@ -33,12 +33,12 @@ THE SOFTWARE.
 #define SIZE NUM * 8
 
 __global__ void Alloc(hipLaunchParm lp, uint64_t *Ptr) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     Ptr[tid] = (uint64_t)malloc(128);
 }
 
 __global__ void Free(hipLaunchParm lp, uint64_t *Ptr) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     free((void*)Ptr[tid]);
 }
 
@@ -35,52 +35,52 @@ THE SOFTWARE.
 #define LEN12 12 * 4
 
 __global__ void MemCpy8(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     memcpy(Out + tid*8, In + tid*8, 8);
 }
 
 __global__ void MemCpy9(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     memcpy(Out + tid*9, In + tid*9, 9);
 }
 
 __global__ void MemCpy10(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     memcpy(Out + tid*10, In + tid*10, 10);
 }
 
 __global__ void MemCpy11(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     memcpy(Out + tid*11, In + tid*11, 11);
 }
 
 __global__ void MemCpy12(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     memcpy(Out + tid*12, In + tid*12, 12);
 }
 
 __global__ void MemSet8(hipLaunchParm lp, uint8_t *In) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     memset(In + tid*8, 1, 8);
 }
 
 __global__ void MemSet9(hipLaunchParm lp, uint8_t *In) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     memset(In + tid*9, 1, 9);
 }
 
 __global__ void MemSet10(hipLaunchParm lp, uint8_t *In) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     memset(In + tid*10, 1, 10);
 }
 
 __global__ void MemSet11(hipLaunchParm lp, uint8_t *In) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     memset(In + tid*11, 1, 11);
 }
 
 __global__ void MemSet12(hipLaunchParm lp, uint8_t *In) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     memset(In + tid*12, 1, 12);
 }
 
@@ -37,7 +37,7 @@ __global__ void vadd_asm(hipLaunchParm lp,
                          float *out,
                          float *in)
 {
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
 
 #ifdef __HIP_PLATFORM_NVCC__
     asm volatile("add.f32 %0,%1,%2;":"=f"(out[i]):"f"(in[i]),"f"(out[i]));
@@ -35,7 +35,7 @@ __global__ void vmac_asm(hipLaunchParm lp,
                          float *out,
                          float *in)
 {
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
 
     asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i]));
 }
@@ -33,7 +33,7 @@ void
 __launch_bounds__(256, 2)
 myKern(hipLaunchParm lp, int *C, const int *A, int N, int xfactor)
 {
-    int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+    int tid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
 
     if (tid < N) {
         C[tid] = A[tid];
Reference in New Issue
Block a user