Revert adoption of CUDA indexing in general - this can only work with later versions of the compiler, just like module based dispatch, and thus must be guarded against usage in earlier (e.g. 1.6) versions.

This commit is contained in:
Alex Voicu
2017-11-29 21:01:28 +00:00
parent b881cf713c
commit d2fd1f5544
63 changed files with 173 additions and 171 deletions
+21 -19
View File
@@ -381,27 +381,29 @@ __device__ void __threadfence_system(void) ;
* @}
*/
template<typename std::common_type<
decltype(hc_get_group_id),
decltype(hc_get_group_size),
decltype(hc_get_num_groups),
decltype(hc_get_workitem_id)>::type f>
class Coordinates {
using R = decltype(f(0));
#if __hcc_workweek__ >= 17481
template<typename std::common_type<
decltype(hc_get_group_id),
decltype(hc_get_group_size),
decltype(hc_get_num_groups),
decltype(hc_get_workitem_id)>::type f>
class Coordinates {
using R = decltype(f(0));
struct X { __device__ operator R() const { return f(0); } };
struct Y { __device__ operator R() const { return f(1); } };
struct Z { __device__ operator R() const { return f(2); } };
public:
static constexpr X x{};
static constexpr Y y{};
static constexpr Z z{};
};
struct X { __device__ operator R() const { return f(0); } };
struct Y { __device__ operator R() const { return f(1); } };
struct Z { __device__ operator R() const { return f(2); } };
public:
static constexpr X x{};
static constexpr Y y{};
static constexpr Z z{};
};
static constexpr Coordinates<hc_get_group_size> blockDim;
static constexpr Coordinates<hc_get_group_id> blockIdx;
static constexpr Coordinates<hc_get_num_groups> gridDim;
static constexpr Coordinates<hc_get_workitem_id> threadIdx;
static constexpr Coordinates<hc_get_group_size> blockDim;
static constexpr Coordinates<hc_get_group_id> blockIdx;
static constexpr Coordinates<hc_get_num_groups> gridDim;
static constexpr Coordinates<hc_get_workitem_id> threadIdx;
#endif
#define hipThreadIdx_x (hc_get_workitem_id(0))
#define hipThreadIdx_y (hc_get_workitem_id(1))
+2 -2
View File
@@ -40,8 +40,8 @@ template <typename T>
__global__ void
vector_square(T *C_d, const T *A_d, size_t N)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i=offset; i<N; i+=stride) {
C_d[i] = A_d[i] * A_d[i];
+2 -2
View File
@@ -45,8 +45,8 @@ __device__ void *__hip_hc_malloc(size_t size)
{
return (void*)nullptr;
}
uint32_t totalThreads = blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z;
uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x;
uint32_t totalThreads = hipBlockDim_x * hipGridDim_x * hipBlockDim_y * hipGridDim_y * hipBlockDim_z * hipGridDim_z;
uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x;
uint32_t numHeapsPerWorkItem = NUM_PAGES / totalThreads;
uint32_t heapSizePerWorkItem = SIZE_OF_HEAP / totalThreads;
+2 -2
View File
@@ -1307,9 +1307,9 @@ namespace
__global__
void hip_fill_n(RandomAccessIterator f, N n, T value)
{
const uint32_t grid_dim = gridDim.x * blockDim.x;
const uint32_t grid_dim = hipGridDim_x * hipBlockDim_x;
size_t idx = blockIdx.x * block_dim + threadIdx.x;
size_t idx = hipBlockIdx_x * block_dim + hipThreadIdx_x;
while (idx < n) {
__builtin_memcpy(
reinterpret_cast<void*>(&f[idx]),
@@ -34,7 +34,7 @@ THE SOFTWARE.
#define NUM_STREAMS 2
__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
// Kernel loop designed to execute very slowly... ... ... so we can test timing-related behavior below
if(tx == 0){
for(int i = 0; i<num;i++){
+1 -1
View File
@@ -27,7 +27,7 @@ THE SOFTWARE.
#define SIZE 64<<2
__global__ void getSqAbs(hipLaunchParm lp, float *A, float *B, float *C){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
C[tx] = hipCsqabsf(make_hipFloatComplex(A[tx], B[tx]));
}
+2 -2
View File
@@ -16,13 +16,13 @@
__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In)
{
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
memcpy(Out + tx, In + tx, sizeof(uint32_t));
}
__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
{
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
memset(ptr + tx, val, sizeof(uint32_t));
}
+1 -1
View File
@@ -35,7 +35,7 @@ THE SOFTWARE.
__global__ void floatMath(hipLaunchParm lp, float *In, float *Out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Out[tid] = __cosf(In[tid]);
Out[tid] = __exp10f(Out[tid]);
Out[tid] = __expf(Out[tid]);
+1 -1
View File
@@ -217,7 +217,7 @@ int computeGold(int *gpuData, const int len)
__global__ void testKernel(hipLaunchParm lp,int *g_odata)
{
// access thread id
const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int tid = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
// Test various atomic instructions
+16 -16
View File
@@ -32,82 +32,82 @@ THE SOFTWARE.
#define SIZE N*sizeof(float)
__global__ void test_sincosf(hipLaunchParm lp, float* a, float* b, float *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
sincosf(a[tid], b+tid, c+tid);
}
__global__ void test_sincospif(hipLaunchParm lp, float* a, float* b, float *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
sincospif(a[tid], b+tid, c+tid);
}
__global__ void test_fdividef(hipLaunchParm lp, float *a, float* b, float *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
c[tid] = fdividef(a[tid], b[tid]);
}
__global__ void test_llrintf(hipLaunchParm lp, float *a, long long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = llrintf(a[tid]);
}
__global__ void test_lrintf(hipLaunchParm lp, float *a, long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = lrintf(a[tid]);
}
__global__ void test_rintf(hipLaunchParm lp, float *a, float *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = rintf(a[tid]);
}
__global__ void test_llroundf(hipLaunchParm lp, float *a, long long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = llroundf(a[tid]);
}
__global__ void test_lroundf(hipLaunchParm lp, float *a, long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = lroundf(a[tid]);
}
__global__ void test_rhypotf(hipLaunchParm lp, float *a, float* b, float *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
c[tid] = rhypotf(a[tid], b[tid]);
}
__global__ void test_norm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
d[tid] = norm3df(a[tid], b[tid], c[tid]);
}
__global__ void test_norm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
e[tid] = norm4df(a[tid], b[tid], c[tid], d[tid]);
}
__global__ void test_normf(hipLaunchParm lp, float *a, float *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = normf(N, a);
}
__global__ void test_rnorm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
d[tid] = rnorm3df(a[tid], b[tid], c[tid]);
}
__global__ void test_rnorm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
e[tid] = rnorm4df(a[tid], b[tid], c[tid], d[tid]);
}
__global__ void test_rnormf(hipLaunchParm lp, float *a, float *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = rnormf(N, a);
}
__global__ void test_erfinvf(hipLaunchParm lp, float *a, float *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = erff(erfinvf(a[tid]));
}
+14 -14
View File
@@ -32,72 +32,72 @@ THE SOFTWARE.
#define SIZE N*sizeof(double)
__global__ void test_sincos(hipLaunchParm lp, double* a, double* b, double *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
sincos(a[tid], b+tid, c+tid);
}
__global__ void test_sincospi(hipLaunchParm lp, double* a, double* b, double *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
sincospi(a[tid], b+tid, c+tid);
}
__global__ void test_llrint(hipLaunchParm lp, double *a, long long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = llrint(a[tid]);
}
__global__ void test_lrint(hipLaunchParm lp, double *a, long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = lrint(a[tid]);
}
__global__ void test_rint(hipLaunchParm lp, double *a, double *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = rint(a[tid]);
}
__global__ void test_llround(hipLaunchParm lp, double *a, long long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = llround(a[tid]);
}
__global__ void test_lround(hipLaunchParm lp, double *a, long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = lround(a[tid]);
}
__global__ void test_rhypot(hipLaunchParm lp, double *a, double* b, double *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
c[tid] = rhypot(a[tid], b[tid]);
}
__global__ void test_norm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
d[tid] = norm3d(a[tid], b[tid], c[tid]);
}
__global__ void test_norm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
e[tid] = norm4d(a[tid], b[tid], c[tid], d[tid]);
}
__global__ void test_rnorm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
d[tid] = rnorm3d(a[tid], b[tid], c[tid]);
}
__global__ void test_rnorm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
e[tid] = rnorm4d(a[tid], b[tid], c[tid], d[tid]);
}
__global__ void test_rnorm(hipLaunchParm lp, double *a, double *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = rnorm(N, a);
}
__global__ void test_erfinv(hipLaunchParm lp, double *a, double *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = erf(erfinv(a[tid]));
}
+1 -1
View File
@@ -36,7 +36,7 @@ __device__ int globalOut[NUM];
__global__ void Assign(hipLaunchParm lp, int* Out)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Out[tid] = globalIn[tid];
globalOut[tid] = globalIn[tid];
}
+2 -2
View File
@@ -29,7 +29,7 @@ THE SOFTWARE.
#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__
__global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
__half a = A[tx];
__half b = B[tx];
__half c = C[tx];
@@ -45,7 +45,7 @@ __global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
}
__global__ void __half2Math(hipLaunchParm lp, __half2 *A, __half2 *B, __half2 *C) {
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
__half2 a = A[tx];
__half2 b = B[tx];
__half2 c = C[tx];
+1 -1
View File
@@ -33,7 +33,7 @@ THE SOFTWARE.
__global__ void vAdd(hipLaunchParm lp, float *In1, float *In2, float *In3, float *In4, float *Out)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
In4[tid] = In1[tid] + In2[tid];
__threadfence();
In3[tid] = In3[tid] + In4[tid];
+3 -3
View File
@@ -37,9 +37,9 @@ __global__ void
warpvote(hipLaunchParm lp, int* device_any, int* device_all , int Num_Warps_per_Block, int pshift)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
device_any[threadIdx.x>>pshift] = __any(tid -77);
device_all[threadIdx.x>>pshift] = __all(tid -77);
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
device_any[hipThreadIdx_x>>pshift] = __any(tid -77);
device_all[hipThreadIdx_x>>pshift] = __all(tid -77);
}
int main(int argc, char *argv[])
+4 -4
View File
@@ -34,12 +34,12 @@ __global__ void
gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
const unsigned int warp_num = threadIdx.x >> pshift;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
const unsigned int warp_num = hipThreadIdx_x >> pshift;
#ifdef __HIP_PLATFORM_HCC__
atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
#else
atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
#endif
}
+2 -2
View File
@@ -72,8 +72,8 @@ HIP_kernel(hipLaunchParm lp,
unsigned int* a, unsigned int* b,unsigned long long int* c, unsigned long long int* d, int width, int height)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
+2 -2
View File
@@ -83,8 +83,8 @@ HIP_kernel(hipLaunchParm lp,
unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
+2 -2
View File
@@ -73,8 +73,8 @@ HIP_kernel(hipLaunchParm lp,
int width, int height)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
+2 -2
View File
@@ -64,8 +64,8 @@ HIP_kernel(hipLaunchParm lp,
unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
+2 -2
View File
@@ -57,8 +57,8 @@ vectoradd_float(hipLaunchParm lp,
T* a, const T* bm, int width, int height)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
+10 -10
View File
@@ -45,8 +45,8 @@ vectoradd_char1(hipLaunchParm lp,
char1* a, const char1* bm, const char1* cm, int width, int height)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
@@ -59,8 +59,8 @@ vectoradd_char2(hipLaunchParm lp,
char2* a, const char2* bm, const char2* cm, int width, int height)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
@@ -73,8 +73,8 @@ vectoradd_char3(hipLaunchParm lp,
char3* a, const char3* bm, const char3* cm, int width, int height)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
@@ -86,8 +86,8 @@ vectoradd_char4(hipLaunchParm lp,
char4* a, const char4* bm, const char4* cm, int width, int height)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
@@ -100,8 +100,8 @@ vectoradd_char4(hipLaunchParm lp,
__kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) {
int x = blockDimX * blockIdx.x + threadIdx.x;
int y = blockDimY * blockIdy.y + threadIdx.y;
int x = blockDimX * hipBlockIdx_x + hipThreadIdx_x;
int y = blockDimY * blockIdy.y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
+1 -1
View File
@@ -36,7 +36,7 @@ THE SOFTWARE.
#define SIZE LEN<<2
__global__ void kernel_trig(hipLaunchParm lp, float *In, float *sin_d, float *cos_d, float *tan_d, float *sin_pd, float *cos_pd){
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
sin_d[tid] = __sinf(In[tid]);
cos_d[tid] = __cosf(In[tid]);
tan_d[tid] = __tanf(In[tid]);
+1 -1
View File
@@ -29,7 +29,7 @@ THE SOFTWARE.
__global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd, size_t len)
{
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
if(tx < len)
{
Cd[tx] = Ad[tx] + Bd[tx];
+1 -1
View File
@@ -30,7 +30,7 @@ THE SOFTWARE.
__global__ void Kern(hipLaunchParm lp, float *A)
{
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
A[tx] += 1.0f;
}
+1 -1
View File
@@ -33,7 +33,7 @@ class memManager;
template<typename T>
__global__ void Add(hipLaunchParm lp, T* Ad, T* Bd, T* Cd, size_t Len)
{
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
if(tx < Len)
{
Cd[tx] = Ad[tx] + Bd[tx];
+1 -1
View File
@@ -34,7 +34,7 @@ THE SOFTWARE.
#define SIZE 1024*1024*sizeof(int)
__global__ void Iter(hipLaunchParm lp, int *Ad){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
if(tx == 0){
for(int i=0;i<ITER;i++){
Ad[tx] += 1;
+1 -1
View File
@@ -29,7 +29,7 @@ THE SOFTWARE.
#define SIZE 1024*1024*sizeof(int)
__global__ void Iter(hipLaunchParm lp, int *Ad){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
if(tx == 0){
for(int i=0;i<ITER;i++){
Ad[tx] += 1;
+1 -1
View File
@@ -2,7 +2,7 @@
#include "hip/hip_runtime_api.h"
__global__ void Kernel(hipLaunchParm lp, float *Ad){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Ad[tx] += Ad[tx-1];
}
+2 -2
View File
@@ -40,8 +40,8 @@ __global__ void testExternSharedKernel(hipLaunchParm lp, const T* A_d, const T*
T *sdata = reinterpret_cast<T *>(my_sdata);
#endif
size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
size_t tid = threadIdx.x;
size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t tid = hipThreadIdx_x;
// initialize dynamic shared memory
if (tid < groupElements) {
+1 -1
View File
@@ -34,7 +34,7 @@ THE SOFTWARE.
__global__ void vectorAdd(hipLaunchParm lp, float *Ad, float *Bd) {
HIP_DYNAMIC_SHARED(float, sBd);
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
for(int i=0;i<LEN/64;i++) {
sBd[tx + i * 64] = Ad[tx + i * 64] + 1.0f;
Bd[tx + i * 64] = sBd[tx + i * 64];
+2 -2
View File
@@ -48,8 +48,8 @@ vectorADD2( hipLaunchParm lp,
T *C_d,
size_t N)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i=offset; i<N; i+=stride) {
double foo = __hiloint2double(A_d[i], B_d[i]);
+4 -4
View File
@@ -62,7 +62,7 @@ __global__ void MyKernel (const hipLaunchParm lp, const float *a, const float *b
{
//KERNELBEGIN;
unsigned gid = threadIdx.x;
unsigned gid = hipThreadIdx_x;
if (gid < N) {
c[gid] = a[gid] + PlusOne(b[gid]);
}
@@ -96,7 +96,7 @@ vectorADD(const hipLaunchParm lp,
int zuzu = deviceVar + 1;
int b = threadIdx.x;
int b = hipThreadIdx_x;
int c;
#ifdef NOT_YET
int a = __shfl_up(x, 1);
@@ -119,8 +119,8 @@ vectorADD(const hipLaunchParm lp,
__syncthreads();
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i=offset; i<N; i+=stride) {
C_d[i] = A_d[i] + B_d[i];
+1 -1
View File
@@ -38,7 +38,7 @@ __constant__ int Value[LEN];
__global__ void Get(hipLaunchParm lp, int *Ad)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Ad[tid] = Value[tid];
}
+2 -2
View File
@@ -33,12 +33,12 @@ THE SOFTWARE.
#define SIZE NUM * 8
__global__ void Alloc(hipLaunchParm lp, uint64_t *Ptr) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Ptr[tid] = (uint64_t)malloc(128);
}
__global__ void Free(hipLaunchParm lp, uint64_t *Ptr) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
free((void*)Ptr[tid]);
}
+10 -10
View File
@@ -35,52 +35,52 @@ THE SOFTWARE.
#define LEN12 12 * 4
__global__ void MemCpy8(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memcpy(Out + tid*8, In + tid*8, 8);
}
__global__ void MemCpy9(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memcpy(Out + tid*9, In + tid*9, 9);
}
__global__ void MemCpy10(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memcpy(Out + tid*10, In + tid*10, 10);
}
__global__ void MemCpy11(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memcpy(Out + tid*11, In + tid*11, 11);
}
__global__ void MemCpy12(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memcpy(Out + tid*12, In + tid*12, 12);
}
__global__ void MemSet8(hipLaunchParm lp, uint8_t *In) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memset(In + tid*8, 1, 8);
}
__global__ void MemSet9(hipLaunchParm lp, uint8_t *In) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memset(In + tid*9, 1, 9);
}
__global__ void MemSet10(hipLaunchParm lp, uint8_t *In) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memset(In + tid*10, 1, 10);
}
__global__ void MemSet11(hipLaunchParm lp, uint8_t *In) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memset(In + tid*11, 1, 11);
}
__global__ void MemSet12(hipLaunchParm lp, uint8_t *In) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memset(In + tid*12, 1, 12);
}
+1 -1
View File
@@ -37,7 +37,7 @@ __global__ void vadd_asm(hipLaunchParm lp,
float *out,
float *in)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
#ifdef __HIP_PLATFORM_NVCC__
asm volatile("add.f32 %0,%1,%2;":"=f"(out[i]):"f"(in[i]),"f"(out[i]));
+1 -1
View File
@@ -35,7 +35,7 @@ __global__ void vmac_asm(hipLaunchParm lp,
float *out,
float *in)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i]));
}
+1 -1
View File
@@ -33,7 +33,7 @@ void
__launch_bounds__(256, 2)
myKern(hipLaunchParm lp, int *C, const int *A, int N, int xfactor)
{
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int tid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
if (tid < N) {
C[tid] = A[tid];
@@ -34,7 +34,7 @@ THE SOFTWARE.
#define NUM_STREAMS 2
__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
// Kernel loop designed to execute very slowly... ... ... so we can test timing-related behavior below
if(tx == 0){
for(int i = 0; i<num;i++){
@@ -33,7 +33,7 @@ THE SOFTWARE.
#define SIZE LEN*sizeof(float)
__global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Cd[tx] = Ad[tx] + Bd[tx];
}
@@ -33,13 +33,13 @@
#define SIZE LEN*sizeof(float)
__global__ void Add(float *Ad, float *Bd, float *Cd){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Cd[tx] = Ad[tx] + Bd[tx];
}
__global__ void Set(int *Ad, int val){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Ad[tx] = val;
}
@@ -29,7 +29,7 @@ THE SOFTWARE.
#include<malloc.h>
__global__ void Inc(hipLaunchParm lp, float *Ad){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Ad[tx] = Ad[tx] + float(1);
}
@@ -70,8 +70,8 @@ template<typename T>
__global__ void
addK (hipLaunchParm lp, T *A, T K, size_t numElements)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i=offset; i<numElements; i+=stride) {
A[i] = A[i] + K;
@@ -31,7 +31,7 @@ THE SOFTWARE.
__global__ void Kernel(hipLaunchParm lp,volatile float* hostRes)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
hostRes[tid] = tid + 1;
__threadfence_system();
// expecting that the data is getting flushed to host here!
@@ -63,8 +63,8 @@ int enablePeers(int dev0, int dev1)
__global__ void
memsetIntKernel(int * ptr, const int val, size_t numElements)
{
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
int stride = blockDim.x * gridDim.x ;
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i= gid; i< numElements; i+=stride){
ptr[i] = val;
}
@@ -73,8 +73,8 @@ memsetIntKernel(int * ptr, const int val, size_t numElements)
__global__ void
memcpyIntKernel(const int * src, int* dst, size_t numElements)
{
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
int stride = blockDim.x * gridDim.x ;
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i= gid; i< numElements; i+=stride){
dst[i] = src[i];
}
+1 -1
View File
@@ -34,7 +34,7 @@ THE SOFTWARE.
#define kernel_name "hello_world"
__global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
Bd[tx] = Ad[tx];
}
+1 -1
View File
@@ -24,7 +24,7 @@ THE SOFTWARE.
extern "C" __global__ void hello_world(hipLaunchParm lp, float *a, float *b)
{
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
b[tx] = a[tx];
}
@@ -35,7 +35,7 @@ THE SOFTWARE.
template<typename T>
__global__ void Inc(hipLaunchParm lp, T *Array){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Array[tx] = Array[tx] + T(1);
}
@@ -29,7 +29,7 @@ THE SOFTWARE.
const int NN = 1 << 21;
__global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
if(tid < 1){
for(int i=0;i<n;i++){
x[i] = sqrt(powf(3.14159,i));
@@ -39,7 +39,7 @@ __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
}
__global__ void nKernel(hipLaunchParm lp, float *y){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
y[tid] = y[tid] + 1.0f;
}
@@ -31,7 +31,7 @@ THE SOFTWARE.
const int NN = 1 << 21;
__global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
if(tid < 1){
for(int i=0;i<n;i++){
x[i] = sqrt(powf(3.14159,i));
@@ -41,7 +41,7 @@ __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
}
__global__ void nKernel(hipLaunchParm lp, float *y){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
y[tid] = y[tid] + 1.0f;
}
@@ -41,8 +41,8 @@ vectorADDRepeat(hipLaunchParm lp,
size_t NELEM,
int repeat)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
for (int j=1; j<=repeat;j++) {
for (size_t i=offset; i<NELEM; i+=stride) {
+1 -1
View File
@@ -73,7 +73,7 @@ void D2H(T *Dst, T *Src, size_t size){
template<typename T>
__global__ void Inc(hipLaunchParm lp, T *In){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
In[tx] = In[tx] + 1;
}
@@ -102,8 +102,8 @@ MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel");
__global__ void
memsetIntKernel(int * ptr, const int val, size_t numElements)
{
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
int stride = blockDim.x * gridDim.x ;
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i= gid; i< numElements; i+=stride){
ptr[i] = val;
}
@@ -112,8 +112,8 @@ memsetIntKernel(int * ptr, const int val, size_t numElements)
__global__ void
memcpyIntKernel(int *dst, const int * src, size_t numElements)
{
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
int stride = blockDim.x * gridDim.x ;
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i= gid; i< numElements; i+=stride){
dst[i] = src[i];
}
@@ -5,8 +5,8 @@
extern "C" __global__ void
memcpyIntKernel(hipLaunchParm lp, int *dst, const int * src, size_t numElements)
{
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
int stride = blockDim.x * gridDim.x ;
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i= gid; i< numElements; i+=stride){
dst[i] = src[i];
}
+1 -1
View File
@@ -23,7 +23,7 @@ THE SOFTWARE.
void __global__
test_kernel(float *A)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
float a = __ballot(tid < 16);
float b = __shfl(tid < 16);
+1 -1
View File
@@ -30,7 +30,7 @@ THE SOFTWARE.
#define ITER 1<<10
__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
if(tx == 0){
for(int i = 0; i<num;i++){
Ad[tx] += 1;
+1 -1
View File
@@ -26,7 +26,7 @@ THE SOFTWARE.
static size_t size[NUM_SIZE];
__global__ void Add(hipLaunchParm lp, int *Ad){
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
Ad[tx] = Ad[tx] + tx;
}
+1 -1
View File
@@ -26,7 +26,7 @@ THE SOFTWARE.
static size_t size[NUM_SIZE];
__global__ void Add(hipLaunchParm lp, int *Ad){
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
Ad[tx] = Ad[tx] + tx;
}
+1 -1
View File
@@ -30,7 +30,7 @@ THE SOFTWARE.
#define ITER 1<<10
__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
if(tx == 0){
for(int i = 0; i<num;i++){
Ad[tx] += 1;
+10 -10
View File
@@ -136,8 +136,8 @@ vectorADD(hipLaunchParm lp,
T *C_d,
size_t NELEM)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i=offset; i<NELEM; i+=stride) {
C_d[i] = A_d[i] + B_d[i];
@@ -153,8 +153,8 @@ vectorADDReverse(hipLaunchParm lp,
T *C_d,
size_t NELEM)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
C_d[i] = A_d[i] + B_d[i];
@@ -169,8 +169,8 @@ addCount( const T *A_d,
size_t NELEM,
int count)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
// Deliberately do this in an inefficient way to increase kernel runtime
for (int i=0; i<count; i++) {
@@ -188,8 +188,8 @@ addCountReverse( const T *A_d,
int64_t NELEM,
int count)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
// Deliberately do this in an inefficient way to increase kernel runtime
for (int i=0; i<count; i++) {
@@ -205,8 +205,8 @@ __global__ void
memsetReverse( T *C_d, T val,
int64_t NELEM)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
C_d[i] = val;
+2 -2
View File
@@ -17,8 +17,8 @@ __global__ void tex2DKernel(float* outputData,
int width,
int height)
{
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y;
outputData[y*width + x] = tex2D<float>(textureObject, x, y);
}
+2 -2
View File
@@ -20,8 +20,8 @@ __global__ void tex2DKernel(float* outputData,
int width,
int height)
{
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y;
#ifdef __HIP_PLATFORM_HCC__
outputData[y*width + x] = tex2D(tex, textureObject, x, y);
#else