Revert adoption of CUDA indexing in general - this can only work with later versions of the compiler, just like module based dispatch, and thus must be guarded against usage in earlier (e.g. 1.6) versions.

Αυτή η υποβολή περιλαμβάνεται σε:
Alex Voicu
2017-11-29 21:01:28 +00:00
γονέας b881cf713c
υποβολή d2fd1f5544
63 αρχεία άλλαξαν με 173 προσθήκες και 171 διαγραφές
@@ -381,27 +381,29 @@ __device__ void __threadfence_system(void) ;
* @}
*/
template<typename std::common_type<
decltype(hc_get_group_id),
decltype(hc_get_group_size),
decltype(hc_get_num_groups),
decltype(hc_get_workitem_id)>::type f>
class Coordinates {
using R = decltype(f(0));
#if __hcc_workweek__ >= 17481
template<typename std::common_type<
decltype(hc_get_group_id),
decltype(hc_get_group_size),
decltype(hc_get_num_groups),
decltype(hc_get_workitem_id)>::type f>
class Coordinates {
using R = decltype(f(0));
struct X { __device__ operator R() const { return f(0); } };
struct Y { __device__ operator R() const { return f(1); } };
struct Z { __device__ operator R() const { return f(2); } };
public:
static constexpr X x{};
static constexpr Y y{};
static constexpr Z z{};
};
struct X { __device__ operator R() const { return f(0); } };
struct Y { __device__ operator R() const { return f(1); } };
struct Z { __device__ operator R() const { return f(2); } };
public:
static constexpr X x{};
static constexpr Y y{};
static constexpr Z z{};
};
static constexpr Coordinates<hc_get_group_size> blockDim;
static constexpr Coordinates<hc_get_group_id> blockIdx;
static constexpr Coordinates<hc_get_num_groups> gridDim;
static constexpr Coordinates<hc_get_workitem_id> threadIdx;
static constexpr Coordinates<hc_get_group_size> blockDim;
static constexpr Coordinates<hc_get_group_id> blockIdx;
static constexpr Coordinates<hc_get_num_groups> gridDim;
static constexpr Coordinates<hc_get_workitem_id> threadIdx;
#endif
#define hipThreadIdx_x (hc_get_workitem_id(0))
#define hipThreadIdx_y (hc_get_workitem_id(1))
@@ -40,8 +40,8 @@ template <typename T>
__global__ void
vector_square(T *C_d, const T *A_d, size_t N)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i=offset; i<N; i+=stride) {
C_d[i] = A_d[i] * A_d[i];
+2 -2
Προβολή Αρχείου
@@ -45,8 +45,8 @@ __device__ void *__hip_hc_malloc(size_t size)
{
return (void*)nullptr;
}
uint32_t totalThreads = blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z;
uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x;
uint32_t totalThreads = hipBlockDim_x * hipGridDim_x * hipBlockDim_y * hipGridDim_y * hipBlockDim_z * hipGridDim_z;
uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x;
uint32_t numHeapsPerWorkItem = NUM_PAGES / totalThreads;
uint32_t heapSizePerWorkItem = SIZE_OF_HEAP / totalThreads;
+2 -2
Προβολή Αρχείου
@@ -1307,9 +1307,9 @@ namespace
__global__
void hip_fill_n(RandomAccessIterator f, N n, T value)
{
const uint32_t grid_dim = gridDim.x * blockDim.x;
const uint32_t grid_dim = hipGridDim_x * hipBlockDim_x;
size_t idx = blockIdx.x * block_dim + threadIdx.x;
size_t idx = hipBlockIdx_x * block_dim + hipThreadIdx_x;
while (idx < n) {
__builtin_memcpy(
reinterpret_cast<void*>(&f[idx]),
@@ -34,7 +34,7 @@ THE SOFTWARE.
#define NUM_STREAMS 2
__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
// Kernel loop designed to execute very slowly... ... ... so we can test timing-related behavior below
if(tx == 0){
for(int i = 0; i<num;i++){
@@ -27,7 +27,7 @@ THE SOFTWARE.
#define SIZE 64<<2
__global__ void getSqAbs(hipLaunchParm lp, float *A, float *B, float *C){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
C[tx] = hipCsqabsf(make_hipFloatComplex(A[tx], B[tx]));
}
@@ -16,13 +16,13 @@
__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In)
{
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
memcpy(Out + tx, In + tx, sizeof(uint32_t));
}
__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
{
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
memset(ptr + tx, val, sizeof(uint32_t));
}
@@ -35,7 +35,7 @@ THE SOFTWARE.
__global__ void floatMath(hipLaunchParm lp, float *In, float *Out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Out[tid] = __cosf(In[tid]);
Out[tid] = __exp10f(Out[tid]);
Out[tid] = __expf(Out[tid]);
@@ -217,7 +217,7 @@ int computeGold(int *gpuData, const int len)
__global__ void testKernel(hipLaunchParm lp,int *g_odata)
{
// access thread id
const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int tid = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
// Test various atomic instructions
@@ -32,82 +32,82 @@ THE SOFTWARE.
#define SIZE N*sizeof(float)
__global__ void test_sincosf(hipLaunchParm lp, float* a, float* b, float *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
sincosf(a[tid], b+tid, c+tid);
}
__global__ void test_sincospif(hipLaunchParm lp, float* a, float* b, float *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
sincospif(a[tid], b+tid, c+tid);
}
__global__ void test_fdividef(hipLaunchParm lp, float *a, float* b, float *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
c[tid] = fdividef(a[tid], b[tid]);
}
__global__ void test_llrintf(hipLaunchParm lp, float *a, long long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = llrintf(a[tid]);
}
__global__ void test_lrintf(hipLaunchParm lp, float *a, long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = lrintf(a[tid]);
}
__global__ void test_rintf(hipLaunchParm lp, float *a, float *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = rintf(a[tid]);
}
__global__ void test_llroundf(hipLaunchParm lp, float *a, long long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = llroundf(a[tid]);
}
__global__ void test_lroundf(hipLaunchParm lp, float *a, long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = lroundf(a[tid]);
}
__global__ void test_rhypotf(hipLaunchParm lp, float *a, float* b, float *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
c[tid] = rhypotf(a[tid], b[tid]);
}
__global__ void test_norm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
d[tid] = norm3df(a[tid], b[tid], c[tid]);
}
__global__ void test_norm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
e[tid] = norm4df(a[tid], b[tid], c[tid], d[tid]);
}
__global__ void test_normf(hipLaunchParm lp, float *a, float *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = normf(N, a);
}
__global__ void test_rnorm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
d[tid] = rnorm3df(a[tid], b[tid], c[tid]);
}
__global__ void test_rnorm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
e[tid] = rnorm4df(a[tid], b[tid], c[tid], d[tid]);
}
__global__ void test_rnormf(hipLaunchParm lp, float *a, float *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = rnormf(N, a);
}
__global__ void test_erfinvf(hipLaunchParm lp, float *a, float *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = erff(erfinvf(a[tid]));
}
@@ -32,72 +32,72 @@ THE SOFTWARE.
#define SIZE N*sizeof(double)
__global__ void test_sincos(hipLaunchParm lp, double* a, double* b, double *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
sincos(a[tid], b+tid, c+tid);
}
__global__ void test_sincospi(hipLaunchParm lp, double* a, double* b, double *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
sincospi(a[tid], b+tid, c+tid);
}
__global__ void test_llrint(hipLaunchParm lp, double *a, long long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = llrint(a[tid]);
}
__global__ void test_lrint(hipLaunchParm lp, double *a, long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = lrint(a[tid]);
}
__global__ void test_rint(hipLaunchParm lp, double *a, double *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = rint(a[tid]);
}
__global__ void test_llround(hipLaunchParm lp, double *a, long long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = llround(a[tid]);
}
__global__ void test_lround(hipLaunchParm lp, double *a, long int *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = lround(a[tid]);
}
__global__ void test_rhypot(hipLaunchParm lp, double *a, double* b, double *c){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
c[tid] = rhypot(a[tid], b[tid]);
}
__global__ void test_norm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
d[tid] = norm3d(a[tid], b[tid], c[tid]);
}
__global__ void test_norm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
e[tid] = norm4d(a[tid], b[tid], c[tid], d[tid]);
}
__global__ void test_rnorm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
d[tid] = rnorm3d(a[tid], b[tid], c[tid]);
}
__global__ void test_rnorm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
e[tid] = rnorm4d(a[tid], b[tid], c[tid], d[tid]);
}
__global__ void test_rnorm(hipLaunchParm lp, double *a, double *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = rnorm(N, a);
}
__global__ void test_erfinv(hipLaunchParm lp, double *a, double *b){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
b[tid] = erf(erfinv(a[tid]));
}
@@ -36,7 +36,7 @@ __device__ int globalOut[NUM];
__global__ void Assign(hipLaunchParm lp, int* Out)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Out[tid] = globalIn[tid];
globalOut[tid] = globalIn[tid];
}
@@ -29,7 +29,7 @@ THE SOFTWARE.
#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__
__global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
__half a = A[tx];
__half b = B[tx];
__half c = C[tx];
@@ -45,7 +45,7 @@ __global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
}
__global__ void __half2Math(hipLaunchParm lp, __half2 *A, __half2 *B, __half2 *C) {
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
__half2 a = A[tx];
__half2 b = B[tx];
__half2 c = C[tx];
@@ -33,7 +33,7 @@ THE SOFTWARE.
__global__ void vAdd(hipLaunchParm lp, float *In1, float *In2, float *In3, float *In4, float *Out)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
In4[tid] = In1[tid] + In2[tid];
__threadfence();
In3[tid] = In3[tid] + In4[tid];
@@ -37,9 +37,9 @@ __global__ void
warpvote(hipLaunchParm lp, int* device_any, int* device_all , int Num_Warps_per_Block, int pshift)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
device_any[threadIdx.x>>pshift] = __any(tid -77);
device_all[threadIdx.x>>pshift] = __all(tid -77);
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
device_any[hipThreadIdx_x>>pshift] = __any(tid -77);
device_all[hipThreadIdx_x>>pshift] = __all(tid -77);
}
int main(int argc, char *argv[])
@@ -34,12 +34,12 @@ __global__ void
gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
const unsigned int warp_num = threadIdx.x >> pshift;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
const unsigned int warp_num = hipThreadIdx_x >> pshift;
#ifdef __HIP_PLATFORM_HCC__
atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
#else
atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
#endif
}
@@ -72,8 +72,8 @@ HIP_kernel(hipLaunchParm lp,
unsigned int* a, unsigned int* b,unsigned long long int* c, unsigned long long int* d, int width, int height)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
@@ -83,8 +83,8 @@ HIP_kernel(hipLaunchParm lp,
unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
@@ -73,8 +73,8 @@ HIP_kernel(hipLaunchParm lp,
int width, int height)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
@@ -64,8 +64,8 @@ HIP_kernel(hipLaunchParm lp,
unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
@@ -57,8 +57,8 @@ vectoradd_float(hipLaunchParm lp,
T* a, const T* bm, int width, int height)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
@@ -45,8 +45,8 @@ vectoradd_char1(hipLaunchParm lp,
char1* a, const char1* bm, const char1* cm, int width, int height)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
@@ -59,8 +59,8 @@ vectoradd_char2(hipLaunchParm lp,
char2* a, const char2* bm, const char2* cm, int width, int height)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
@@ -73,8 +73,8 @@ vectoradd_char3(hipLaunchParm lp,
char3* a, const char3* bm, const char3* cm, int width, int height)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
@@ -86,8 +86,8 @@ vectoradd_char4(hipLaunchParm lp,
char4* a, const char4* bm, const char4* cm, int width, int height)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
@@ -100,8 +100,8 @@ vectoradd_char4(hipLaunchParm lp,
__kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) {
int x = blockDimX * blockIdx.x + threadIdx.x;
int y = blockDimY * blockIdy.y + threadIdx.y;
int x = blockDimX * hipBlockIdx_x + hipThreadIdx_x;
int y = blockDimY * blockIdy.y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
@@ -36,7 +36,7 @@ THE SOFTWARE.
#define SIZE LEN<<2
__global__ void kernel_trig(hipLaunchParm lp, float *In, float *sin_d, float *cos_d, float *tan_d, float *sin_pd, float *cos_pd){
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
sin_d[tid] = __sinf(In[tid]);
cos_d[tid] = __cosf(In[tid]);
tan_d[tid] = __tanf(In[tid]);
@@ -29,7 +29,7 @@ THE SOFTWARE.
__global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd, size_t len)
{
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
if(tx < len)
{
Cd[tx] = Ad[tx] + Bd[tx];
@@ -30,7 +30,7 @@ THE SOFTWARE.
__global__ void Kern(hipLaunchParm lp, float *A)
{
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
A[tx] += 1.0f;
}
@@ -33,7 +33,7 @@ class memManager;
template<typename T>
__global__ void Add(hipLaunchParm lp, T* Ad, T* Bd, T* Cd, size_t Len)
{
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
if(tx < Len)
{
Cd[tx] = Ad[tx] + Bd[tx];
+1 -1
Προβολή Αρχείου
@@ -34,7 +34,7 @@ THE SOFTWARE.
#define SIZE 1024*1024*sizeof(int)
__global__ void Iter(hipLaunchParm lp, int *Ad){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
if(tx == 0){
for(int i=0;i<ITER;i++){
Ad[tx] += 1;
+1 -1
Προβολή Αρχείου
@@ -29,7 +29,7 @@ THE SOFTWARE.
#define SIZE 1024*1024*sizeof(int)
__global__ void Iter(hipLaunchParm lp, int *Ad){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
if(tx == 0){
for(int i=0;i<ITER;i++){
Ad[tx] += 1;
+1 -1
Προβολή Αρχείου
@@ -2,7 +2,7 @@
#include "hip/hip_runtime_api.h"
__global__ void Kernel(hipLaunchParm lp, float *Ad){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Ad[tx] += Ad[tx-1];
}
@@ -40,8 +40,8 @@ __global__ void testExternSharedKernel(hipLaunchParm lp, const T* A_d, const T*
T *sdata = reinterpret_cast<T *>(my_sdata);
#endif
size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
size_t tid = threadIdx.x;
size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t tid = hipThreadIdx_x;
// initialize dynamic shared memory
if (tid < groupElements) {
@@ -34,7 +34,7 @@ THE SOFTWARE.
__global__ void vectorAdd(hipLaunchParm lp, float *Ad, float *Bd) {
HIP_DYNAMIC_SHARED(float, sBd);
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
for(int i=0;i<LEN/64;i++) {
sBd[tx + i * 64] = Ad[tx + i * 64] + 1.0f;
Bd[tx + i * 64] = sBd[tx + i * 64];
@@ -48,8 +48,8 @@ vectorADD2( hipLaunchParm lp,
T *C_d,
size_t N)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i=offset; i<N; i+=stride) {
double foo = __hiloint2double(A_d[i], B_d[i]);
@@ -62,7 +62,7 @@ __global__ void MyKernel (const hipLaunchParm lp, const float *a, const float *b
{
//KERNELBEGIN;
unsigned gid = threadIdx.x;
unsigned gid = hipThreadIdx_x;
if (gid < N) {
c[gid] = a[gid] + PlusOne(b[gid]);
}
@@ -96,7 +96,7 @@ vectorADD(const hipLaunchParm lp,
int zuzu = deviceVar + 1;
int b = threadIdx.x;
int b = hipThreadIdx_x;
int c;
#ifdef NOT_YET
int a = __shfl_up(x, 1);
@@ -119,8 +119,8 @@ vectorADD(const hipLaunchParm lp,
__syncthreads();
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i=offset; i<N; i+=stride) {
C_d[i] = A_d[i] + B_d[i];
@@ -38,7 +38,7 @@ __constant__ int Value[LEN];
__global__ void Get(hipLaunchParm lp, int *Ad)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Ad[tid] = Value[tid];
}
@@ -33,12 +33,12 @@ THE SOFTWARE.
#define SIZE NUM * 8
__global__ void Alloc(hipLaunchParm lp, uint64_t *Ptr) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Ptr[tid] = (uint64_t)malloc(128);
}
__global__ void Free(hipLaunchParm lp, uint64_t *Ptr) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
free((void*)Ptr[tid]);
}
@@ -35,52 +35,52 @@ THE SOFTWARE.
#define LEN12 12 * 4
__global__ void MemCpy8(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memcpy(Out + tid*8, In + tid*8, 8);
}
__global__ void MemCpy9(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memcpy(Out + tid*9, In + tid*9, 9);
}
__global__ void MemCpy10(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memcpy(Out + tid*10, In + tid*10, 10);
}
__global__ void MemCpy11(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memcpy(Out + tid*11, In + tid*11, 11);
}
__global__ void MemCpy12(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memcpy(Out + tid*12, In + tid*12, 12);
}
__global__ void MemSet8(hipLaunchParm lp, uint8_t *In) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memset(In + tid*8, 1, 8);
}
__global__ void MemSet9(hipLaunchParm lp, uint8_t *In) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memset(In + tid*9, 1, 9);
}
__global__ void MemSet10(hipLaunchParm lp, uint8_t *In) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memset(In + tid*10, 1, 10);
}
__global__ void MemSet11(hipLaunchParm lp, uint8_t *In) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memset(In + tid*11, 1, 11);
}
__global__ void MemSet12(hipLaunchParm lp, uint8_t *In) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
memset(In + tid*12, 1, 12);
}
@@ -37,7 +37,7 @@ __global__ void vadd_asm(hipLaunchParm lp,
float *out,
float *in)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
#ifdef __HIP_PLATFORM_NVCC__
asm volatile("add.f32 %0,%1,%2;":"=f"(out[i]):"f"(in[i]),"f"(out[i]));
@@ -35,7 +35,7 @@ __global__ void vmac_asm(hipLaunchParm lp,
float *out,
float *in)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i]));
}
@@ -33,7 +33,7 @@ void
__launch_bounds__(256, 2)
myKern(hipLaunchParm lp, int *C, const int *A, int N, int xfactor)
{
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int tid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
if (tid < N) {
C[tid] = A[tid];
@@ -34,7 +34,7 @@ THE SOFTWARE.
#define NUM_STREAMS 2
__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
// Kernel loop designed to execute very slowly... ... ... so we can test timing-related behavior below
if(tx == 0){
for(int i = 0; i<num;i++){
@@ -33,7 +33,7 @@ THE SOFTWARE.
#define SIZE LEN*sizeof(float)
__global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Cd[tx] = Ad[tx] + Bd[tx];
}
@@ -33,13 +33,13 @@
#define SIZE LEN*sizeof(float)
__global__ void Add(float *Ad, float *Bd, float *Cd){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Cd[tx] = Ad[tx] + Bd[tx];
}
__global__ void Set(int *Ad, int val){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Ad[tx] = val;
}
@@ -29,7 +29,7 @@ THE SOFTWARE.
#include<malloc.h>
__global__ void Inc(hipLaunchParm lp, float *Ad){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Ad[tx] = Ad[tx] + float(1);
}
@@ -70,8 +70,8 @@ template<typename T>
__global__ void
addK (hipLaunchParm lp, T *A, T K, size_t numElements)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i=offset; i<numElements; i+=stride) {
A[i] = A[i] + K;
@@ -31,7 +31,7 @@ THE SOFTWARE.
__global__ void Kernel(hipLaunchParm lp,volatile float* hostRes)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
hostRes[tid] = tid + 1;
__threadfence_system();
// expecting that the data is getting flushed to host here!
@@ -63,8 +63,8 @@ int enablePeers(int dev0, int dev1)
__global__ void
memsetIntKernel(int * ptr, const int val, size_t numElements)
{
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
int stride = blockDim.x * gridDim.x ;
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i= gid; i< numElements; i+=stride){
ptr[i] = val;
}
@@ -73,8 +73,8 @@ memsetIntKernel(int * ptr, const int val, size_t numElements)
__global__ void
memcpyIntKernel(const int * src, int* dst, size_t numElements)
{
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
int stride = blockDim.x * gridDim.x ;
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i= gid; i< numElements; i+=stride){
dst[i] = src[i];
}
@@ -34,7 +34,7 @@ THE SOFTWARE.
#define kernel_name "hello_world"
__global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
Bd[tx] = Ad[tx];
}
@@ -24,7 +24,7 @@ THE SOFTWARE.
extern "C" __global__ void hello_world(hipLaunchParm lp, float *a, float *b)
{
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
b[tx] = a[tx];
}
@@ -35,7 +35,7 @@ THE SOFTWARE.
template<typename T>
__global__ void Inc(hipLaunchParm lp, T *Array){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
Array[tx] = Array[tx] + T(1);
}
@@ -29,7 +29,7 @@ THE SOFTWARE.
const int NN = 1 << 21;
__global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
if(tid < 1){
for(int i=0;i<n;i++){
x[i] = sqrt(powf(3.14159,i));
@@ -39,7 +39,7 @@ __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
}
__global__ void nKernel(hipLaunchParm lp, float *y){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
y[tid] = y[tid] + 1.0f;
}
@@ -31,7 +31,7 @@ THE SOFTWARE.
const int NN = 1 << 21;
__global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
if(tid < 1){
for(int i=0;i<n;i++){
x[i] = sqrt(powf(3.14159,i));
@@ -41,7 +41,7 @@ __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
}
__global__ void nKernel(hipLaunchParm lp, float *y){
int tid = threadIdx.x;
int tid = hipThreadIdx_x;
y[tid] = y[tid] + 1.0f;
}
@@ -41,8 +41,8 @@ vectorADDRepeat(hipLaunchParm lp,
size_t NELEM,
int repeat)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
for (int j=1; j<=repeat;j++) {
for (size_t i=offset; i<NELEM; i+=stride) {
@@ -73,7 +73,7 @@ void D2H(T *Dst, T *Src, size_t size){
template<typename T>
__global__ void Inc(hipLaunchParm lp, T *In){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
In[tx] = In[tx] + 1;
}
@@ -102,8 +102,8 @@ MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel");
__global__ void
memsetIntKernel(int * ptr, const int val, size_t numElements)
{
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
int stride = blockDim.x * gridDim.x ;
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i= gid; i< numElements; i+=stride){
ptr[i] = val;
}
@@ -112,8 +112,8 @@ memsetIntKernel(int * ptr, const int val, size_t numElements)
__global__ void
memcpyIntKernel(int *dst, const int * src, size_t numElements)
{
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
int stride = blockDim.x * gridDim.x ;
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i= gid; i< numElements; i+=stride){
dst[i] = src[i];
}
@@ -5,8 +5,8 @@
extern "C" __global__ void
memcpyIntKernel(hipLaunchParm lp, int *dst, const int * src, size_t numElements)
{
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
int stride = blockDim.x * gridDim.x ;
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i= gid; i< numElements; i+=stride){
dst[i] = src[i];
}
+1 -1
Προβολή Αρχείου
@@ -23,7 +23,7 @@ THE SOFTWARE.
void __global__
test_kernel(float *A)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
float a = __ballot(tid < 16);
float b = __shfl(tid < 16);
@@ -30,7 +30,7 @@ THE SOFTWARE.
#define ITER 1<<10
__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
if(tx == 0){
for(int i = 0; i<num;i++){
Ad[tx] += 1;
@@ -26,7 +26,7 @@ THE SOFTWARE.
static size_t size[NUM_SIZE];
__global__ void Add(hipLaunchParm lp, int *Ad){
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
Ad[tx] = Ad[tx] + tx;
}
@@ -26,7 +26,7 @@ THE SOFTWARE.
static size_t size[NUM_SIZE];
__global__ void Add(hipLaunchParm lp, int *Ad){
int tx = threadIdx.x;
int tx = hipThreadIdx_x;
Ad[tx] = Ad[tx] + tx;
}
@@ -30,7 +30,7 @@ THE SOFTWARE.
#define ITER 1<<10
__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
if(tx == 0){
for(int i = 0; i<num;i++){
Ad[tx] += 1;
+10 -10
Προβολή Αρχείου
@@ -136,8 +136,8 @@ vectorADD(hipLaunchParm lp,
T *C_d,
size_t NELEM)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i=offset; i<NELEM; i+=stride) {
C_d[i] = A_d[i] + B_d[i];
@@ -153,8 +153,8 @@ vectorADDReverse(hipLaunchParm lp,
T *C_d,
size_t NELEM)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
C_d[i] = A_d[i] + B_d[i];
@@ -169,8 +169,8 @@ addCount( const T *A_d,
size_t NELEM,
int count)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
// Deliberately do this in an inefficient way to increase kernel runtime
for (int i=0; i<count; i++) {
@@ -188,8 +188,8 @@ addCountReverse( const T *A_d,
int64_t NELEM,
int count)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
// Deliberately do this in an inefficient way to increase kernel runtime
for (int i=0; i<count; i++) {
@@ -205,8 +205,8 @@ __global__ void
memsetReverse( T *C_d, T val,
int64_t NELEM)
{
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
C_d[i] = val;
@@ -17,8 +17,8 @@ __global__ void tex2DKernel(float* outputData,
int width,
int height)
{
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y;
outputData[y*width + x] = tex2D<float>(textureObject, x, y);
}
@@ -20,8 +20,8 @@ __global__ void tex2DKernel(float* outputData,
int width,
int height)
{
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y;
#ifdef __HIP_PLATFORM_HCC__
outputData[y*width + x] = tex2D(tex, textureObject, x, y);
#else