This implements the trivial change needed to move back from the hip{Something}_{x, y, z} macros to the natural CUDA syntax of Something.{x, y, z}. This is contained in lines 384-404 in hip_runtime.h. All of the other changes have to do with changing unit tests to use this syntax. The macros are retained for backwards compatibility.

Этот коммит содержится в:
Alex Voicu
2017-11-19 01:54:12 +00:00
родитель 85975e719d
Коммит cffd0e14eb
60 изменённых файлов: 270 добавлений и 248 удалений
+21
Просмотреть файл
@@ -381,6 +381,27 @@ __device__ void __threadfence_system(void) ;
* @}
*/
template<typename std::common_type<
decltype(hc_get_group_id),
decltype(hc_get_group_size),
decltype(hc_get_num_groups),
decltype(hc_get_workitem_id)>::type f>
class Coordinates {
using R = decltype(f(0));
struct X { __device__ operator R() const { return f(0); } };
struct Y { __device__ operator R() const { return f(1); } };
struct Z { __device__ operator R() const { return f(2); } };
public:
static constexpr X x{};
static constexpr Y y{};
static constexpr Z z{};
};
static constexpr Coordinates<hc_get_group_size> blockDim;
static constexpr Coordinates<hc_get_group_id> blockIdx;
static constexpr Coordinates<hc_get_num_groups> gridDim;
static constexpr Coordinates<hc_get_workitem_id> threadIdx;
#define hipThreadIdx_x (hc_get_workitem_id(0))
#define hipThreadIdx_y (hc_get_workitem_id(1))
+6 -6
Просмотреть файл
@@ -45,8 +45,8 @@ __device__ void *__hip_hc_malloc(size_t size)
{
return (void*)nullptr;
}
uint32_t totalThreads = hipBlockDim_x * hipGridDim_x * hipBlockDim_y * hipGridDim_y * hipBlockDim_z * hipGridDim_z;
uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x;
uint32_t totalThreads = blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z;
uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x;
uint32_t numHeapsPerWorkItem = NUM_PAGES / totalThreads;
uint32_t heapSizePerWorkItem = SIZE_OF_HEAP / totalThreads;
@@ -932,7 +932,7 @@ __device__ unsigned long long int atomicMax(unsigned long long int* address,
template<typename T>
__device__ T atomicCAS_impl(T* address, T compare, T val)
{
// the implementation assumes the atomic is lock-free and
// the implementation assumes the atomic is lock-free and
// has the same size as the non-atmoic equivalent type
static_assert(sizeof(T) == sizeof(std::atomic<T>)
, "size mismatch between atomic and non-atomic types");
@@ -945,7 +945,7 @@ __device__ T atomicCAS_impl(T* address, T compare, T val)
T expected = compare;
// hcc should generate a system scope atomic CAS
// hcc should generate a system scope atomic CAS
std::atomic_compare_exchange_weak_explicit(u.atomic_address
, &expected, val
, std::memory_order_acq_rel
@@ -1110,8 +1110,8 @@ __device__ void* __get_dynamicgroupbaseptr() {
return hc::get_dynamic_group_segment_base_pointer();
}
__host__ void* __get_dynamicgroupbaseptr() {
return nullptr;
__host__ void* __get_dynamicgroupbaseptr() {
return nullptr;
}
// Precise Math Functions
+1 -1
Просмотреть файл
@@ -34,7 +34,7 @@ THE SOFTWARE.
#define NUM_STREAMS 2
__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
// Kernel loop designed to execute very slowly... ... ... so we can test timing-related behavior below
if(tx == 0){
for(int i = 0; i<num;i++){
+1 -1
Просмотреть файл
@@ -27,7 +27,7 @@ THE SOFTWARE.
#define SIZE 64<<2
__global__ void getSqAbs(hipLaunchParm lp, float *A, float *B, float *C){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
C[tx] = hipCsqabsf(make_hipFloatComplex(A[tx], B[tx]));
}
+3 -3
Просмотреть файл
@@ -16,13 +16,13 @@
__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In)
{
int tx = hipThreadIdx_x;
int tx = threadIdx.x;
memcpy(Out + tx, In + tx, sizeof(uint32_t));
}
__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
{
int tx = hipThreadIdx_x;
int tx = threadIdx.x;
memset(ptr + tx, val, (sizeof(uint32_t)*(size/LEN)));
}
@@ -58,6 +58,6 @@ int main()
return 0;
}
}
passed();
}
+1 -1
Просмотреть файл
@@ -35,7 +35,7 @@ THE SOFTWARE.
__global__ void floatMath(hipLaunchParm lp, float *In, float *Out) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
Out[tid] = __cosf(In[tid]);
Out[tid] = __exp10f(Out[tid]);
Out[tid] = __expf(Out[tid]);
+2 -2
Просмотреть файл
@@ -217,7 +217,7 @@ int computeGold(int *gpuData, const int len)
__global__ void testKernel(hipLaunchParm lp,int *g_odata)
{
// access thread id
const unsigned int tid = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
// Test various atomic instructions
@@ -240,7 +240,7 @@ __global__ void testKernel(hipLaunchParm lp,int *g_odata)
// Atomic increment (modulo 17+1)
atomicInc((unsigned int *)&g_odata[5], 17);
// Atomic decrement
atomicDec((unsigned int *)&g_odata[6], 137);
+16 -16
Просмотреть файл
@@ -32,82 +32,82 @@ THE SOFTWARE.
#define SIZE N*sizeof(float)
__global__ void test_sincosf(hipLaunchParm lp, float* a, float* b, float *c){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
sincosf(a[tid], b+tid, c+tid);
}
__global__ void test_sincospif(hipLaunchParm lp, float* a, float* b, float *c){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
sincospif(a[tid], b+tid, c+tid);
}
__global__ void test_fdividef(hipLaunchParm lp, float *a, float* b, float *c){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
c[tid] = fdividef(a[tid], b[tid]);
}
__global__ void test_llrintf(hipLaunchParm lp, float *a, long long int *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = llrintf(a[tid]);
}
__global__ void test_lrintf(hipLaunchParm lp, float *a, long int *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = lrintf(a[tid]);
}
__global__ void test_rintf(hipLaunchParm lp, float *a, float *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = rintf(a[tid]);
}
__global__ void test_llroundf(hipLaunchParm lp, float *a, long long int *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = llroundf(a[tid]);
}
__global__ void test_lroundf(hipLaunchParm lp, float *a, long int *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = lroundf(a[tid]);
}
__global__ void test_rhypotf(hipLaunchParm lp, float *a, float* b, float *c){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
c[tid] = rhypotf(a[tid], b[tid]);
}
__global__ void test_norm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
d[tid] = norm3df(a[tid], b[tid], c[tid]);
}
__global__ void test_norm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
e[tid] = norm4df(a[tid], b[tid], c[tid], d[tid]);
}
__global__ void test_normf(hipLaunchParm lp, float *a, float *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = normf(N, a);
}
__global__ void test_rnorm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
d[tid] = rnorm3df(a[tid], b[tid], c[tid]);
}
__global__ void test_rnorm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
e[tid] = rnorm4df(a[tid], b[tid], c[tid], d[tid]);
}
__global__ void test_rnormf(hipLaunchParm lp, float *a, float *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = rnormf(N, a);
}
__global__ void test_erfinvf(hipLaunchParm lp, float *a, float *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = erff(erfinvf(a[tid]));
}
+14 -14
Просмотреть файл
@@ -32,72 +32,72 @@ THE SOFTWARE.
#define SIZE N*sizeof(double)
__global__ void test_sincos(hipLaunchParm lp, double* a, double* b, double *c){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
sincos(a[tid], b+tid, c+tid);
}
__global__ void test_sincospi(hipLaunchParm lp, double* a, double* b, double *c){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
sincospi(a[tid], b+tid, c+tid);
}
__global__ void test_llrint(hipLaunchParm lp, double *a, long long int *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = llrint(a[tid]);
}
__global__ void test_lrint(hipLaunchParm lp, double *a, long int *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = lrint(a[tid]);
}
__global__ void test_rint(hipLaunchParm lp, double *a, double *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = rint(a[tid]);
}
__global__ void test_llround(hipLaunchParm lp, double *a, long long int *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = llround(a[tid]);
}
__global__ void test_lround(hipLaunchParm lp, double *a, long int *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = lround(a[tid]);
}
__global__ void test_rhypot(hipLaunchParm lp, double *a, double* b, double *c){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
c[tid] = rhypot(a[tid], b[tid]);
}
__global__ void test_norm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
d[tid] = norm3d(a[tid], b[tid], c[tid]);
}
__global__ void test_norm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
e[tid] = norm4d(a[tid], b[tid], c[tid], d[tid]);
}
__global__ void test_rnorm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
d[tid] = rnorm3d(a[tid], b[tid], c[tid]);
}
__global__ void test_rnorm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
e[tid] = rnorm4d(a[tid], b[tid], c[tid], d[tid]);
}
__global__ void test_rnorm(hipLaunchParm lp, double *a, double *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = rnorm(N, a);
}
__global__ void test_erfinv(hipLaunchParm lp, double *a, double *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = erf(erfinv(a[tid]));
}
+1 -1
Просмотреть файл
@@ -36,7 +36,7 @@ __device__ int globalOut[NUM];
__global__ void Assign(hipLaunchParm lp, int* Out)
{
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
Out[tid] = globalIn[tid];
globalOut[tid] = globalIn[tid];
}
+2 -2
Просмотреть файл
@@ -28,7 +28,7 @@ THE SOFTWARE.
#if __HIP_ARCH_GFX803__ > 0
__global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
int tx = hipThreadIdx_x;
int tx = threadIdx.x;
__half a = A[tx];
__half b = B[tx];
__half c = C[tx];
@@ -44,7 +44,7 @@ __global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
}
__global__ void __half2Math(hipLaunchParm lp, __half2 *A, __half2 *B, __half2 *C) {
int tx = hipThreadIdx_x;
int tx = threadIdx.x;
__half2 a = A[tx];
__half2 b = B[tx];
__half2 c = C[tx];
+1 -1
Просмотреть файл
@@ -33,7 +33,7 @@ THE SOFTWARE.
__global__ void vAdd(hipLaunchParm lp, float *In1, float *In2, float *In3, float *In4, float *Out)
{
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
In4[tid] = In1[tid] + In2[tid];
__threadfence();
In3[tid] = In3[tid] + In4[tid];
+4 -4
Просмотреть файл
@@ -37,9 +37,9 @@ __global__ void
warpvote(hipLaunchParm lp, int* device_any, int* device_all , int Num_Warps_per_Block, int pshift)
{
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
device_any[hipThreadIdx_x>>pshift] = __any(tid -77);
device_all[hipThreadIdx_x>>pshift] = __all(tid -77);
int tid = threadIdx.x + blockIdx.x * blockDim.x;
device_any[threadIdx.x>>pshift] = __any(tid -77);
device_all[threadIdx.x>>pshift] = __all(tid -77);
}
int main(int argc, char *argv[])
@@ -49,7 +49,7 @@ int main(int argc, char *argv[])
warpSize = devProp.warpSize;
int w = warpSize;
pshift = 0;
pshift = 0;
while (w >>= 1) ++pshift;
printf ("warpSize=%d pshift=%d\n", warpSize, pshift);
+4 -4
Просмотреть файл
@@ -34,12 +34,12 @@ __global__ void
gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift)
{
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
const unsigned int warp_num = hipThreadIdx_x >> pshift;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
const unsigned int warp_num = threadIdx.x >> pshift;
#ifdef __HIP_PLATFORM_HCC__
atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
#else
atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
#endif
}
+2 -2
Просмотреть файл
@@ -72,8 +72,8 @@ HIP_kernel(hipLaunchParm lp,
unsigned int* a, unsigned int* b,unsigned long long int* c, unsigned long long int* d, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
+2 -2
Просмотреть файл
@@ -83,8 +83,8 @@ HIP_kernel(hipLaunchParm lp,
unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
+2 -2
Просмотреть файл
@@ -73,8 +73,8 @@ HIP_kernel(hipLaunchParm lp,
int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
+2 -2
Просмотреть файл
@@ -64,8 +64,8 @@ HIP_kernel(hipLaunchParm lp,
unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
+2 -2
Просмотреть файл
@@ -57,8 +57,8 @@ vectoradd_float(hipLaunchParm lp,
T* a, const T* bm, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
+35 -35
Просмотреть файл
@@ -40,13 +40,13 @@ THE SOFTWARE.
#define THREADS_PER_BLOCK_Z 1
__global__ void
__global__ void
vectoradd_char1(hipLaunchParm lp,
char1* a, const char1* bm, const char1* cm, int width, int height)
char1* a, const char1* bm, const char1* cm, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
@@ -54,40 +54,40 @@ vectoradd_char1(hipLaunchParm lp,
}
}
__global__ void
__global__ void
vectoradd_char2(hipLaunchParm lp,
char2* a, const char2* bm, const char2* cm, int width, int height)
char2* a, const char2* bm, const char2* cm, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
a[i] = make_char2(bm[i].x, bm[i].y) + make_char2(cm[i].x, cm[i].y);
}
}
}
__global__ void
__global__ void
vectoradd_char3(hipLaunchParm lp,
char3* a, const char3* bm, const char3* cm, int width, int height)
char3* a, const char3* bm, const char3* cm, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
a[i] = make_char3(bm[i].x, bm[i].y, bm[i].z) + make_char3(cm[i].x, cm[i].y, cm[i].z);
}
}
__global__ void
__global__ void
vectoradd_char4(hipLaunchParm lp,
char4* a, const char4* bm, const char4* cm, int width, int height)
char4* a, const char4* bm, const char4* cm, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
@@ -99,7 +99,7 @@ vectoradd_char4(hipLaunchParm lp,
#if 0
__kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) {
int x = blockDimX * blockIdx.x + threadIdx.x;
int y = blockDimY * blockIdy.y + threadIdx.y;
@@ -128,21 +128,21 @@ bool dataTypesRun(){
hostA = (T*)malloc(NUM * sizeof(T));
hostB = (T*)malloc(NUM * sizeof(T));
hostC = (T*)malloc(NUM * sizeof(T));
// initialize the input data
for (i = 0; i < NUM; i++) {
hostB[i] = (T)i;
hostC[i] = (T)i;
}
HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T)));
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice));
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
@@ -192,21 +192,21 @@ bool dataTypesRun(){
hostA = (T*)malloc(NUM * sizeof(T));
hostB = (T*)malloc(NUM * sizeof(T));
hostC = (T*)malloc(NUM * sizeof(T));
// initialize the input data
for (i = 0; i < NUM; i++) {
hostB[i] = (T)i;
hostC[i] = (T)i;
}
HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T)));
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice));
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
@@ -256,21 +256,21 @@ bool dataTypesRun(){
hostA = (T*)malloc(NUM * sizeof(T));
hostB = (T*)malloc(NUM * sizeof(T));
hostC = (T*)malloc(NUM * sizeof(T));
// initialize the input data
for (i = 0; i < NUM; i++) {
hostB[i] = (T)i;
hostC[i] = (T)i;
}
HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T)));
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice));
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
@@ -319,21 +319,21 @@ bool dataTypesRunChar4(){
hostA = (T*)malloc(NUM * sizeof(T));
hostB = (T*)malloc(NUM * sizeof(T));
hostC = (T*)malloc(NUM * sizeof(T));
// initialize the input data
for (i = 0; i < NUM; i++) {
hostB[i] = (T)i;
hostC[i] = (T)i;
}
HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T)));
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice));
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
@@ -368,7 +368,7 @@ bool dataTypesRunChar4(){
}
int main() {
hipDeviceProp_t devProp;
hipGetDeviceProperties(&devProp, 0);
cout << " System minor " << devProp.minor << endl;
+1 -1
Просмотреть файл
@@ -36,7 +36,7 @@ THE SOFTWARE.
#define SIZE LEN<<2
__global__ void kernel_trig(hipLaunchParm lp, float *In, float *sin_d, float *cos_d, float *tan_d, float *sin_pd, float *cos_pd){
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
sin_d[tid] = __sinf(In[tid]);
cos_d[tid] = __cosf(In[tid]);
tan_d[tid] = __tanf(In[tid]);
+1 -1
Просмотреть файл
@@ -29,7 +29,7 @@ THE SOFTWARE.
__global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd, size_t len)
{
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
if(tx < len)
{
Cd[tx] = Ad[tx] + Bd[tx];
+1 -1
Просмотреть файл
@@ -30,7 +30,7 @@ THE SOFTWARE.
__global__ void Kern(hipLaunchParm lp, float *A)
{
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
A[tx] += 1.0f;
}
+1 -1
Просмотреть файл
@@ -33,7 +33,7 @@ class memManager;
template<typename T>
__global__ void Add(hipLaunchParm lp, T* Ad, T* Bd, T* Cd, size_t Len)
{
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
if(tx < Len)
{
Cd[tx] = Ad[tx] + Bd[tx];
+1 -1
Просмотреть файл
@@ -34,7 +34,7 @@ THE SOFTWARE.
#define SIZE 1024*1024*sizeof(int)
__global__ void Iter(hipLaunchParm lp, int *Ad){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
if(tx == 0){
for(int i=0;i<ITER;i++){
Ad[tx] += 1;
+1 -1
Просмотреть файл
@@ -29,7 +29,7 @@ THE SOFTWARE.
#define SIZE 1024*1024*sizeof(int)
__global__ void Iter(hipLaunchParm lp, int *Ad){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
if(tx == 0){
for(int i=0;i<ITER;i++){
Ad[tx] += 1;
+1 -1
Просмотреть файл
@@ -2,7 +2,7 @@
#include "hip/hip_runtime_api.h"
__global__ void Kernel(hipLaunchParm lp, float *Ad){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
Ad[tx] += Ad[tx-1];
}
+2 -2
Просмотреть файл
@@ -40,8 +40,8 @@ __global__ void testExternSharedKernel(hipLaunchParm lp, const T* A_d, const T*
T *sdata = reinterpret_cast<T *>(my_sdata);
#endif
size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t tid = hipThreadIdx_x;
size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
size_t tid = threadIdx.x;
// initialize dynamic shared memory
if (tid < groupElements) {
+1 -1
Просмотреть файл
@@ -34,7 +34,7 @@ THE SOFTWARE.
__global__ void vectorAdd(hipLaunchParm lp, float *Ad, float *Bd) {
HIP_DYNAMIC_SHARED(float, sBd);
int tx = hipThreadIdx_x;
int tx = threadIdx.x;
for(int i=0;i<LEN/64;i++) {
sBd[tx + i * 64] = Ad[tx + i * 64] + 1.0f;
Bd[tx + i * 64] = sBd[tx + i * 64];
+4 -3
Просмотреть файл
@@ -48,11 +48,12 @@ vectorADD2( hipLaunchParm lp,
T *C_d,
size_t N)
{
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
for (size_t i=offset; i<N; i+=stride) {
C_d[i] = A_d[i] + B_d[i] ;
auto foo = __hiloint2double(A_d[i], B_d[i]);
C_d[i] = __double2loint(foo) + __double2hiint(foo);//A_d[i] + B_d[i] ;
}
}
+4 -4
Просмотреть файл
@@ -62,7 +62,7 @@ __global__ void MyKernel (const hipLaunchParm lp, const float *a, const float *b
{
//KERNELBEGIN;
unsigned gid = hipThreadIdx_x;
unsigned gid = threadIdx.x;
if (gid < N) {
c[gid] = a[gid] + PlusOne(b[gid]);
}
@@ -96,7 +96,7 @@ vectorADD(const hipLaunchParm lp,
int zuzu = deviceVar + 1;
int b = hipThreadIdx_x;
int b = threadIdx.x;
int c;
#ifdef NOT_YET
int a = __shfl_up(x, 1);
@@ -119,8 +119,8 @@ vectorADD(const hipLaunchParm lp,
__syncthreads();
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
for (size_t i=offset; i<N; i+=stride) {
C_d[i] = A_d[i] + B_d[i];
+2 -2
Просмотреть файл
@@ -18,7 +18,7 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* BUILD: %t %s ../test_common.cpp
* RUN: %t
* HIT_END
*/
@@ -38,7 +38,7 @@ __constant__ int Value[LEN];
__global__ void Get(hipLaunchParm lp, int *Ad)
{
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
Ad[tid] = Value[tid];
}
+2 -2
Просмотреть файл
@@ -33,12 +33,12 @@ THE SOFTWARE.
#define SIZE NUM * 8
__global__ void Alloc(hipLaunchParm lp, uint64_t *Ptr) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
Ptr[tid] = (uint64_t)malloc(128);
}
__global__ void Free(hipLaunchParm lp, uint64_t *Ptr) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
free((void*)Ptr[tid]);
}
+10 -10
Просмотреть файл
@@ -35,52 +35,52 @@ THE SOFTWARE.
#define LEN12 12 * 4
__global__ void MemCpy8(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memcpy(Out + tid*8, In + tid*8, 8);
}
__global__ void MemCpy9(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memcpy(Out + tid*9, In + tid*9, 9);
}
__global__ void MemCpy10(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memcpy(Out + tid*10, In + tid*10, 10);
}
__global__ void MemCpy11(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memcpy(Out + tid*11, In + tid*11, 11);
}
__global__ void MemCpy12(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memcpy(Out + tid*12, In + tid*12, 12);
}
__global__ void MemSet8(hipLaunchParm lp, uint8_t *In) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memset(In + tid*8, 1, 8);
}
__global__ void MemSet9(hipLaunchParm lp, uint8_t *In) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memset(In + tid*9, 1, 9);
}
__global__ void MemSet10(hipLaunchParm lp, uint8_t *In) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memset(In + tid*10, 1, 10);
}
__global__ void MemSet11(hipLaunchParm lp, uint8_t *In) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memset(In + tid*11, 1, 11);
}
__global__ void MemSet12(hipLaunchParm lp, uint8_t *In) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memset(In + tid*12, 1, 12);
}
+2 -2
Просмотреть файл
@@ -37,7 +37,7 @@ __global__ void vadd_asm(hipLaunchParm lp,
float *out,
float *in)
{
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int i = blockDim.x * blockIdx.x + threadIdx.x;
asm volatile ("v_add_f32_e32 %0, %1, %2" : "=v" (out[i]) : "v"(in[i]),"v" (out[i]));
}
@@ -49,7 +49,7 @@ void addCPUReference(
{
for(unsigned int j=0; j < NUM; j++)
{
output[j]= input[j] + output[j];
}
}
+2 -2
Просмотреть файл
@@ -35,7 +35,7 @@ __global__ void vmac_asm(hipLaunchParm lp,
float *out,
float *in)
{
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int i = blockDim.x * blockIdx.x + threadIdx.x;
asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i]));
}
@@ -47,7 +47,7 @@ void CPUReference(
{
for(unsigned int j=0; j < NUM; j++)
{
output[j]= a*input[j] + output[j];
}
}
+1 -1
Просмотреть файл
@@ -33,7 +33,7 @@ void
__launch_bounds__(256, 2)
myKern(hipLaunchParm lp, int *C, const int *A, int N, int xfactor)
{
int tid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
if (tid < N) {
C[tid] = A[tid];
+1 -1
Просмотреть файл
@@ -34,7 +34,7 @@ THE SOFTWARE.
#define NUM_STREAMS 2
__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
// Kernel loop designed to execute very slowly... ... ... so we can test timing-related behavior below
if(tx == 0){
for(int i = 0; i<num;i++){
+1 -1
Просмотреть файл
@@ -33,7 +33,7 @@ THE SOFTWARE.
#define SIZE LEN*sizeof(float)
__global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
Cd[tx] = Ad[tx] + Bd[tx];
}
+13 -13
Просмотреть файл
@@ -33,13 +33,13 @@
#define SIZE LEN*sizeof(float)
__global__ void Add(float *Ad, float *Bd, float *Cd){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
Cd[tx] = Ad[tx] + Bd[tx];
}
__global__ void Set(int *Ad, int val){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
Ad[tx] = val;
}
@@ -52,13 +52,13 @@ std::vector<std::string> syncMsg = {"event", "stream", "device"};
void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMethod, std::string msg)
{
std::cerr << "test: CheckHostPointer " << msg
std::cerr << "test: CheckHostPointer " << msg
//<< " HIP_COHERENT_HOST_ALLOC=" << HIP_COHERENT_HOST_ALLOC
//<< " HIP_EVENT_SYS_RELEASE=" << HIP_EVENT_SYS_RELEASE
<< " eventFlags = " << std::hex << eventFlags
<< ((eventFlags & hipEventReleaseToDevice) ? " hipEventReleaseToDevice" : "")
<< ((eventFlags & hipEventReleaseToSystem) ? " hipEventReleaseToSystem" : "")
<< " ptr=" << ptr
<< " eventFlags = " << std::hex << eventFlags
<< ((eventFlags & hipEventReleaseToDevice) ? " hipEventReleaseToDevice" : "")
<< ((eventFlags & hipEventReleaseToSystem) ? " hipEventReleaseToSystem" : "")
<< " ptr=" << ptr
<< " syncMethod=" << syncMsg[syncMethod] << "\n";
hipStream_t s;
@@ -93,7 +93,7 @@ void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMe
default:
assert(0);
};
for (int i=0; i<numElements; i++) {
if (ptr[i] != expected) {
printf ("mismatch at %d: %d != %d\n", i, ptr[i], expected);
@@ -153,7 +153,7 @@ int main(){
size_t sizeBytes = numElements * sizeof (int);
#ifdef __HIP_PLATFORM_HCC__
{
{
// Stimulate error condition:
int *A = &numElements;
HIPCHECK_API(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocCoherent|hipHostMallocNonCoherent), hipErrorInvalidValue);
@@ -174,7 +174,7 @@ int main(){
// agent-scope releases don't provide host visibility, don't use them here:
}
if (1) {
if (1) {
int *A = nullptr;
HIPCHECK(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocCoherent));
const char *ptrType = "coherent";
@@ -189,14 +189,14 @@ int main(){
// Check defaults:
if (1) {
if (1) {
int *A = nullptr;
HIPCHECK(hipHostMalloc((void**)&A, sizeBytes));
const char *ptrType = "default";
CheckHostPointer(numElements, A, 0, SYNC_DEVICE, ptrType);
CheckHostPointer(numElements, A, 0, SYNC_STREAM, ptrType);
CheckHostPointer(numElements, A, 0, SYNC_EVENT, ptrType);
CheckHostPointer(numElements, A, 0, SYNC_DEVICE, ptrType);
CheckHostPointer(numElements, A, 0, SYNC_STREAM, ptrType);
CheckHostPointer(numElements, A, 0, SYNC_EVENT, ptrType);
@@ -206,7 +206,7 @@ int main(){
}
passed();
}
+6 -6
Просмотреть файл
@@ -29,13 +29,13 @@ THE SOFTWARE.
#include<malloc.h>
__global__ void Inc(hipLaunchParm lp, float *Ad){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
Ad[tx] = Ad[tx] + float(1);
}
template<typename T>
void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internalRegister)
void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internalRegister)
{
A = A + offset;
numElements -= offset;
@@ -56,7 +56,7 @@ void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internal
HIPCHECK(hipMemset(Bd, 13.0f, sizeBytes));
//
//
HIPCHECK(hipMemcpy(Bd, A, sizeBytes, hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(Bh, Bd, sizeBytes, hipMemcpyDeviceToHost));
@@ -81,7 +81,7 @@ int main(int argc, char *argv[])
const size_t size = N * sizeof(float);
if (p_tests & 0x1) {
if (p_tests & 0x1) {
float *A, **Ad;
int num_devices;
HIPCHECK(hipGetDeviceCount(&num_devices));
@@ -115,7 +115,7 @@ int main(int argc, char *argv[])
}
if (p_tests & 0x6) {
if (p_tests & 0x6) {
// Sensitize HIP bug if device does not match where the memory was registered.
HIPCHECK(hipSetDevice(0));
@@ -129,7 +129,7 @@ int main(int argc, char *argv[])
HIPCHECK(hipMalloc(&Bd, size));
// TODO - set to 128
#define OFFSETS_TO_TRY 128
#define OFFSETS_TO_TRY 128
assert (N>OFFSETS_TO_TRY);
if (p_tests & 0x2) {
+12 -12
Просмотреть файл
@@ -59,7 +59,7 @@ struct HostTraits<Pinned>
static const char *Name() { return "Pinned"; } ;
static void *Alloc(size_t sizeBytes) {
void *p;
void *p;
HIPCHECK(hipHostMalloc((void**)&p, sizeBytes, hipHostMallocDefault));
return p;
};
@@ -67,11 +67,11 @@ struct HostTraits<Pinned>
template<typename T>
__global__ void
__global__ void
addK (hipLaunchParm lp, T *A, T K, size_t numElements)
{
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
for (size_t i=offset; i<numElements; i+=stride) {
A[i] = A[i] + K;
@@ -85,7 +85,7 @@ addK (hipLaunchParm lp, T *A, T K, size_t numElements)
//IN: numInflight : number of copies inflight at any time:
//IN: numPongs = number of iterations to run (iteration)
template<typename T, class AllocType>
void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide)
void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide)
{
HIPASSERT(numElements % numInflight == 0); // Must be evenly divisible.
size_t Nbytes = numElements*sizeof(T);
@@ -95,7 +95,7 @@ void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
printf ("-----------------------------------------------------------------------------------------------\n");
printf ("testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
printf ("testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
__func__, HostTraits<AllocType>::Name(), Nbytes, (double)(Nbytes)/1024.0/1024.0, numPongs, numInflight, eachCopyElements, eachCopyBytes);
T *A_h = NULL;
@@ -176,7 +176,7 @@ void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies,
size_t eachCopyBytes = eachCopyElements * sizeof(T);
printf ("-----------------------------------------------------------------------------------------------\n");
printf ("testing: %s Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
printf ("testing: %s Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
__func__, Nbytes, (double)(Nbytes)/1024.0/1024.0, numCopies, eachCopyElements, eachCopyBytes);
T *A_d;
@@ -194,7 +194,7 @@ void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies,
//stream=0; // fixme TODO
for (int i=0; i<numCopies; i++)
for (int i=0; i<numCopies; i++)
{
HIPASSERT(A_d + i*eachCopyElements < A_d + Nbytes);
HIPCHECK(hipMemcpyAsync(&A_d[i*eachCopyElements], &A_h1[i*eachCopyElements], eachCopyBytes, hipMemcpyHostToDevice, stream));
@@ -204,7 +204,7 @@ void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies,
HIPCHECK(hipDeviceSynchronize());
}
for (int i=0; i<numCopies; i++)
for (int i=0; i<numCopies; i++)
{
HIPASSERT(A_d + i*eachCopyElements < A_d + Nbytes);
HIPCHECK(hipMemcpyAsync(&A_h2[i*eachCopyElements], &A_d[i*eachCopyElements], eachCopyBytes, hipMemcpyDeviceToHost, stream));
@@ -252,7 +252,7 @@ void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemc
hipStream_t *stream = (hipStream_t*)malloc(sizeof(hipStream_t) * nStreams);
if (useNullStream) {
if (useNullStream) {
nStreams = 1;
stream[0] = NULL;
} else {
@@ -262,7 +262,7 @@ void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemc
}
size_t workLeft = N;
size_t workLeft = N;
size_t workPerStream = N / nStreams;
for (int i = 0; i < nStreams; ++i) {
size_t work = (workLeft < workPerStream) ? workLeft : workPerStream;
@@ -287,7 +287,7 @@ void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemc
} else {
HIPCHECK ( hipMemcpyAsync(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost, stream[i]));
}
}
}
HIPCHECK (hipDeviceSynchronize());
+1 -1
Просмотреть файл
@@ -31,7 +31,7 @@ THE SOFTWARE.
__global__ void Kernel(hipLaunchParm lp,volatile float* hostRes)
{
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
hostRes[tid] = tid + 1;
__threadfence_system();
// expecting that the data is getting flushed to host here!
+6 -6
Просмотреть файл
@@ -24,7 +24,7 @@ THE SOFTWARE.
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
* RUN: %t EXCLUDE_HIP_PLATFORM all
* RUN: %t EXCLUDE_HIP_PLATFORM all
* HIT_END
*/
@@ -63,8 +63,8 @@ int enablePeers(int dev0, int dev1)
__global__ void
memsetIntKernel(int * ptr, const int val, size_t numElements)
{
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x ;
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
int stride = blockDim.x * gridDim.x ;
for (size_t i= gid; i< numElements; i+=stride){
ptr[i] = val;
}
@@ -73,15 +73,15 @@ memsetIntKernel(int * ptr, const int val, size_t numElements)
__global__ void
memcpyIntKernel(const int * src, int* dst, size_t numElements)
{
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x ;
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
int stride = blockDim.x * gridDim.x ;
for (size_t i= gid; i< numElements; i+=stride){
dst[i] = src[i];
}
};
// CHeck arrays in reverse order, to more easily detect cases where
// CHeck arrays in reverse order, to more easily detect cases where
// the copy is "partially" done.
void checkReverse(const int *ptr, int numElements, int expected) {
for (int i=numElements-1; i>=0; i--) {
+1 -1
Просмотреть файл
@@ -34,7 +34,7 @@ THE SOFTWARE.
#define kernel_name "hello_world"
__global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){
int tx = hipThreadIdx_x;
int tx = threadIdx.x;
Bd[tx] = Ad[tx];
}
+1 -1
Просмотреть файл
@@ -24,7 +24,7 @@ THE SOFTWARE.
extern "C" __global__ void hello_world(hipLaunchParm lp, float *a, float *b)
{
int tx = hipThreadIdx_x;
int tx = threadIdx.x;
b[tx] = a[tx];
}
+3 -3
Просмотреть файл
@@ -35,7 +35,7 @@ THE SOFTWARE.
template<typename T>
__global__ void Inc(hipLaunchParm lp, T *Array){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
Array[tx] = Array[tx] + T(1);
}
@@ -116,7 +116,7 @@ int main(int argc, char **argv)
}
const size_t size = N * sizeof(float);
for (int i=0; i< iterations; i++) {
std::thread t1(run1, size, stream[0]);
@@ -126,7 +126,7 @@ int main(int argc, char **argv)
// std::cout<<"T1"<<std::endl;
t2.join();
// std::cout<<"T2"<<std::endl;
t3.join();
t3.join();
}
passed();
}
+2 -2
Просмотреть файл
@@ -29,7 +29,7 @@ THE SOFTWARE.
const int NN = 1 << 21;
__global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
if(tid < 1){
for(int i=0;i<n;i++){
x[i] = sqrt(powf(3.14159,i));
@@ -39,7 +39,7 @@ __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
}
__global__ void nKernel(hipLaunchParm lp, float *y){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
y[tid] = y[tid] + 1.0f;
}
+2 -2
Просмотреть файл
@@ -31,7 +31,7 @@ THE SOFTWARE.
const int NN = 1 << 21;
__global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
if(tid < 1){
for(int i=0;i<n;i++){
x[i] = sqrt(powf(3.14159,i));
@@ -41,7 +41,7 @@ __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
}
__global__ void nKernel(hipLaunchParm lp, float *y){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
y[tid] = y[tid] + 1.0f;
}
+2 -2
Просмотреть файл
@@ -41,8 +41,8 @@ vectorADDRepeat(hipLaunchParm lp,
size_t NELEM,
int repeat)
{
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
for (int j=1; j<=repeat;j++) {
for (size_t i=offset; i<NELEM; i+=stride) {
+4 -4
Просмотреть файл
@@ -73,7 +73,7 @@ void D2H(T *Dst, T *Src, size_t size){
template<typename T>
__global__ void Inc(hipLaunchParm lp, T *In){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
In[tx] = In[tx] + 1;
}
@@ -94,12 +94,12 @@ void initArrays(T **Ad, T **Ah,
}
template<typename T>
void initArrays(T **Ad, size_t N,
bool deviceMemory = false,
void initArrays(T **Ad, size_t N,
bool deviceMemory = false,
bool usePinnedHost = false){
size_t NBytes = N * sizeof(T);
if(deviceMemory){
HIPCHECK( hipMalloc(Ad, NBytes));
HIPCHECK( hipMalloc(Ad, NBytes));
}else{
if(usePinnedHost){
HIPCHECK(hipHostMalloc((void**)Ad, NBytes, hipHostMallocDefault));
+20 -20
Просмотреть файл
@@ -27,14 +27,14 @@ THE SOFTWARE.
// TODO - add code object support here.
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
* RUN: %t
* RUN: %t
* HIT_END
*/
// Test cache management (fences) and synchronization between kernel and copy commands.
// Exhaustively tests 3 command types (copy, kernel, module kernel),
// many sync types (see SyncType), followed by another command, across a sweep
// Exhaustively tests 3 command types (copy, kernel, module kernel),
// many sync types (see SyncType), followed by another command, across a sweep
// of data sizes designed to stress various levels of the memory hierarchy.
#include "hip/hip_runtime.h"
@@ -102,8 +102,8 @@ MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel");
__global__ void
memsetIntKernel(int * ptr, const int val, size_t numElements)
{
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x ;
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
int stride = blockDim.x * gridDim.x ;
for (size_t i= gid; i< numElements; i+=stride){
ptr[i] = val;
}
@@ -112,15 +112,15 @@ memsetIntKernel(int * ptr, const int val, size_t numElements)
__global__ void
memcpyIntKernel(int *dst, const int * src, size_t numElements)
{
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x ;
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
int stride = blockDim.x * gridDim.x ;
for (size_t i= gid; i< numElements; i+=stride){
dst[i] = src[i];
}
};
// CHeck arrays in reverse order, to more easily detect cases where
// CHeck arrays in reverse order, to more easily detect cases where
// the copy is "partially" done.
void checkReverse(const int *ptr, int numElements, int expected) {
int mismatchCnt = 0;
@@ -157,7 +157,7 @@ const char * CmdTypeStr(CmdType c)
ENUM_CASE_STR(MODULE_KERNEL);
default: return "UNKNOWN";
};
}
}
enum SyncType {
@@ -223,16 +223,16 @@ void resetInputs( int * Ad, int * Bd, int *Cd, int *Ch, size_t numElements, int
// Intended to test proper synchronization and cache flushing between CMDA and CMDB.
// CMD are of type CmdType. All command copy memory, using either hipMemcpyAsync or kernel implementations.
// CmdA copies from Ad to Bd,
// CmdA copies from Ad to Bd,
// Some form of synchronization is applied.
// Then cmdB copies from Bd to Cd.
//
// Cd is then copied to host Ch using a memory copy.
//
// Correct result at the end is that Ch contains the contents originally in Ad (integer 0x42)
void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType,
void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType,
hipStream_t stream1, hipStream_t stream2, int numElements,
int * Ad, int * Bd, int *Cd, int *Ch,
int * Ad, int * Bd, int *Cd, int *Ch,
int expected)
{
hipEvent_t e;
@@ -241,14 +241,14 @@ void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType,
resetInputs(Ad, Bd, Cd, Ch, numElements, expected);
const size_t sizeElements = numElements * sizeof(int);
fprintf (stderr, "test: runTest with %zu bytes (%6.2f MB) cmdA=%s; sync=%s; cmdB=%s\n",
fprintf (stderr, "test: runTest with %zu bytes (%6.2f MB) cmdA=%s; sync=%s; cmdB=%s\n",
sizeElements, (double) (sizeElements/1024.0), CmdTypeStr(cmdAType), SyncTypeStr(syncType), CmdTypeStr(cmdBType));
if (SKIP_MODULE_KERNEL && ((cmdAType == MODULE_KERNEL) || (cmdBType == MODULE_KERNEL))) {
fprintf (stderr, "warn: skipping since test infra does not yet support modules\n");
return;
}
// Step A:
runCmd(cmdAType, Bd, Ad, stream1, numElements);
@@ -334,7 +334,7 @@ void testWrapper(size_t numElements)
fprintf (stderr, "test: init complete, start running tests\n");
runTestImpl(COPY, EVENT_SYNC, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
runTestImpl(COPY, EVENT_SYNC, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
for (int cmdA=0; cmdA<MAX_CmdType; cmdA++) {
for (int cmdB=0; cmdB<MAX_CmdType; cmdB++) {
@@ -347,7 +347,7 @@ void testWrapper(size_t numElements)
//case STREAM_QUERY:
case STREAM_SYNC:
case DEVICE_SYNC:
runTestImpl(CmdType(cmdA), SyncType(syncMode), CmdType(cmdB), stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
runTestImpl(CmdType(cmdA), SyncType(syncMode), CmdType(cmdB), stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
break;
default:
break;
@@ -358,11 +358,11 @@ void testWrapper(size_t numElements)
}
#if 0
runTestImpl(COPY, STREAM_SYNC, MODULE_KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
runTestImpl(COPY, STREAM_SYNC, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
runTestImpl(COPY, STREAM_WAIT_EVENT, MODULE_KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
runTestImpl(COPY, STREAM_SYNC, MODULE_KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
runTestImpl(COPY, STREAM_SYNC, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
runTestImpl(COPY, STREAM_WAIT_EVENT, MODULE_KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
runTestImpl(COPY, STREAM_WAIT_EVENT, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
runTestImpl(COPY, STREAM_WAIT_EVENT, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
#endif
HIPCHECK(hipFree(Ad));
+3 -3
Просмотреть файл
@@ -1,12 +1,12 @@
#include <hip/hip_runtime.h>
extern "C" __global__ void
memcpyIntKernel(hipLaunchParm lp, int *dst, const int * src, size_t numElements)
{
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x ;
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
int stride = blockDim.x * gridDim.x ;
for (size_t i= gid; i< numElements; i+=stride){
dst[i] = src[i];
}
+2 -2
Просмотреть файл
@@ -30,7 +30,7 @@ THE SOFTWARE.
#define ITER 1<<10
__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
if(tx == 0){
for(int i = 0; i<num;i++){
Ad[tx] += 1;
@@ -58,7 +58,7 @@ int main(){
}
std::cout<<"Waitin..."<<std::endl;
hipDeviceSynchronize();
}
+1 -1
Просмотреть файл
@@ -26,7 +26,7 @@ THE SOFTWARE.
static size_t size[NUM_SIZE];
__global__ void Add(hipLaunchParm lp, int *Ad){
int tx = hipThreadIdx_x;
int tx = threadIdx.x;
Ad[tx] = Ad[tx] + tx;
}
+1 -1
Просмотреть файл
@@ -26,7 +26,7 @@ THE SOFTWARE.
static size_t size[NUM_SIZE];
__global__ void Add(hipLaunchParm lp, int *Ad){
int tx = hipThreadIdx_x;
int tx = threadIdx.x;
Ad[tx] = Ad[tx] + tx;
}
+2 -2
Просмотреть файл
@@ -30,7 +30,7 @@ THE SOFTWARE.
#define ITER 1<<10
__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
if(tx == 0){
for(int i = 0; i<num;i++){
Ad[tx] += 1;
@@ -57,7 +57,7 @@ int main(){
}
std::cout<<"Waitin..."<<std::endl;
hipDeviceSynchronize();
}
+23 -23
Просмотреть файл
@@ -136,8 +136,8 @@ vectorADD(hipLaunchParm lp,
T *C_d,
size_t NELEM)
{
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
for (size_t i=offset; i<NELEM; i+=stride) {
C_d[i] = A_d[i] + B_d[i];
@@ -153,8 +153,8 @@ vectorADDReverse(hipLaunchParm lp,
T *C_d,
size_t NELEM)
{
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
C_d[i] = A_d[i] + B_d[i];
@@ -169,8 +169,8 @@ addCount( const T *A_d,
size_t NELEM,
int count)
{
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
// Deliberately do this in an inefficient way to increase kernel runtime
for (int i=0; i<count; i++) {
@@ -188,8 +188,8 @@ addCountReverse( const T *A_d,
int64_t NELEM,
int count)
{
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
// Deliberately do this in an inefficient way to increase kernel runtime
for (int i=0; i<count; i++) {
@@ -205,8 +205,8 @@ __global__ void
memsetReverse( T *C_d, T val,
int64_t NELEM)
{
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
C_d[i] = val;
@@ -219,12 +219,12 @@ void setDefaultData(size_t numElements, T *A_h, T* B_h, T *C_h)
{
// Initialize the host data:
for (size_t i=0; i<numElements; i++) {
if (A_h)
if (A_h)
(A_h)[i] = 3.146f + i; // Pi
if (B_h)
if (B_h)
(B_h)[i] = 1.618f + i; // Phi
if (C_h)
(C_h)[i] = 0.0f + i;
if (C_h)
(C_h)[i] = 0.0f + i;
}
}
@@ -268,8 +268,8 @@ void initArraysForHost(T **A_h, T **B_h, T **C_h,
template <typename T>
void initArrays(T **A_d, T **B_d, T **C_d,
T **A_h, T **B_h, T **C_h,
size_t N, bool usePinnedHost=false)
T **A_h, T **B_h, T **C_h,
size_t N, bool usePinnedHost=false)
{
size_t Nbytes = N*sizeof(T);
@@ -317,7 +317,7 @@ void freeArraysForHost(T *A_h, T *B_h, T *C_h, bool usePinnedHost)
template <typename T>
void freeArrays(T *A_d, T *B_d, T *C_d,
T *A_h, T *B_h, T *C_h, bool usePinnedHost)
T *A_h, T *B_h, T *C_h, bool usePinnedHost)
{
if (A_d) {
HIPCHECK ( hipFree(A_d) );
@@ -453,9 +453,9 @@ struct Pinned {
static const bool isPinned = true;
static const char *str() { return "Pinned"; };
static void *Alloc(size_t sizeBytes)
static void *Alloc(size_t sizeBytes)
{
void *p;
void *p;
HIPCHECK(hipHostMalloc((void**)&p, sizeBytes));
return p;
};
@@ -463,12 +463,12 @@ struct Pinned {
//---
struct Unpinned
struct Unpinned
{
static const bool isPinned = false;
static const char *str() { return "Unpinned"; };
static void *Alloc(size_t sizeBytes)
static void *Alloc(size_t sizeBytes)
{
void *p = malloc (sizeBytes);
HIPASSERT(p);
@@ -496,7 +496,7 @@ template<>
struct MemTraits<Memcpy>
{
static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
{
HIPCHECK(hipMemcpy(dest, src, sizeBytes, kind));
}
@@ -507,7 +507,7 @@ template<>
struct MemTraits<MemcpyAsync>
{
static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
{
HIPCHECK(hipMemcpyAsync(dest, src, sizeBytes, kind, stream));
}
+2 -2
Просмотреть файл
@@ -17,8 +17,8 @@ __global__ void tex2DKernel(float* outputData,
int width,
int height)
{
int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y;
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
outputData[y*width + x] = tex2D<float>(textureObject, x, y);
}
+2 -2
Просмотреть файл
@@ -18,8 +18,8 @@ __global__ void tex2DKernel(float* outputData,
int width,
int height)
{
int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y;
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
#ifdef __HIP_PLATFORM_HCC__
outputData[y*width + x] = tex2D(tex, textureObject, x, y);
#else