This implements the trivial change needed to move back from the hip{Something}_{x, y, z} macros to the natural CUDA syntax of Something.{x, y, z}. This is contained in lines 384-404 in hip_runtime.h. All of the other changes have to do with changing unit tests to use this syntax. The macros are retained for backwards compatibility.
Этот коммит содержится в:
@@ -381,6 +381,27 @@ __device__ void __threadfence_system(void) ;
|
||||
* @}
|
||||
*/
|
||||
|
||||
template<typename std::common_type<
|
||||
decltype(hc_get_group_id),
|
||||
decltype(hc_get_group_size),
|
||||
decltype(hc_get_num_groups),
|
||||
decltype(hc_get_workitem_id)>::type f>
|
||||
class Coordinates {
|
||||
using R = decltype(f(0));
|
||||
|
||||
struct X { __device__ operator R() const { return f(0); } };
|
||||
struct Y { __device__ operator R() const { return f(1); } };
|
||||
struct Z { __device__ operator R() const { return f(2); } };
|
||||
public:
|
||||
static constexpr X x{};
|
||||
static constexpr Y y{};
|
||||
static constexpr Z z{};
|
||||
};
|
||||
|
||||
static constexpr Coordinates<hc_get_group_size> blockDim;
|
||||
static constexpr Coordinates<hc_get_group_id> blockIdx;
|
||||
static constexpr Coordinates<hc_get_num_groups> gridDim;
|
||||
static constexpr Coordinates<hc_get_workitem_id> threadIdx;
|
||||
|
||||
#define hipThreadIdx_x (hc_get_workitem_id(0))
|
||||
#define hipThreadIdx_y (hc_get_workitem_id(1))
|
||||
|
||||
@@ -45,8 +45,8 @@ __device__ void *__hip_hc_malloc(size_t size)
|
||||
{
|
||||
return (void*)nullptr;
|
||||
}
|
||||
uint32_t totalThreads = hipBlockDim_x * hipGridDim_x * hipBlockDim_y * hipGridDim_y * hipBlockDim_z * hipGridDim_z;
|
||||
uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x;
|
||||
uint32_t totalThreads = blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z;
|
||||
uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
|
||||
uint32_t numHeapsPerWorkItem = NUM_PAGES / totalThreads;
|
||||
uint32_t heapSizePerWorkItem = SIZE_OF_HEAP / totalThreads;
|
||||
@@ -932,7 +932,7 @@ __device__ unsigned long long int atomicMax(unsigned long long int* address,
|
||||
template<typename T>
|
||||
__device__ T atomicCAS_impl(T* address, T compare, T val)
|
||||
{
|
||||
// the implementation assumes the atomic is lock-free and
|
||||
// the implementation assumes the atomic is lock-free and
|
||||
// has the same size as the non-atmoic equivalent type
|
||||
static_assert(sizeof(T) == sizeof(std::atomic<T>)
|
||||
, "size mismatch between atomic and non-atomic types");
|
||||
@@ -945,7 +945,7 @@ __device__ T atomicCAS_impl(T* address, T compare, T val)
|
||||
|
||||
T expected = compare;
|
||||
|
||||
// hcc should generate a system scope atomic CAS
|
||||
// hcc should generate a system scope atomic CAS
|
||||
std::atomic_compare_exchange_weak_explicit(u.atomic_address
|
||||
, &expected, val
|
||||
, std::memory_order_acq_rel
|
||||
@@ -1110,8 +1110,8 @@ __device__ void* __get_dynamicgroupbaseptr() {
|
||||
return hc::get_dynamic_group_segment_base_pointer();
|
||||
}
|
||||
|
||||
__host__ void* __get_dynamicgroupbaseptr() {
|
||||
return nullptr;
|
||||
__host__ void* __get_dynamicgroupbaseptr() {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Precise Math Functions
|
||||
|
||||
@@ -34,7 +34,7 @@ THE SOFTWARE.
|
||||
#define NUM_STREAMS 2
|
||||
|
||||
__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
|
||||
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
// Kernel loop designed to execute very slowly... ... ... so we can test timing-related behavior below
|
||||
if(tx == 0){
|
||||
for(int i = 0; i<num;i++){
|
||||
|
||||
@@ -27,7 +27,7 @@ THE SOFTWARE.
|
||||
#define SIZE 64<<2
|
||||
|
||||
__global__ void getSqAbs(hipLaunchParm lp, float *A, float *B, float *C){
|
||||
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
C[tx] = hipCsqabsf(make_hipFloatComplex(A[tx], B[tx]));
|
||||
}
|
||||
|
||||
|
||||
@@ -16,13 +16,13 @@
|
||||
|
||||
__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In)
|
||||
{
|
||||
int tx = hipThreadIdx_x;
|
||||
int tx = threadIdx.x;
|
||||
memcpy(Out + tx, In + tx, sizeof(uint32_t));
|
||||
}
|
||||
|
||||
__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
|
||||
{
|
||||
int tx = hipThreadIdx_x;
|
||||
int tx = threadIdx.x;
|
||||
memset(ptr + tx, val, (sizeof(uint32_t)*(size/LEN)));
|
||||
}
|
||||
|
||||
@@ -58,6 +58,6 @@ int main()
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
passed();
|
||||
}
|
||||
|
||||
@@ -35,7 +35,7 @@ THE SOFTWARE.
|
||||
|
||||
|
||||
__global__ void floatMath(hipLaunchParm lp, float *In, float *Out) {
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
Out[tid] = __cosf(In[tid]);
|
||||
Out[tid] = __exp10f(Out[tid]);
|
||||
Out[tid] = __expf(Out[tid]);
|
||||
|
||||
@@ -217,7 +217,7 @@ int computeGold(int *gpuData, const int len)
|
||||
__global__ void testKernel(hipLaunchParm lp,int *g_odata)
|
||||
{
|
||||
// access thread id
|
||||
const unsigned int tid = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
// Test various atomic instructions
|
||||
|
||||
@@ -240,7 +240,7 @@ __global__ void testKernel(hipLaunchParm lp,int *g_odata)
|
||||
|
||||
// Atomic increment (modulo 17+1)
|
||||
atomicInc((unsigned int *)&g_odata[5], 17);
|
||||
|
||||
|
||||
// Atomic decrement
|
||||
atomicDec((unsigned int *)&g_odata[6], 137);
|
||||
|
||||
|
||||
@@ -32,82 +32,82 @@ THE SOFTWARE.
|
||||
#define SIZE N*sizeof(float)
|
||||
|
||||
__global__ void test_sincosf(hipLaunchParm lp, float* a, float* b, float *c){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
sincosf(a[tid], b+tid, c+tid);
|
||||
}
|
||||
|
||||
__global__ void test_sincospif(hipLaunchParm lp, float* a, float* b, float *c){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
sincospif(a[tid], b+tid, c+tid);
|
||||
}
|
||||
|
||||
__global__ void test_fdividef(hipLaunchParm lp, float *a, float* b, float *c){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
c[tid] = fdividef(a[tid], b[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_llrintf(hipLaunchParm lp, float *a, long long int *b){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
b[tid] = llrintf(a[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_lrintf(hipLaunchParm lp, float *a, long int *b){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
b[tid] = lrintf(a[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_rintf(hipLaunchParm lp, float *a, float *b){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
b[tid] = rintf(a[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_llroundf(hipLaunchParm lp, float *a, long long int *b){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
b[tid] = llroundf(a[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_lroundf(hipLaunchParm lp, float *a, long int *b){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
b[tid] = lroundf(a[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_rhypotf(hipLaunchParm lp, float *a, float* b, float *c){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
c[tid] = rhypotf(a[tid], b[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_norm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
d[tid] = norm3df(a[tid], b[tid], c[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_norm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
e[tid] = norm4df(a[tid], b[tid], c[tid], d[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_normf(hipLaunchParm lp, float *a, float *b){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
b[tid] = normf(N, a);
|
||||
}
|
||||
|
||||
__global__ void test_rnorm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
d[tid] = rnorm3df(a[tid], b[tid], c[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_rnorm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
e[tid] = rnorm4df(a[tid], b[tid], c[tid], d[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_rnormf(hipLaunchParm lp, float *a, float *b){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
b[tid] = rnormf(N, a);
|
||||
}
|
||||
|
||||
__global__ void test_erfinvf(hipLaunchParm lp, float *a, float *b){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
b[tid] = erff(erfinvf(a[tid]));
|
||||
}
|
||||
|
||||
|
||||
@@ -32,72 +32,72 @@ THE SOFTWARE.
|
||||
#define SIZE N*sizeof(double)
|
||||
|
||||
__global__ void test_sincos(hipLaunchParm lp, double* a, double* b, double *c){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
sincos(a[tid], b+tid, c+tid);
|
||||
}
|
||||
|
||||
__global__ void test_sincospi(hipLaunchParm lp, double* a, double* b, double *c){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
sincospi(a[tid], b+tid, c+tid);
|
||||
}
|
||||
|
||||
__global__ void test_llrint(hipLaunchParm lp, double *a, long long int *b){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
b[tid] = llrint(a[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_lrint(hipLaunchParm lp, double *a, long int *b){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
b[tid] = lrint(a[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_rint(hipLaunchParm lp, double *a, double *b){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
b[tid] = rint(a[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_llround(hipLaunchParm lp, double *a, long long int *b){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
b[tid] = llround(a[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_lround(hipLaunchParm lp, double *a, long int *b){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
b[tid] = lround(a[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_rhypot(hipLaunchParm lp, double *a, double* b, double *c){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
c[tid] = rhypot(a[tid], b[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_norm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
d[tid] = norm3d(a[tid], b[tid], c[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_norm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
e[tid] = norm4d(a[tid], b[tid], c[tid], d[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_rnorm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
d[tid] = rnorm3d(a[tid], b[tid], c[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_rnorm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
e[tid] = rnorm4d(a[tid], b[tid], c[tid], d[tid]);
|
||||
}
|
||||
|
||||
__global__ void test_rnorm(hipLaunchParm lp, double *a, double *b){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
b[tid] = rnorm(N, a);
|
||||
}
|
||||
|
||||
__global__ void test_erfinv(hipLaunchParm lp, double *a, double *b){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
b[tid] = erf(erfinv(a[tid]));
|
||||
}
|
||||
|
||||
|
||||
@@ -36,7 +36,7 @@ __device__ int globalOut[NUM];
|
||||
|
||||
__global__ void Assign(hipLaunchParm lp, int* Out)
|
||||
{
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
Out[tid] = globalIn[tid];
|
||||
globalOut[tid] = globalIn[tid];
|
||||
}
|
||||
|
||||
@@ -28,7 +28,7 @@ THE SOFTWARE.
|
||||
#if __HIP_ARCH_GFX803__ > 0
|
||||
|
||||
__global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
|
||||
int tx = hipThreadIdx_x;
|
||||
int tx = threadIdx.x;
|
||||
__half a = A[tx];
|
||||
__half b = B[tx];
|
||||
__half c = C[tx];
|
||||
@@ -44,7 +44,7 @@ __global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
|
||||
}
|
||||
|
||||
__global__ void __half2Math(hipLaunchParm lp, __half2 *A, __half2 *B, __half2 *C) {
|
||||
int tx = hipThreadIdx_x;
|
||||
int tx = threadIdx.x;
|
||||
__half2 a = A[tx];
|
||||
__half2 b = B[tx];
|
||||
__half2 c = C[tx];
|
||||
|
||||
@@ -33,7 +33,7 @@ THE SOFTWARE.
|
||||
|
||||
__global__ void vAdd(hipLaunchParm lp, float *In1, float *In2, float *In3, float *In4, float *Out)
|
||||
{
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
In4[tid] = In1[tid] + In2[tid];
|
||||
__threadfence();
|
||||
In3[tid] = In3[tid] + In4[tid];
|
||||
|
||||
@@ -37,9 +37,9 @@ __global__ void
|
||||
warpvote(hipLaunchParm lp, int* device_any, int* device_all , int Num_Warps_per_Block, int pshift)
|
||||
{
|
||||
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
device_any[hipThreadIdx_x>>pshift] = __any(tid -77);
|
||||
device_all[hipThreadIdx_x>>pshift] = __all(tid -77);
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
device_any[threadIdx.x>>pshift] = __any(tid -77);
|
||||
device_all[threadIdx.x>>pshift] = __all(tid -77);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
@@ -49,7 +49,7 @@ int main(int argc, char *argv[])
|
||||
warpSize = devProp.warpSize;
|
||||
|
||||
int w = warpSize;
|
||||
pshift = 0;
|
||||
pshift = 0;
|
||||
while (w >>= 1) ++pshift;
|
||||
|
||||
printf ("warpSize=%d pshift=%d\n", warpSize, pshift);
|
||||
|
||||
@@ -34,12 +34,12 @@ __global__ void
|
||||
gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift)
|
||||
{
|
||||
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
const unsigned int warp_num = hipThreadIdx_x >> pshift;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const unsigned int warp_num = threadIdx.x >> pshift;
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
|
||||
atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
|
||||
#else
|
||||
atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
|
||||
atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
@@ -72,8 +72,8 @@ HIP_kernel(hipLaunchParm lp,
|
||||
unsigned int* a, unsigned int* b,unsigned long long int* c, unsigned long long int* d, int width, int height)
|
||||
{
|
||||
|
||||
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
int i = y * width + x;
|
||||
if ( i < (width * height)) {
|
||||
|
||||
@@ -83,8 +83,8 @@ HIP_kernel(hipLaunchParm lp,
|
||||
unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height)
|
||||
{
|
||||
|
||||
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
int i = y * width + x;
|
||||
if ( i < (width * height)) {
|
||||
|
||||
@@ -73,8 +73,8 @@ HIP_kernel(hipLaunchParm lp,
|
||||
int width, int height)
|
||||
{
|
||||
|
||||
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
int i = y * width + x;
|
||||
if ( i < (width * height)) {
|
||||
|
||||
@@ -64,8 +64,8 @@ HIP_kernel(hipLaunchParm lp,
|
||||
unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height)
|
||||
{
|
||||
|
||||
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
int i = y * width + x;
|
||||
if ( i < (width * height)) {
|
||||
|
||||
@@ -57,8 +57,8 @@ vectoradd_float(hipLaunchParm lp,
|
||||
T* a, const T* bm, int width, int height)
|
||||
|
||||
{
|
||||
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
int i = y * width + x;
|
||||
if ( i < (width * height)) {
|
||||
|
||||
@@ -40,13 +40,13 @@ THE SOFTWARE.
|
||||
#define THREADS_PER_BLOCK_Z 1
|
||||
|
||||
|
||||
__global__ void
|
||||
__global__ void
|
||||
vectoradd_char1(hipLaunchParm lp,
|
||||
char1* a, const char1* bm, const char1* cm, int width, int height)
|
||||
char1* a, const char1* bm, const char1* cm, int width, int height)
|
||||
|
||||
{
|
||||
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
int i = y * width + x;
|
||||
if ( i < (width * height)) {
|
||||
@@ -54,40 +54,40 @@ vectoradd_char1(hipLaunchParm lp,
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void
|
||||
__global__ void
|
||||
vectoradd_char2(hipLaunchParm lp,
|
||||
char2* a, const char2* bm, const char2* cm, int width, int height)
|
||||
char2* a, const char2* bm, const char2* cm, int width, int height)
|
||||
|
||||
{
|
||||
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
int i = y * width + x;
|
||||
if ( i < (width * height)) {
|
||||
a[i] = make_char2(bm[i].x, bm[i].y) + make_char2(cm[i].x, cm[i].y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void
|
||||
__global__ void
|
||||
vectoradd_char3(hipLaunchParm lp,
|
||||
char3* a, const char3* bm, const char3* cm, int width, int height)
|
||||
char3* a, const char3* bm, const char3* cm, int width, int height)
|
||||
|
||||
{
|
||||
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
int i = y * width + x;
|
||||
if ( i < (width * height)) {
|
||||
a[i] = make_char3(bm[i].x, bm[i].y, bm[i].z) + make_char3(cm[i].x, cm[i].y, cm[i].z);
|
||||
}
|
||||
}
|
||||
__global__ void
|
||||
__global__ void
|
||||
vectoradd_char4(hipLaunchParm lp,
|
||||
char4* a, const char4* bm, const char4* cm, int width, int height)
|
||||
char4* a, const char4* bm, const char4* cm, int width, int height)
|
||||
|
||||
{
|
||||
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
int i = y * width + x;
|
||||
if ( i < (width * height)) {
|
||||
@@ -99,7 +99,7 @@ vectoradd_char4(hipLaunchParm lp,
|
||||
#if 0
|
||||
__kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) {
|
||||
|
||||
|
||||
|
||||
int x = blockDimX * blockIdx.x + threadIdx.x;
|
||||
int y = blockDimY * blockIdy.y + threadIdx.y;
|
||||
|
||||
@@ -128,21 +128,21 @@ bool dataTypesRun(){
|
||||
hostA = (T*)malloc(NUM * sizeof(T));
|
||||
hostB = (T*)malloc(NUM * sizeof(T));
|
||||
hostC = (T*)malloc(NUM * sizeof(T));
|
||||
|
||||
|
||||
// initialize the input data
|
||||
for (i = 0; i < NUM; i++) {
|
||||
hostB[i] = (T)i;
|
||||
hostC[i] = (T)i;
|
||||
}
|
||||
|
||||
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T)));
|
||||
|
||||
|
||||
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
|
||||
HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice));
|
||||
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0, 0,
|
||||
@@ -192,21 +192,21 @@ bool dataTypesRun(){
|
||||
hostA = (T*)malloc(NUM * sizeof(T));
|
||||
hostB = (T*)malloc(NUM * sizeof(T));
|
||||
hostC = (T*)malloc(NUM * sizeof(T));
|
||||
|
||||
|
||||
// initialize the input data
|
||||
for (i = 0; i < NUM; i++) {
|
||||
hostB[i] = (T)i;
|
||||
hostC[i] = (T)i;
|
||||
}
|
||||
|
||||
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T)));
|
||||
|
||||
|
||||
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
|
||||
HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice));
|
||||
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0, 0,
|
||||
@@ -256,21 +256,21 @@ bool dataTypesRun(){
|
||||
hostA = (T*)malloc(NUM * sizeof(T));
|
||||
hostB = (T*)malloc(NUM * sizeof(T));
|
||||
hostC = (T*)malloc(NUM * sizeof(T));
|
||||
|
||||
|
||||
// initialize the input data
|
||||
for (i = 0; i < NUM; i++) {
|
||||
hostB[i] = (T)i;
|
||||
hostC[i] = (T)i;
|
||||
}
|
||||
|
||||
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T)));
|
||||
|
||||
|
||||
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
|
||||
HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice));
|
||||
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0, 0,
|
||||
@@ -319,21 +319,21 @@ bool dataTypesRunChar4(){
|
||||
hostA = (T*)malloc(NUM * sizeof(T));
|
||||
hostB = (T*)malloc(NUM * sizeof(T));
|
||||
hostC = (T*)malloc(NUM * sizeof(T));
|
||||
|
||||
|
||||
// initialize the input data
|
||||
for (i = 0; i < NUM; i++) {
|
||||
hostB[i] = (T)i;
|
||||
hostC[i] = (T)i;
|
||||
}
|
||||
|
||||
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
|
||||
HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T)));
|
||||
|
||||
|
||||
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
|
||||
HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice));
|
||||
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
|
||||
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
|
||||
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
|
||||
0, 0,
|
||||
@@ -368,7 +368,7 @@ bool dataTypesRunChar4(){
|
||||
}
|
||||
|
||||
int main() {
|
||||
|
||||
|
||||
hipDeviceProp_t devProp;
|
||||
hipGetDeviceProperties(&devProp, 0);
|
||||
cout << " System minor " << devProp.minor << endl;
|
||||
|
||||
@@ -36,7 +36,7 @@ THE SOFTWARE.
|
||||
#define SIZE LEN<<2
|
||||
|
||||
__global__ void kernel_trig(hipLaunchParm lp, float *In, float *sin_d, float *cos_d, float *tan_d, float *sin_pd, float *cos_pd){
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
sin_d[tid] = __sinf(In[tid]);
|
||||
cos_d[tid] = __cosf(In[tid]);
|
||||
tan_d[tid] = __tanf(In[tid]);
|
||||
|
||||
@@ -29,7 +29,7 @@ THE SOFTWARE.
|
||||
|
||||
__global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd, size_t len)
|
||||
{
|
||||
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if(tx < len)
|
||||
{
|
||||
Cd[tx] = Ad[tx] + Bd[tx];
|
||||
|
||||
@@ -30,7 +30,7 @@ THE SOFTWARE.
|
||||
|
||||
__global__ void Kern(hipLaunchParm lp, float *A)
|
||||
{
|
||||
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
A[tx] += 1.0f;
|
||||
}
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ class memManager;
|
||||
template<typename T>
|
||||
__global__ void Add(hipLaunchParm lp, T* Ad, T* Bd, T* Cd, size_t Len)
|
||||
{
|
||||
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if(tx < Len)
|
||||
{
|
||||
Cd[tx] = Ad[tx] + Bd[tx];
|
||||
|
||||
@@ -34,7 +34,7 @@ THE SOFTWARE.
|
||||
#define SIZE 1024*1024*sizeof(int)
|
||||
|
||||
__global__ void Iter(hipLaunchParm lp, int *Ad){
|
||||
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if(tx == 0){
|
||||
for(int i=0;i<ITER;i++){
|
||||
Ad[tx] += 1;
|
||||
|
||||
@@ -29,7 +29,7 @@ THE SOFTWARE.
|
||||
#define SIZE 1024*1024*sizeof(int)
|
||||
|
||||
__global__ void Iter(hipLaunchParm lp, int *Ad){
|
||||
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if(tx == 0){
|
||||
for(int i=0;i<ITER;i++){
|
||||
Ad[tx] += 1;
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
#include "hip/hip_runtime_api.h"
|
||||
|
||||
__global__ void Kernel(hipLaunchParm lp, float *Ad){
|
||||
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
Ad[tx] += Ad[tx-1];
|
||||
}
|
||||
|
||||
|
||||
@@ -40,8 +40,8 @@ __global__ void testExternSharedKernel(hipLaunchParm lp, const T* A_d, const T*
|
||||
T *sdata = reinterpret_cast<T *>(my_sdata);
|
||||
#endif
|
||||
|
||||
size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
size_t tid = hipThreadIdx_x;
|
||||
size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t tid = threadIdx.x;
|
||||
|
||||
// initialize dynamic shared memory
|
||||
if (tid < groupElements) {
|
||||
|
||||
@@ -34,7 +34,7 @@ THE SOFTWARE.
|
||||
|
||||
__global__ void vectorAdd(hipLaunchParm lp, float *Ad, float *Bd) {
|
||||
HIP_DYNAMIC_SHARED(float, sBd);
|
||||
int tx = hipThreadIdx_x;
|
||||
int tx = threadIdx.x;
|
||||
for(int i=0;i<LEN/64;i++) {
|
||||
sBd[tx + i * 64] = Ad[tx + i * 64] + 1.0f;
|
||||
Bd[tx + i * 64] = sBd[tx + i * 64];
|
||||
|
||||
@@ -48,11 +48,12 @@ vectorADD2( hipLaunchParm lp,
|
||||
T *C_d,
|
||||
size_t N)
|
||||
{
|
||||
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
size_t stride = hipBlockDim_x * hipGridDim_x ;
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x ;
|
||||
|
||||
for (size_t i=offset; i<N; i+=stride) {
|
||||
C_d[i] = A_d[i] + B_d[i] ;
|
||||
auto foo = __hiloint2double(A_d[i], B_d[i]);
|
||||
C_d[i] = __double2loint(foo) + __double2hiint(foo);//A_d[i] + B_d[i] ;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -62,7 +62,7 @@ __global__ void MyKernel (const hipLaunchParm lp, const float *a, const float *b
|
||||
{
|
||||
//KERNELBEGIN;
|
||||
|
||||
unsigned gid = hipThreadIdx_x;
|
||||
unsigned gid = threadIdx.x;
|
||||
if (gid < N) {
|
||||
c[gid] = a[gid] + PlusOne(b[gid]);
|
||||
}
|
||||
@@ -96,7 +96,7 @@ vectorADD(const hipLaunchParm lp,
|
||||
int zuzu = deviceVar + 1;
|
||||
|
||||
|
||||
int b = hipThreadIdx_x;
|
||||
int b = threadIdx.x;
|
||||
int c;
|
||||
#ifdef NOT_YET
|
||||
int a = __shfl_up(x, 1);
|
||||
@@ -119,8 +119,8 @@ vectorADD(const hipLaunchParm lp,
|
||||
__syncthreads();
|
||||
|
||||
|
||||
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
size_t stride = hipBlockDim_x * hipGridDim_x ;
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x ;
|
||||
|
||||
for (size_t i=offset; i<N; i+=stride) {
|
||||
C_d[i] = A_d[i] + B_d[i];
|
||||
|
||||
@@ -18,7 +18,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../test_common.cpp
|
||||
* BUILD: %t %s ../test_common.cpp
|
||||
* RUN: %t
|
||||
* HIT_END
|
||||
*/
|
||||
@@ -38,7 +38,7 @@ __constant__ int Value[LEN];
|
||||
|
||||
__global__ void Get(hipLaunchParm lp, int *Ad)
|
||||
{
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
Ad[tid] = Value[tid];
|
||||
}
|
||||
|
||||
|
||||
@@ -33,12 +33,12 @@ THE SOFTWARE.
|
||||
#define SIZE NUM * 8
|
||||
|
||||
__global__ void Alloc(hipLaunchParm lp, uint64_t *Ptr) {
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
Ptr[tid] = (uint64_t)malloc(128);
|
||||
}
|
||||
|
||||
__global__ void Free(hipLaunchParm lp, uint64_t *Ptr) {
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
free((void*)Ptr[tid]);
|
||||
}
|
||||
|
||||
|
||||
@@ -35,52 +35,52 @@ THE SOFTWARE.
|
||||
#define LEN12 12 * 4
|
||||
|
||||
__global__ void MemCpy8(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
memcpy(Out + tid*8, In + tid*8, 8);
|
||||
}
|
||||
|
||||
__global__ void MemCpy9(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
memcpy(Out + tid*9, In + tid*9, 9);
|
||||
}
|
||||
|
||||
__global__ void MemCpy10(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
memcpy(Out + tid*10, In + tid*10, 10);
|
||||
}
|
||||
|
||||
__global__ void MemCpy11(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
memcpy(Out + tid*11, In + tid*11, 11);
|
||||
}
|
||||
|
||||
__global__ void MemCpy12(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
memcpy(Out + tid*12, In + tid*12, 12);
|
||||
}
|
||||
|
||||
__global__ void MemSet8(hipLaunchParm lp, uint8_t *In) {
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
memset(In + tid*8, 1, 8);
|
||||
}
|
||||
|
||||
__global__ void MemSet9(hipLaunchParm lp, uint8_t *In) {
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
memset(In + tid*9, 1, 9);
|
||||
}
|
||||
|
||||
__global__ void MemSet10(hipLaunchParm lp, uint8_t *In) {
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
memset(In + tid*10, 1, 10);
|
||||
}
|
||||
|
||||
__global__ void MemSet11(hipLaunchParm lp, uint8_t *In) {
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
memset(In + tid*11, 1, 11);
|
||||
}
|
||||
|
||||
__global__ void MemSet12(hipLaunchParm lp, uint8_t *In) {
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
memset(In + tid*12, 1, 12);
|
||||
}
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ __global__ void vadd_asm(hipLaunchParm lp,
|
||||
float *out,
|
||||
float *in)
|
||||
{
|
||||
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
asm volatile ("v_add_f32_e32 %0, %1, %2" : "=v" (out[i]) : "v"(in[i]),"v" (out[i]));
|
||||
}
|
||||
@@ -49,7 +49,7 @@ void addCPUReference(
|
||||
{
|
||||
for(unsigned int j=0; j < NUM; j++)
|
||||
{
|
||||
|
||||
|
||||
output[j]= input[j] + output[j];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,7 +35,7 @@ __global__ void vmac_asm(hipLaunchParm lp,
|
||||
float *out,
|
||||
float *in)
|
||||
{
|
||||
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
|
||||
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i]));
|
||||
}
|
||||
@@ -47,7 +47,7 @@ void CPUReference(
|
||||
{
|
||||
for(unsigned int j=0; j < NUM; j++)
|
||||
{
|
||||
|
||||
|
||||
output[j]= a*input[j] + output[j];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,7 +33,7 @@ void
|
||||
__launch_bounds__(256, 2)
|
||||
myKern(hipLaunchParm lp, int *C, const int *A, int N, int xfactor)
|
||||
{
|
||||
int tid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
|
||||
if (tid < N) {
|
||||
C[tid] = A[tid];
|
||||
|
||||
@@ -34,7 +34,7 @@ THE SOFTWARE.
|
||||
#define NUM_STREAMS 2
|
||||
|
||||
__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
|
||||
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
// Kernel loop designed to execute very slowly... ... ... so we can test timing-related behavior below
|
||||
if(tx == 0){
|
||||
for(int i = 0; i<num;i++){
|
||||
|
||||
@@ -33,7 +33,7 @@ THE SOFTWARE.
|
||||
#define SIZE LEN*sizeof(float)
|
||||
|
||||
__global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd){
|
||||
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
Cd[tx] = Ad[tx] + Bd[tx];
|
||||
}
|
||||
|
||||
|
||||
@@ -33,13 +33,13 @@
|
||||
#define SIZE LEN*sizeof(float)
|
||||
|
||||
__global__ void Add(float *Ad, float *Bd, float *Cd){
|
||||
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
Cd[tx] = Ad[tx] + Bd[tx];
|
||||
}
|
||||
|
||||
|
||||
__global__ void Set(int *Ad, int val){
|
||||
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
Ad[tx] = val;
|
||||
}
|
||||
|
||||
@@ -52,13 +52,13 @@ std::vector<std::string> syncMsg = {"event", "stream", "device"};
|
||||
|
||||
void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMethod, std::string msg)
|
||||
{
|
||||
std::cerr << "test: CheckHostPointer " << msg
|
||||
std::cerr << "test: CheckHostPointer " << msg
|
||||
//<< " HIP_COHERENT_HOST_ALLOC=" << HIP_COHERENT_HOST_ALLOC
|
||||
//<< " HIP_EVENT_SYS_RELEASE=" << HIP_EVENT_SYS_RELEASE
|
||||
<< " eventFlags = " << std::hex << eventFlags
|
||||
<< ((eventFlags & hipEventReleaseToDevice) ? " hipEventReleaseToDevice" : "")
|
||||
<< ((eventFlags & hipEventReleaseToSystem) ? " hipEventReleaseToSystem" : "")
|
||||
<< " ptr=" << ptr
|
||||
<< " eventFlags = " << std::hex << eventFlags
|
||||
<< ((eventFlags & hipEventReleaseToDevice) ? " hipEventReleaseToDevice" : "")
|
||||
<< ((eventFlags & hipEventReleaseToSystem) ? " hipEventReleaseToSystem" : "")
|
||||
<< " ptr=" << ptr
|
||||
<< " syncMethod=" << syncMsg[syncMethod] << "\n";
|
||||
|
||||
hipStream_t s;
|
||||
@@ -93,7 +93,7 @@ void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMe
|
||||
default:
|
||||
assert(0);
|
||||
};
|
||||
|
||||
|
||||
for (int i=0; i<numElements; i++) {
|
||||
if (ptr[i] != expected) {
|
||||
printf ("mismatch at %d: %d != %d\n", i, ptr[i], expected);
|
||||
@@ -153,7 +153,7 @@ int main(){
|
||||
size_t sizeBytes = numElements * sizeof (int);
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
{
|
||||
{
|
||||
// Stimulate error condition:
|
||||
int *A = &numElements;
|
||||
HIPCHECK_API(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocCoherent|hipHostMallocNonCoherent), hipErrorInvalidValue);
|
||||
@@ -174,7 +174,7 @@ int main(){
|
||||
// agent-scope releases don't provide host visibility, don't use them here:
|
||||
}
|
||||
|
||||
if (1) {
|
||||
if (1) {
|
||||
int *A = nullptr;
|
||||
HIPCHECK(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocCoherent));
|
||||
const char *ptrType = "coherent";
|
||||
@@ -189,14 +189,14 @@ int main(){
|
||||
|
||||
|
||||
// Check defaults:
|
||||
if (1) {
|
||||
if (1) {
|
||||
int *A = nullptr;
|
||||
HIPCHECK(hipHostMalloc((void**)&A, sizeBytes));
|
||||
const char *ptrType = "default";
|
||||
CheckHostPointer(numElements, A, 0, SYNC_DEVICE, ptrType);
|
||||
CheckHostPointer(numElements, A, 0, SYNC_STREAM, ptrType);
|
||||
CheckHostPointer(numElements, A, 0, SYNC_EVENT, ptrType);
|
||||
|
||||
|
||||
CheckHostPointer(numElements, A, 0, SYNC_DEVICE, ptrType);
|
||||
CheckHostPointer(numElements, A, 0, SYNC_STREAM, ptrType);
|
||||
CheckHostPointer(numElements, A, 0, SYNC_EVENT, ptrType);
|
||||
@@ -206,7 +206,7 @@ int main(){
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
passed();
|
||||
|
||||
}
|
||||
|
||||
@@ -29,13 +29,13 @@ THE SOFTWARE.
|
||||
#include<malloc.h>
|
||||
|
||||
__global__ void Inc(hipLaunchParm lp, float *Ad){
|
||||
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
Ad[tx] = Ad[tx] + float(1);
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internalRegister)
|
||||
void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internalRegister)
|
||||
{
|
||||
A = A + offset;
|
||||
numElements -= offset;
|
||||
@@ -56,7 +56,7 @@ void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internal
|
||||
HIPCHECK(hipMemset(Bd, 13.0f, sizeBytes));
|
||||
|
||||
|
||||
//
|
||||
//
|
||||
HIPCHECK(hipMemcpy(Bd, A, sizeBytes, hipMemcpyHostToDevice));
|
||||
HIPCHECK(hipMemcpy(Bh, Bd, sizeBytes, hipMemcpyDeviceToHost));
|
||||
|
||||
@@ -81,7 +81,7 @@ int main(int argc, char *argv[])
|
||||
|
||||
const size_t size = N * sizeof(float);
|
||||
|
||||
if (p_tests & 0x1) {
|
||||
if (p_tests & 0x1) {
|
||||
float *A, **Ad;
|
||||
int num_devices;
|
||||
HIPCHECK(hipGetDeviceCount(&num_devices));
|
||||
@@ -115,7 +115,7 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
|
||||
|
||||
if (p_tests & 0x6) {
|
||||
if (p_tests & 0x6) {
|
||||
// Sensitize HIP bug if device does not match where the memory was registered.
|
||||
HIPCHECK(hipSetDevice(0));
|
||||
|
||||
@@ -129,7 +129,7 @@ int main(int argc, char *argv[])
|
||||
HIPCHECK(hipMalloc(&Bd, size));
|
||||
|
||||
// TODO - set to 128
|
||||
#define OFFSETS_TO_TRY 128
|
||||
#define OFFSETS_TO_TRY 128
|
||||
assert (N>OFFSETS_TO_TRY);
|
||||
|
||||
if (p_tests & 0x2) {
|
||||
|
||||
@@ -59,7 +59,7 @@ struct HostTraits<Pinned>
|
||||
static const char *Name() { return "Pinned"; } ;
|
||||
|
||||
static void *Alloc(size_t sizeBytes) {
|
||||
void *p;
|
||||
void *p;
|
||||
HIPCHECK(hipHostMalloc((void**)&p, sizeBytes, hipHostMallocDefault));
|
||||
return p;
|
||||
};
|
||||
@@ -67,11 +67,11 @@ struct HostTraits<Pinned>
|
||||
|
||||
|
||||
template<typename T>
|
||||
__global__ void
|
||||
__global__ void
|
||||
addK (hipLaunchParm lp, T *A, T K, size_t numElements)
|
||||
{
|
||||
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
size_t stride = hipBlockDim_x * hipGridDim_x ;
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x ;
|
||||
|
||||
for (size_t i=offset; i<numElements; i+=stride) {
|
||||
A[i] = A[i] + K;
|
||||
@@ -85,7 +85,7 @@ addK (hipLaunchParm lp, T *A, T K, size_t numElements)
|
||||
//IN: numInflight : number of copies inflight at any time:
|
||||
//IN: numPongs = number of iterations to run (iteration)
|
||||
template<typename T, class AllocType>
|
||||
void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide)
|
||||
void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide)
|
||||
{
|
||||
HIPASSERT(numElements % numInflight == 0); // Must be evenly divisible.
|
||||
size_t Nbytes = numElements*sizeof(T);
|
||||
@@ -95,7 +95,7 @@ void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
|
||||
|
||||
printf ("-----------------------------------------------------------------------------------------------\n");
|
||||
printf ("testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
|
||||
printf ("testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
|
||||
__func__, HostTraits<AllocType>::Name(), Nbytes, (double)(Nbytes)/1024.0/1024.0, numPongs, numInflight, eachCopyElements, eachCopyBytes);
|
||||
|
||||
T *A_h = NULL;
|
||||
@@ -176,7 +176,7 @@ void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies,
|
||||
size_t eachCopyBytes = eachCopyElements * sizeof(T);
|
||||
|
||||
printf ("-----------------------------------------------------------------------------------------------\n");
|
||||
printf ("testing: %s Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
|
||||
printf ("testing: %s Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
|
||||
__func__, Nbytes, (double)(Nbytes)/1024.0/1024.0, numCopies, eachCopyElements, eachCopyBytes);
|
||||
|
||||
T *A_d;
|
||||
@@ -194,7 +194,7 @@ void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies,
|
||||
//stream=0; // fixme TODO
|
||||
|
||||
|
||||
for (int i=0; i<numCopies; i++)
|
||||
for (int i=0; i<numCopies; i++)
|
||||
{
|
||||
HIPASSERT(A_d + i*eachCopyElements < A_d + Nbytes);
|
||||
HIPCHECK(hipMemcpyAsync(&A_d[i*eachCopyElements], &A_h1[i*eachCopyElements], eachCopyBytes, hipMemcpyHostToDevice, stream));
|
||||
@@ -204,7 +204,7 @@ void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies,
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
}
|
||||
|
||||
for (int i=0; i<numCopies; i++)
|
||||
for (int i=0; i<numCopies; i++)
|
||||
{
|
||||
HIPASSERT(A_d + i*eachCopyElements < A_d + Nbytes);
|
||||
HIPCHECK(hipMemcpyAsync(&A_h2[i*eachCopyElements], &A_d[i*eachCopyElements], eachCopyBytes, hipMemcpyDeviceToHost, stream));
|
||||
@@ -252,7 +252,7 @@ void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemc
|
||||
|
||||
|
||||
hipStream_t *stream = (hipStream_t*)malloc(sizeof(hipStream_t) * nStreams);
|
||||
if (useNullStream) {
|
||||
if (useNullStream) {
|
||||
nStreams = 1;
|
||||
stream[0] = NULL;
|
||||
} else {
|
||||
@@ -262,7 +262,7 @@ void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemc
|
||||
}
|
||||
|
||||
|
||||
size_t workLeft = N;
|
||||
size_t workLeft = N;
|
||||
size_t workPerStream = N / nStreams;
|
||||
for (int i = 0; i < nStreams; ++i) {
|
||||
size_t work = (workLeft < workPerStream) ? workLeft : workPerStream;
|
||||
@@ -287,7 +287,7 @@ void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemc
|
||||
} else {
|
||||
HIPCHECK ( hipMemcpyAsync(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost, stream[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
HIPCHECK (hipDeviceSynchronize());
|
||||
|
||||
@@ -31,7 +31,7 @@ THE SOFTWARE.
|
||||
|
||||
__global__ void Kernel(hipLaunchParm lp,volatile float* hostRes)
|
||||
{
|
||||
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
hostRes[tid] = tid + 1;
|
||||
__threadfence_system();
|
||||
// expecting that the data is getting flushed to host here!
|
||||
|
||||
@@ -24,7 +24,7 @@ THE SOFTWARE.
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
|
||||
* RUN: %t EXCLUDE_HIP_PLATFORM all
|
||||
* RUN: %t EXCLUDE_HIP_PLATFORM all
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
@@ -63,8 +63,8 @@ int enablePeers(int dev0, int dev1)
|
||||
__global__ void
|
||||
memsetIntKernel(int * ptr, const int val, size_t numElements)
|
||||
{
|
||||
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
int stride = hipBlockDim_x * hipGridDim_x ;
|
||||
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int stride = blockDim.x * gridDim.x ;
|
||||
for (size_t i= gid; i< numElements; i+=stride){
|
||||
ptr[i] = val;
|
||||
}
|
||||
@@ -73,15 +73,15 @@ memsetIntKernel(int * ptr, const int val, size_t numElements)
|
||||
__global__ void
|
||||
memcpyIntKernel(const int * src, int* dst, size_t numElements)
|
||||
{
|
||||
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
int stride = hipBlockDim_x * hipGridDim_x ;
|
||||
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int stride = blockDim.x * gridDim.x ;
|
||||
for (size_t i= gid; i< numElements; i+=stride){
|
||||
dst[i] = src[i];
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// CHeck arrays in reverse order, to more easily detect cases where
|
||||
// CHeck arrays in reverse order, to more easily detect cases where
|
||||
// the copy is "partially" done.
|
||||
void checkReverse(const int *ptr, int numElements, int expected) {
|
||||
for (int i=numElements-1; i>=0; i--) {
|
||||
|
||||
@@ -34,7 +34,7 @@ THE SOFTWARE.
|
||||
#define kernel_name "hello_world"
|
||||
|
||||
__global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){
|
||||
int tx = hipThreadIdx_x;
|
||||
int tx = threadIdx.x;
|
||||
Bd[tx] = Ad[tx];
|
||||
}
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ THE SOFTWARE.
|
||||
|
||||
extern "C" __global__ void hello_world(hipLaunchParm lp, float *a, float *b)
|
||||
{
|
||||
int tx = hipThreadIdx_x;
|
||||
int tx = threadIdx.x;
|
||||
b[tx] = a[tx];
|
||||
}
|
||||
|
||||
|
||||
@@ -35,7 +35,7 @@ THE SOFTWARE.
|
||||
|
||||
template<typename T>
|
||||
__global__ void Inc(hipLaunchParm lp, T *Array){
|
||||
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
Array[tx] = Array[tx] + T(1);
|
||||
}
|
||||
|
||||
@@ -116,7 +116,7 @@ int main(int argc, char **argv)
|
||||
}
|
||||
|
||||
const size_t size = N * sizeof(float);
|
||||
|
||||
|
||||
for (int i=0; i< iterations; i++) {
|
||||
|
||||
std::thread t1(run1, size, stream[0]);
|
||||
@@ -126,7 +126,7 @@ int main(int argc, char **argv)
|
||||
// std::cout<<"T1"<<std::endl;
|
||||
t2.join();
|
||||
// std::cout<<"T2"<<std::endl;
|
||||
t3.join();
|
||||
t3.join();
|
||||
}
|
||||
passed();
|
||||
}
|
||||
|
||||
@@ -29,7 +29,7 @@ THE SOFTWARE.
|
||||
const int NN = 1 << 21;
|
||||
|
||||
__global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
if(tid < 1){
|
||||
for(int i=0;i<n;i++){
|
||||
x[i] = sqrt(powf(3.14159,i));
|
||||
@@ -39,7 +39,7 @@ __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
|
||||
}
|
||||
|
||||
__global__ void nKernel(hipLaunchParm lp, float *y){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
y[tid] = y[tid] + 1.0f;
|
||||
}
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ THE SOFTWARE.
|
||||
const int NN = 1 << 21;
|
||||
|
||||
__global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
if(tid < 1){
|
||||
for(int i=0;i<n;i++){
|
||||
x[i] = sqrt(powf(3.14159,i));
|
||||
@@ -41,7 +41,7 @@ __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
|
||||
}
|
||||
|
||||
__global__ void nKernel(hipLaunchParm lp, float *y){
|
||||
int tid = hipThreadIdx_x;
|
||||
int tid = threadIdx.x;
|
||||
y[tid] = y[tid] + 1.0f;
|
||||
}
|
||||
|
||||
|
||||
@@ -41,8 +41,8 @@ vectorADDRepeat(hipLaunchParm lp,
|
||||
size_t NELEM,
|
||||
int repeat)
|
||||
{
|
||||
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
size_t stride = hipBlockDim_x * hipGridDim_x ;
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x ;
|
||||
|
||||
for (int j=1; j<=repeat;j++) {
|
||||
for (size_t i=offset; i<NELEM; i+=stride) {
|
||||
|
||||
@@ -73,7 +73,7 @@ void D2H(T *Dst, T *Src, size_t size){
|
||||
|
||||
template<typename T>
|
||||
__global__ void Inc(hipLaunchParm lp, T *In){
|
||||
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
In[tx] = In[tx] + 1;
|
||||
}
|
||||
|
||||
@@ -94,12 +94,12 @@ void initArrays(T **Ad, T **Ah,
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void initArrays(T **Ad, size_t N,
|
||||
bool deviceMemory = false,
|
||||
void initArrays(T **Ad, size_t N,
|
||||
bool deviceMemory = false,
|
||||
bool usePinnedHost = false){
|
||||
size_t NBytes = N * sizeof(T);
|
||||
if(deviceMemory){
|
||||
HIPCHECK( hipMalloc(Ad, NBytes));
|
||||
HIPCHECK( hipMalloc(Ad, NBytes));
|
||||
}else{
|
||||
if(usePinnedHost){
|
||||
HIPCHECK(hipHostMalloc((void**)Ad, NBytes, hipHostMallocDefault));
|
||||
|
||||
@@ -27,14 +27,14 @@ THE SOFTWARE.
|
||||
// TODO - add code object support here.
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
|
||||
* RUN: %t
|
||||
* RUN: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
|
||||
// Test cache management (fences) and synchronization between kernel and copy commands.
|
||||
// Exhaustively tests 3 command types (copy, kernel, module kernel),
|
||||
// many sync types (see SyncType), followed by another command, across a sweep
|
||||
// Exhaustively tests 3 command types (copy, kernel, module kernel),
|
||||
// many sync types (see SyncType), followed by another command, across a sweep
|
||||
// of data sizes designed to stress various levels of the memory hierarchy.
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
@@ -102,8 +102,8 @@ MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel");
|
||||
__global__ void
|
||||
memsetIntKernel(int * ptr, const int val, size_t numElements)
|
||||
{
|
||||
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
int stride = hipBlockDim_x * hipGridDim_x ;
|
||||
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int stride = blockDim.x * gridDim.x ;
|
||||
for (size_t i= gid; i< numElements; i+=stride){
|
||||
ptr[i] = val;
|
||||
}
|
||||
@@ -112,15 +112,15 @@ memsetIntKernel(int * ptr, const int val, size_t numElements)
|
||||
__global__ void
|
||||
memcpyIntKernel(int *dst, const int * src, size_t numElements)
|
||||
{
|
||||
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
int stride = hipBlockDim_x * hipGridDim_x ;
|
||||
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int stride = blockDim.x * gridDim.x ;
|
||||
for (size_t i= gid; i< numElements; i+=stride){
|
||||
dst[i] = src[i];
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// CHeck arrays in reverse order, to more easily detect cases where
|
||||
// CHeck arrays in reverse order, to more easily detect cases where
|
||||
// the copy is "partially" done.
|
||||
void checkReverse(const int *ptr, int numElements, int expected) {
|
||||
int mismatchCnt = 0;
|
||||
@@ -157,7 +157,7 @@ const char * CmdTypeStr(CmdType c)
|
||||
ENUM_CASE_STR(MODULE_KERNEL);
|
||||
default: return "UNKNOWN";
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
enum SyncType {
|
||||
@@ -223,16 +223,16 @@ void resetInputs( int * Ad, int * Bd, int *Cd, int *Ch, size_t numElements, int
|
||||
|
||||
// Intended to test proper synchronization and cache flushing between CMDA and CMDB.
|
||||
// CMD are of type CmdType. All command copy memory, using either hipMemcpyAsync or kernel implementations.
|
||||
// CmdA copies from Ad to Bd,
|
||||
// CmdA copies from Ad to Bd,
|
||||
// Some form of synchronization is applied.
|
||||
// Then cmdB copies from Bd to Cd.
|
||||
//
|
||||
// Cd is then copied to host Ch using a memory copy.
|
||||
//
|
||||
// Correct result at the end is that Ch contains the contents originally in Ad (integer 0x42)
|
||||
void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType,
|
||||
void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType,
|
||||
hipStream_t stream1, hipStream_t stream2, int numElements,
|
||||
int * Ad, int * Bd, int *Cd, int *Ch,
|
||||
int * Ad, int * Bd, int *Cd, int *Ch,
|
||||
int expected)
|
||||
{
|
||||
hipEvent_t e;
|
||||
@@ -241,14 +241,14 @@ void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType,
|
||||
resetInputs(Ad, Bd, Cd, Ch, numElements, expected);
|
||||
|
||||
const size_t sizeElements = numElements * sizeof(int);
|
||||
fprintf (stderr, "test: runTest with %zu bytes (%6.2f MB) cmdA=%s; sync=%s; cmdB=%s\n",
|
||||
fprintf (stderr, "test: runTest with %zu bytes (%6.2f MB) cmdA=%s; sync=%s; cmdB=%s\n",
|
||||
sizeElements, (double) (sizeElements/1024.0), CmdTypeStr(cmdAType), SyncTypeStr(syncType), CmdTypeStr(cmdBType));
|
||||
|
||||
if (SKIP_MODULE_KERNEL && ((cmdAType == MODULE_KERNEL) || (cmdBType == MODULE_KERNEL))) {
|
||||
fprintf (stderr, "warn: skipping since test infra does not yet support modules\n");
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Step A:
|
||||
runCmd(cmdAType, Bd, Ad, stream1, numElements);
|
||||
@@ -334,7 +334,7 @@ void testWrapper(size_t numElements)
|
||||
fprintf (stderr, "test: init complete, start running tests\n");
|
||||
|
||||
|
||||
runTestImpl(COPY, EVENT_SYNC, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
|
||||
runTestImpl(COPY, EVENT_SYNC, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
|
||||
|
||||
for (int cmdA=0; cmdA<MAX_CmdType; cmdA++) {
|
||||
for (int cmdB=0; cmdB<MAX_CmdType; cmdB++) {
|
||||
@@ -347,7 +347,7 @@ void testWrapper(size_t numElements)
|
||||
//case STREAM_QUERY:
|
||||
case STREAM_SYNC:
|
||||
case DEVICE_SYNC:
|
||||
runTestImpl(CmdType(cmdA), SyncType(syncMode), CmdType(cmdB), stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
|
||||
runTestImpl(CmdType(cmdA), SyncType(syncMode), CmdType(cmdB), stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@@ -358,11 +358,11 @@ void testWrapper(size_t numElements)
|
||||
}
|
||||
|
||||
#if 0
|
||||
runTestImpl(COPY, STREAM_SYNC, MODULE_KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
|
||||
runTestImpl(COPY, STREAM_SYNC, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
|
||||
runTestImpl(COPY, STREAM_WAIT_EVENT, MODULE_KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
|
||||
runTestImpl(COPY, STREAM_SYNC, MODULE_KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
|
||||
runTestImpl(COPY, STREAM_SYNC, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
|
||||
runTestImpl(COPY, STREAM_WAIT_EVENT, MODULE_KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
|
||||
|
||||
runTestImpl(COPY, STREAM_WAIT_EVENT, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
|
||||
runTestImpl(COPY, STREAM_WAIT_EVENT, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
|
||||
#endif
|
||||
|
||||
HIPCHECK(hipFree(Ad));
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
|
||||
|
||||
|
||||
extern "C" __global__ void
|
||||
memcpyIntKernel(hipLaunchParm lp, int *dst, const int * src, size_t numElements)
|
||||
{
|
||||
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
int stride = hipBlockDim_x * hipGridDim_x ;
|
||||
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int stride = blockDim.x * gridDim.x ;
|
||||
for (size_t i= gid; i< numElements; i+=stride){
|
||||
dst[i] = src[i];
|
||||
}
|
||||
|
||||
@@ -30,7 +30,7 @@ THE SOFTWARE.
|
||||
#define ITER 1<<10
|
||||
|
||||
__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
|
||||
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if(tx == 0){
|
||||
for(int i = 0; i<num;i++){
|
||||
Ad[tx] += 1;
|
||||
@@ -58,7 +58,7 @@ int main(){
|
||||
}
|
||||
|
||||
std::cout<<"Waitin..."<<std::endl;
|
||||
|
||||
|
||||
hipDeviceSynchronize();
|
||||
}
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ THE SOFTWARE.
|
||||
static size_t size[NUM_SIZE];
|
||||
|
||||
__global__ void Add(hipLaunchParm lp, int *Ad){
|
||||
int tx = hipThreadIdx_x;
|
||||
int tx = threadIdx.x;
|
||||
Ad[tx] = Ad[tx] + tx;
|
||||
}
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ THE SOFTWARE.
|
||||
static size_t size[NUM_SIZE];
|
||||
|
||||
__global__ void Add(hipLaunchParm lp, int *Ad){
|
||||
int tx = hipThreadIdx_x;
|
||||
int tx = threadIdx.x;
|
||||
Ad[tx] = Ad[tx] + tx;
|
||||
}
|
||||
|
||||
|
||||
@@ -30,7 +30,7 @@ THE SOFTWARE.
|
||||
#define ITER 1<<10
|
||||
|
||||
__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
|
||||
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
|
||||
int tx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if(tx == 0){
|
||||
for(int i = 0; i<num;i++){
|
||||
Ad[tx] += 1;
|
||||
@@ -57,7 +57,7 @@ int main(){
|
||||
}
|
||||
|
||||
std::cout<<"Waitin..."<<std::endl;
|
||||
|
||||
|
||||
hipDeviceSynchronize();
|
||||
}
|
||||
|
||||
|
||||
@@ -136,8 +136,8 @@ vectorADD(hipLaunchParm lp,
|
||||
T *C_d,
|
||||
size_t NELEM)
|
||||
{
|
||||
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
size_t stride = hipBlockDim_x * hipGridDim_x ;
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x ;
|
||||
|
||||
for (size_t i=offset; i<NELEM; i+=stride) {
|
||||
C_d[i] = A_d[i] + B_d[i];
|
||||
@@ -153,8 +153,8 @@ vectorADDReverse(hipLaunchParm lp,
|
||||
T *C_d,
|
||||
size_t NELEM)
|
||||
{
|
||||
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
size_t stride = hipBlockDim_x * hipGridDim_x ;
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x ;
|
||||
|
||||
for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
|
||||
C_d[i] = A_d[i] + B_d[i];
|
||||
@@ -169,8 +169,8 @@ addCount( const T *A_d,
|
||||
size_t NELEM,
|
||||
int count)
|
||||
{
|
||||
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
size_t stride = hipBlockDim_x * hipGridDim_x ;
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x ;
|
||||
|
||||
// Deliberately do this in an inefficient way to increase kernel runtime
|
||||
for (int i=0; i<count; i++) {
|
||||
@@ -188,8 +188,8 @@ addCountReverse( const T *A_d,
|
||||
int64_t NELEM,
|
||||
int count)
|
||||
{
|
||||
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
size_t stride = hipBlockDim_x * hipGridDim_x ;
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x ;
|
||||
|
||||
// Deliberately do this in an inefficient way to increase kernel runtime
|
||||
for (int i=0; i<count; i++) {
|
||||
@@ -205,8 +205,8 @@ __global__ void
|
||||
memsetReverse( T *C_d, T val,
|
||||
int64_t NELEM)
|
||||
{
|
||||
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
|
||||
size_t stride = hipBlockDim_x * hipGridDim_x ;
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x ;
|
||||
|
||||
for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
|
||||
C_d[i] = val;
|
||||
@@ -219,12 +219,12 @@ void setDefaultData(size_t numElements, T *A_h, T* B_h, T *C_h)
|
||||
{
|
||||
// Initialize the host data:
|
||||
for (size_t i=0; i<numElements; i++) {
|
||||
if (A_h)
|
||||
if (A_h)
|
||||
(A_h)[i] = 3.146f + i; // Pi
|
||||
if (B_h)
|
||||
if (B_h)
|
||||
(B_h)[i] = 1.618f + i; // Phi
|
||||
if (C_h)
|
||||
(C_h)[i] = 0.0f + i;
|
||||
if (C_h)
|
||||
(C_h)[i] = 0.0f + i;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -268,8 +268,8 @@ void initArraysForHost(T **A_h, T **B_h, T **C_h,
|
||||
|
||||
template <typename T>
|
||||
void initArrays(T **A_d, T **B_d, T **C_d,
|
||||
T **A_h, T **B_h, T **C_h,
|
||||
size_t N, bool usePinnedHost=false)
|
||||
T **A_h, T **B_h, T **C_h,
|
||||
size_t N, bool usePinnedHost=false)
|
||||
{
|
||||
size_t Nbytes = N*sizeof(T);
|
||||
|
||||
@@ -317,7 +317,7 @@ void freeArraysForHost(T *A_h, T *B_h, T *C_h, bool usePinnedHost)
|
||||
|
||||
template <typename T>
|
||||
void freeArrays(T *A_d, T *B_d, T *C_d,
|
||||
T *A_h, T *B_h, T *C_h, bool usePinnedHost)
|
||||
T *A_h, T *B_h, T *C_h, bool usePinnedHost)
|
||||
{
|
||||
if (A_d) {
|
||||
HIPCHECK ( hipFree(A_d) );
|
||||
@@ -453,9 +453,9 @@ struct Pinned {
|
||||
static const bool isPinned = true;
|
||||
static const char *str() { return "Pinned"; };
|
||||
|
||||
static void *Alloc(size_t sizeBytes)
|
||||
static void *Alloc(size_t sizeBytes)
|
||||
{
|
||||
void *p;
|
||||
void *p;
|
||||
HIPCHECK(hipHostMalloc((void**)&p, sizeBytes));
|
||||
return p;
|
||||
};
|
||||
@@ -463,12 +463,12 @@ struct Pinned {
|
||||
|
||||
|
||||
//---
|
||||
struct Unpinned
|
||||
struct Unpinned
|
||||
{
|
||||
static const bool isPinned = false;
|
||||
static const char *str() { return "Unpinned"; };
|
||||
|
||||
static void *Alloc(size_t sizeBytes)
|
||||
static void *Alloc(size_t sizeBytes)
|
||||
{
|
||||
void *p = malloc (sizeBytes);
|
||||
HIPASSERT(p);
|
||||
@@ -496,7 +496,7 @@ template<>
|
||||
struct MemTraits<Memcpy>
|
||||
{
|
||||
|
||||
static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
|
||||
static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
|
||||
{
|
||||
HIPCHECK(hipMemcpy(dest, src, sizeBytes, kind));
|
||||
}
|
||||
@@ -507,7 +507,7 @@ template<>
|
||||
struct MemTraits<MemcpyAsync>
|
||||
{
|
||||
|
||||
static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
|
||||
static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
|
||||
{
|
||||
HIPCHECK(hipMemcpyAsync(dest, src, sizeBytes, kind, stream));
|
||||
}
|
||||
|
||||
@@ -17,8 +17,8 @@ __global__ void tex2DKernel(float* outputData,
|
||||
int width,
|
||||
int height)
|
||||
{
|
||||
int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
|
||||
int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y;
|
||||
int x = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
outputData[y*width + x] = tex2D<float>(textureObject, x, y);
|
||||
}
|
||||
|
||||
|
||||
@@ -18,8 +18,8 @@ __global__ void tex2DKernel(float* outputData,
|
||||
int width,
|
||||
int height)
|
||||
{
|
||||
int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
|
||||
int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y;
|
||||
int x = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
outputData[y*width + x] = tex2D(tex, textureObject, x, y);
|
||||
#else
|
||||
|
||||
Ссылка в новой задаче
Block a user