Merge pull request #224 from ChrisKitching/tests

[HIPIFY] Make the automated tests more useful
2017-10-24 20:12:36 +03:00
@@ -12,11 +12,13 @@ endif()

 option(HIPIFY_CLANG_TESTS "Build the tests for hipify-clang, if lit is installed" ON)

-# Disable the tests if `lit` is not installed.
+# Disable the tests if `lit` or `FileCheck` is not installed.
 find_program(LIT_COMMAND lit)
-if (NOT LIT_COMMAND)
+find_program(FILECHECK_COMMAND FileCheck)
+find_program(SOCAT_COMMAND socat)
+if (NOT LIT_COMMAND OR NOT FILECHECK_COMMAND OR NOT SOCAT_COMMAND)
    set(HIPIFY_CLANG_TESTS OFF CACHE INTERNAL "")
-    message(STATUS "hipify-clang's tests are not being built because `lit` is not installed.")
+    message(STATUS "hipify-clang's tests are not being built because `lit`,`FileCheck` or `socat` could not be found.")
 endif()

 list(APPEND CMAKE_MODULE_PATH ${LLVM_CMAKE_DIR})
@@ -75,9 +77,12 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHIPIFY_CLANG_RES=\\\"${LLVM_LIBRARY_DI
 install(TARGETS hipify-clang DESTINATION bin)

 if (HIPIFY_CLANG_TESTS)
-    # tests
    find_package(PythonInterp 2.7 REQUIRED EXACT)

+    # Populates CUDA_TOOLKIT_ROOT_DIR, which is then applied to the lit config to give the
+    # value of --cuda-path for the test runs.
+    find_package(CUDA REQUIRED)
+
    configure_file(
        ${CMAKE_CURRENT_LIST_DIR}/../tests/hipify-clang/lit.site.cfg.in
        ${CMAKE_CURRENT_BINARY_DIR}/tests/hipify-clang/lit.site.cfg
@@ -87,7 +92,8 @@ if (HIPIFY_CLANG_TESTS)
    add_lit_testsuite(test-hipify "Running HIPify regression tests"
        ${CMAKE_CURRENT_LIST_DIR}/../tests/hipify-clang
        PARAMS site_config=${CMAKE_CURRENT_BINARY_DIR}/tests/hipify-clang/lit.site.cfg
-        DEPENDS hipify-clang lit
+        ARGS -v
+        DEPENDS hipify-clang
    )

    add_custom_target(test-hipify-clang)
@@ -1,9 +1,9 @@
-// RUN: hipify "%s" -o=%t --
+// RUN: %run_test hipify "%s" "%t" %cuda_args

 #include <iostream>

 __global__ void axpy(float a, float* x, float* y) {
-  // RUN: sh -c "test `grep -c -F 'y[hipThreadIdx_x] = a * x[hipThreadIdx_x];' %t` -eq 2"
+  // CHECK: y[hipThreadIdx_x] = a * x[hipThreadIdx_x];
  y[threadIdx.x] = a * x[threadIdx.x];
 }

@@ -17,21 +17,25 @@ int main(int argc, char* argv[]) {
  // Copy input data to device.
  float* device_x;
  float* device_y;
-  // RUN: sh -c "test `grep -c -F 'hipMalloc(&device_x, kDataLen * sizeof(float));' %t` -eq 2"
+
+  // CHECK: hipMalloc(&device_x, kDataLen * sizeof(float));
  cudaMalloc(&device_x, kDataLen * sizeof(float));
-  // RUN: sh -c "test `grep -c -F 'hipMalloc(&device_y, kDataLen * sizeof(float));' %t` -eq 2"
+
+  // CHECK: hipMalloc(&device_y, kDataLen * sizeof(float));
  cudaMalloc(&device_y, kDataLen * sizeof(float));
-  // RUN: sh -c "test `grep -c -F 'hipMemcpy(device_x, host_x, kDataLen * sizeof(float), hipMemcpyHostToDevice);' %t` -eq 2"
+
+  // CHECK: hipMemcpy(device_x, host_x, kDataLen * sizeof(float), hipMemcpyHostToDevice);
  cudaMemcpy(device_x, host_x, kDataLen * sizeof(float), cudaMemcpyHostToDevice);

  // Launch the kernel.
-  // RUN: sh -c "test `grep -c -F 'hipLaunchKernel(HIP_KERNEL_NAME(axpy), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);' %t` -eq 2"
+  // CHECK: hipLaunchKernelGGL(axpy, dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
  axpy<<<1, kDataLen>>>(a, device_x, device_y);

  // Copy output data to host.
-  // RUN: sh -c "test `grep -c -F 'hipDeviceSynchronize();' %t` -eq 2"
+  // CHECK: hipDeviceSynchronize();
  cudaDeviceSynchronize();
-  // RUN: sh -c "test `grep -c -F 'hipMemcpy(host_y, device_y, kDataLen * sizeof(float), hipMemcpyDeviceToHost);' %t` -eq 2"
+
+  // CHECK: hipMemcpy(host_y, device_y, kDataLen * sizeof(float), hipMemcpyDeviceToHost);
  cudaMemcpy(host_y, device_y, kDataLen * sizeof(float), cudaMemcpyDeviceToHost);

  // Print the results.
@@ -39,7 +43,7 @@ int main(int argc, char* argv[]) {
    std::cout << "y[" << i << "] = " << host_y[i] << "\n";
  }

-  // RUN: sh -c "test `grep -c -F 'hipDeviceReset();' %t` -eq 2"
+  // CHECK: hipDeviceReset();
  cudaDeviceReset();
  return 0;
 }
@@ -0,0 +1,240 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+/*
+ * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+//
+// This sample demonstrates the use of streams for concurrent execution. It also illustrates how to
+// introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function introduced
+// in CUDA 3.2.
+//
+// Devices of compute capability 1.x will run the kernels one after another
+// Devices of compute capability 2.0 or higher can overlap the kernels
+//
+#include <stdio.h>
+#include <helper_functions.h>
+#include <helper_cuda.h>
+
+// This is a kernel that does no real work but runs at least for a specified number of clocks
+__global__ void clock_block(clock_t *d_o, clock_t clock_count)
+{
+    unsigned int start_clock = (unsigned int) clock();
+
+    clock_t clock_offset = 0;
+
+    while (clock_offset < clock_count)
+    {
+        unsigned int end_clock = (unsigned int) clock();
+
+        // The code below should work like
+        // this (thanks to modular arithmetics):
+        //
+        // clock_offset = (clock_t) (end_clock > start_clock ?
+        //                           end_clock - start_clock :
+        //                           end_clock + (0xffffffffu - start_clock));
+        //
+        // Indeed, let m = 2^32 then
+        // end - start = end + m - start (mod m).
+
+        clock_offset = (clock_t)(end_clock - start_clock);
+    }
+
+    d_o[0] = clock_offset;
+}
+
+
+// Single warp reduction kernel
+__global__ void sum(clock_t *d_clocks, int N)
+{
+    __shared__ clock_t s_clocks[32];
+
+    clock_t my_sum = 0;
+
+    for (int i = threadIdx.x; i < N; i+= blockDim.x)
+    {
+        my_sum += d_clocks[i];
+    }
+
+    s_clocks[threadIdx.x] = my_sum;
+    syncthreads();
+
+    for (int i=16; i>0; i/=2)
+    {
+        if (threadIdx.x < i)
+        {
+            s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
+        }
+
+        syncthreads();
+    }
+
+    d_clocks[0] = s_clocks[0];
+}
+
+int main(int argc, char **argv)
+{
+    int nkernels = 8;               // number of concurrent kernels
+    int nstreams = nkernels + 1;    // use one more stream than concurrent kernel
+    int nbytes = nkernels * sizeof(clock_t);   // number of data bytes
+    float kernel_time = 10; // time the kernel should run in ms
+    float elapsed_time;   // timing variables
+    int cuda_device = 0;
+
+    printf("[%s] - Starting...\n", argv[0]);
+
+    // get number of kernels if overridden on the command line
+    if (checkCmdLineFlag(argc, (const char **)argv, "nkernels"))
+    {
+        nkernels = getCmdLineArgumentInt(argc, (const char **)argv, "nkernels");
+        nstreams = nkernels + 1;
+    }
+
+    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
+    cuda_device = findCudaDevice(argc, (const char **)argv);
+
+    cudaDeviceProp deviceProp;
+    // CHECK: checkCudaErrors(hipGetDevice(&cuda_device));
+    checkCudaErrors(cudaGetDevice(&cuda_device));
+
+    // CHECK: checkCudaErrors(hipGetDeviceProperties(&deviceProp, cuda_device));
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
+
+    if ((deviceProp.concurrentKernels == 0))
+    {
+        printf("> GPU does not support concurrent kernel execution\n");
+        printf("  CUDA kernel runs will be serialized\n");
+    }
+
+    printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
+           deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
+
+    // allocate host memory
+    clock_t *a = 0;                     // pointer to the array data in host memory
+    // CHECK: checkCudaErrors(hipHostMalloc((void **)&a, nbytes));
+    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
+
+    // allocate device memory
+    clock_t *d_a = 0;             // pointers to data and init value in the device memory
+    // CHECK: checkCudaErrors(hipMalloc((void **)&d_a, nbytes));
+    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
+
+    // CHECK: hipStream_t *streams = (hipStream_t *) malloc(nstreams * sizeof(hipStream_t));
+    // allocate and initialize an array of stream handles
+    cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));
+
+    for (int i = 0; i < nstreams; i++)
+    {
+        // CHECK: checkCudaErrors(hipStreamCreate(&(streams[i])));
+        checkCudaErrors(cudaStreamCreate(&(streams[i])));
+    }
+
+    // create CUDA event handles
+    cudaEvent_t start_event, stop_event;
+
+    // CHECK: checkCudaErrors(hipEventCreate(&start_event));
+    // CHECK: checkCudaErrors(hipEventCreate(&stop_event));
+    checkCudaErrors(cudaEventCreate(&start_event));
+    checkCudaErrors(cudaEventCreate(&stop_event));
+
+    // the events are used for synchronization only and hence do not need to record timings
+    // this also makes events not introduce global sync points when recorded which is critical to get overlap
+
+    // CHECK: hipEvent_t *kernelEvent;
+    // CHECK: kernelEvent = (hipEvent_t *) malloc(nkernels * sizeof(hipEvent_t));
+    cudaEvent_t *kernelEvent;
+    kernelEvent = (cudaEvent_t *) malloc(nkernels * sizeof(cudaEvent_t));
+
+    for (int i = 0; i < nkernels; i++)
+    {
+        // CHECK: checkCudaErrors(hipEventCreateWithFlags(&(kernelEvent[i]), hipEventDisableTiming));
+        checkCudaErrors(cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
+    }
+
+    //////////////////////////////////////////////////////////////////////
+    // time execution with nkernels streams
+    clock_t total_clocks = 0;
+#if defined(__arm__) || defined(__aarch64__)
+    // the kernel takes more time than the channel reset time on arm archs, so to prevent hangs reduce time_clocks.
+    clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 1000));
+#else
+    clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
+#endif
+
+    // CHECK: hipEventRecord(start_event, 0);
+    cudaEventRecord(start_event, 0);
+
+    // queue nkernels in separate streams and record when they are done
+    for (int i=0; i<nkernels; ++i)
+    {
+        // CHECK: hipLaunchKernelGGL(clock_block, dim3(1), dim3(1), 0, streams[i], &d_a[i], time_clocks);
+        clock_block<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
+        total_clocks += time_clocks;
+
+        // CHECK: checkCudaErrors(hipEventRecord(kernelEvent[i], streams[i]));
+        checkCudaErrors(cudaEventRecord(kernelEvent[i], streams[i]));
+
+        // make the last stream wait for the kernel event to be recorded
+        // CHECK: checkCudaErrors(hipStreamWaitEvent(streams[nstreams-1], kernelEvent[i],0));
+        checkCudaErrors(cudaStreamWaitEvent(streams[nstreams-1], kernelEvent[i],0));
+    }
+
+    // queue a sum kernel and a copy back to host in the last stream.
+    // the commands in this stream get dispatched as soon as all the kernel events have been recorded
+    // CHECK: hipLaunchKernelGGL(sum, dim3(1), dim3(32), 0, streams[nstreams-1], d_a, nkernels);
+    // CHECK: checkCudaErrors(hipMemcpyAsync(a, d_a, sizeof(clock_t), hipMemcpyDeviceToHost, streams[nstreams-1]));
+    sum<<<1,32,0,streams[nstreams-1]>>>(d_a, nkernels);
+    checkCudaErrors(cudaMemcpyAsync(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost, streams[nstreams-1]));
+
+    // at this point the CPU has dispatched all work for the GPU and can continue processing other tasks in parallel
+
+    // in this sample we just wait until the GPU is done
+    // CHECK: checkCudaErrors(hipEventRecord(stop_event, 0));
+    // CHECK: checkCudaErrors(hipEventSynchronize(stop_event));
+    // CHECK: checkCudaErrors(hipEventElapsedTime(&elapsed_time, start_event, stop_event));
+    checkCudaErrors(cudaEventRecord(stop_event, 0));
+    checkCudaErrors(cudaEventSynchronize(stop_event));
+    checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
+
+    printf("Expected time for serial execution of %d kernels = %.3fs\n", nkernels, nkernels * kernel_time/1000.0f);
+    printf("Expected time for concurrent execution of %d kernels = %.3fs\n", nkernels, kernel_time/1000.0f);
+    printf("Measured time for sample = %.3fs\n", elapsed_time/1000.0f);
+
+    bool bTestResult  = (a[0] > total_clocks);
+
+    // release resources
+    for (int i = 0; i < nkernels; i++)
+    {
+        // CHECK: hipStreamDestroy(streams[i]);
+        // CHECK: hipEventDestroy(kernelEvent[i]);
+        cudaStreamDestroy(streams[i]);
+        cudaEventDestroy(kernelEvent[i]);
+    }
+
+    free(streams);
+    free(kernelEvent);
+
+    // CHECK: hipEventDestroy(start_event);
+    // CHECK: hipEventDestroy(stop_event);
+    // CHECK: hipHostFree(a);
+    // CHECK: hipFree(d_a);
+    cudaEventDestroy(start_event);
+    cudaEventDestroy(stop_event);
+    cudaFreeHost(a);
+    cudaFree(d_a);
+
+    if (!bTestResult)
+    {
+        printf("Test failed!\n");
+        exit(EXIT_FAILURE);
+    }
+
+    printf("Test passed\n");
+    exit(EXIT_SUCCESS);
+}
@@ -0,0 +1,111 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<cuda.h>
+#include<cuda_runtime.h>
+#include<iostream>
+#include<unistd.h>
+#include<stdio.h>
+#include<malloc.h>
+
+#define LEN 1024
+#define SIZE LEN * sizeof(float)
+#define ITER 1024*1024
+
+// CHECK: if(status != hipSuccess) {
+#define check(msg, status){ \
+if(status != cudaSuccess) { \
+	printf("%s failed. \n", #msg); \
+} \
+}
+
+__global__ void Inc1(float *Ad, float *Bd){
+  // CHECK: int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+	int tx = threadIdx.x + blockIdx.x * blockDim.x;
+	if(tx < 1 ){
+		for(int i=0;i<ITER;i++){
+			Ad[tx] = Ad[tx] + 1.0f;
+			for(int j=0;j<256;j++){
+				Bd[tx] = Ad[tx];
+			}
+		}
+	}
+}
+
+__global__ void Inc2(float *Ad, float *Bd){
+  // CHECK: int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+	int tx = threadIdx.x + blockIdx.x * blockDim.x;
+	if(tx < 1024){
+		for(int i=0;i<ITER;i++){
+			Ad[tx] = Ad[tx] + 1.0f;
+			for(int j=0;j<256;j++){
+				Bd[tx] = Ad[tx];
+			}
+		}
+	}
+}
+
+int main(){
+	float *A, *Ad, *Bd;
+	A = new float[LEN];
+	for(int i=0;i<LEN;i++){
+		A[i] = 0.0f;
+	}
+
+	// CHECK: hipError_t status;
+	cudaError_t status;
+
+  // CHECK: status = hipHostRegister(A, SIZE, hipHostRegisterMapped);
+	status = cudaHostRegister(A, SIZE, cudaHostRegisterMapped);
+	check("Registering A",status);
+
+  // CHECK: hipHostGetDevicePointer(&Ad, A, 0);
+	cudaHostGetDevicePointer(&Ad, A, 0);
+
+  // CHECK: hipMalloc((void**) &Bd, SIZE);
+	cudaMalloc((void**) &Bd, SIZE);
+	dim3 dimGrid(LEN/512,1,1);
+	dim3 dimBlock(512,1,1);
+
+  // CHECK: hipLaunchKernelGGL(Inc1, dim3(dimGrid), dim3(dimBlock), 0, 0, Ad, Bd);
+	Inc1<<<dimGrid, dimBlock>>>(Ad, Bd);
+	sleep(3);
+	A[0] = -(ITER*1.0f);
+	std::cout<<"Same cache line before completion: \t"<< A[0]<<std::endl;
+
+  // CHECK: hipDeviceSynchronize();
+	cudaDeviceSynchronize();
+	std::cout<<"Same cache line after completion: \t"<< A[0]<<std::endl;
+
+	for(int i=0;i<LEN;i++){
+		A[i] = 0.0f;
+	}
+
+  // CHECK: hipLaunchKernelGGL(Inc2, dim3(dimGrid), dim3(dimBlock), 0, 0, Ad, Bd);
+	Inc2<<<dimGrid, dimBlock>>>(Ad, Bd);
+	sleep(3);
+	A[0] = -(ITER*1.0f);
+	std::cout<<"Diff cache line before completion: \t"<<A[0]<<std::endl;
+
+  // CHECK: hipDeviceSynchronize();
+	cudaDeviceSynchronize();
+	std::cout<<"Diff cache line after completion: \t"<<A[0]<<std::endl;
+}
@@ -46,3 +46,6 @@ if obj_root is not None:

 config.substitutions.append(("hipify", obj_root+"/hipify-clang"))

+# Clang args for CUDA...
+config.substitutions.append(("%cuda_args", "-x cuda --cuda-path=%s --cuda-gpu-arch=sm_30 -isystem%s/samples/common/inc" % (config.cuda_root, config.cuda_root)))
+config.substitutions.append(("%run_test", config.test_source_root + "/run_test.sh"))
@@ -2,6 +2,7 @@ import sys

 config.llvm_tools_dir = "@LLVM_TOOLS_BINARY_DIR@"
 config.obj_root = "@BINARY_DIR@"
+config.cuda_root = "@CUDA_TOOLKIT_ROOT_DIR@"

 # Support substitution of the tools and libs dirs with user parameters. This is
 # used when we can't determine the tool dir at configuration time.
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+set -o errexit
+
+# Run a single LIT test file in a magical way that preserves colour output, to work around
+# a known flaw in lit.
+
+# Capture lit substitutions
+HIPIFY=$1
+IN_FILE=$2
+TMP_FILE=$3
+shift 3
+
+# Remaining args are the ones to forward to clang proper.
+
+# Time for the classic insane little trick for making colour output work.
+# A self-deleting shell-script that does the thing we want to do...
+TMP_SCRIPT=$(mktemp)
+cat << EOF > $TMP_SCRIPT
+set -o errexit
+set -o xtrace
+rm $TMP_SCRIPT
+$HIPIFY -o=$TMP_FILE $IN_FILE -- $@ && cat $TMP_FILE | sed -Ee 's|//.+|// |g' | FileCheck $IN_FILE
+EOF
+chmod a+x $TMP_SCRIPT
+
+# Run the script via socat, spawning a virtual terminal and propagating exit code, and hence failure.
+socat -du EXEC:$TMP_SCRIPT,pty,stderr STDOUT
@@ -0,0 +1,114 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include <stdio.h>
+#include <cuda_runtime.h>
+
+#define CHECK(cmd) \
+{\
+    cudaError_t error  = cmd;\
+    if (error != cudaSuccess) { \
+        fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \
+        exit(EXIT_FAILURE);\
+	  }\
+}
+
+
+/*
+ * Square each element in the array A and write to array C.
+ */
+template <typename T>
+__global__ void
+vector_square(T *C_d, const T *A_d, size_t N)
+{
+    // CHECK: size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    // CHECK: size_t stride = hipBlockDim_x * hipGridDim_x;
+    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x;
+
+    for (size_t i=offset; i<N; i+=stride) {
+        C_d[i] = A_d[i] * A_d[i];
+    }
+}
+
+
+int main(int argc, char *argv[])
+{
+    float *A_d, *C_d;
+    float *A_h, *C_h;
+    size_t N = 1000000;
+    size_t Nbytes = N * sizeof(float);
+
+    // CHECK: hipDeviceProp_t props;
+    cudaDeviceProp props;
+
+    // CHECK: CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/));
+    CHECK(cudaGetDeviceProperties(&props, 0/*deviceID*/));
+    printf ("info: running on device %s\n", props.name);
+
+    printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    A_h = (float*)malloc(Nbytes);
+
+    // CHECK: CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+    CHECK(A_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess );
+    C_h = (float*)malloc(Nbytes);
+
+    // CHECK: CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+    CHECK(C_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess );
+    // Fill with Phi + i
+    for (size_t i=0; i<N; i++)
+    {
+        A_h[i] = 1.618f + i;
+    }
+
+    printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    // CHECK: CHECK(hipMalloc(&A_d, Nbytes));
+    // CHECK: CHECK(hipMalloc(&C_d, Nbytes));
+    CHECK(cudaMalloc(&A_d, Nbytes));
+    CHECK(cudaMalloc(&C_d, Nbytes));
+
+
+    printf ("info: copy Host2Device\n");
+    // CHECK: CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
+    CHECK ( cudaMemcpy(A_d, A_h, Nbytes, cudaMemcpyHostToDevice));
+
+    const unsigned blocks = 512;
+    const unsigned threadsPerBlock = 256;
+
+    printf ("info: launch 'vector_square' kernel\n");
+    // CHECK: hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
+    vector_square <<<blocks, threadsPerBlock>>> (C_d, A_d, N);
+
+    printf ("info: copy Device2Host\n");
+    // CHECK: CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
+    CHECK ( cudaMemcpy(C_h, C_d, Nbytes, cudaMemcpyDeviceToHost));
+
+    printf ("info: check result\n");
+    for (size_t i=0; i<N; i++)  {
+        if (C_h[i] != A_h[i] * A_h[i]) {
+            // CHECK: CHECK(hipErrorUnknown);
+            CHECK(cudaErrorUnknown);
+        }
+    }
+    printf ("PASSED!\n");
+}