From e9d259699c96d723e8cd13a07cedb69a2972692f Mon Sep 17 00:00:00 2001
From: Chris Kitching <chriskitching@linux.com>
Date: Wed, 18 Oct 2017 08:00:58 +0100
Subject: [PATCH 01/10] Remove dependency on nonexistent `lit` target

This just... isn't a target. Maybe you were trying to do a file
dependency? But that's handled implicitly by the find_program
call anyway, so all this is doing is generating a cmake warning
---
 hipamd/hipify-clang/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hipamd/hipify-clang/CMakeLists.txt b/hipamd/hipify-clang/CMakeLists.txt
index 355b0cea7a..d309aa261c 100644
--- a/hipamd/hipify-clang/CMakeLists.txt
+++ b/hipamd/hipify-clang/CMakeLists.txt
@@ -84,7 +84,7 @@ if (HIPIFY_CLANG_TESTS)
     add_lit_testsuite(test-hipify "Running HIPify regression tests"
         ${CMAKE_SOURCE_DIR}/tests/hipify-clang
         PARAMS site_config=${CMAKE_CURRENT_BINARY_DIR}/tests/hipify-clang/lit.site.cfg
-        DEPENDS hipify-clang lit
+        DEPENDS hipify-clang
     )
 
     add_custom_target(test-hipify-clang)

From 9747578d093164d7e5325aa1ef79958879e17b35 Mon Sep 17 00:00:00 2001
From: Chris Kitching <chriskitching@linux.com>
Date: Wed, 18 Oct 2017 08:16:26 +0100
Subject: [PATCH 02/10] Propagate the CUDA toolkit directory into the lit tests

Allows the tests to actually run... :D
---
 hipamd/hipify-clang/CMakeLists.txt        | 5 ++++-
 hipamd/tests/hipify-clang/axpy.cu         | 2 +-
 hipamd/tests/hipify-clang/lit.cfg         | 2 ++
 hipamd/tests/hipify-clang/lit.site.cfg.in | 1 +
 4 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/hipamd/hipify-clang/CMakeLists.txt b/hipamd/hipify-clang/CMakeLists.txt
index d309aa261c..18939c0210 100644
--- a/hipamd/hipify-clang/CMakeLists.txt
+++ b/hipamd/hipify-clang/CMakeLists.txt
@@ -70,9 +70,12 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHIPIFY_CLANG_RES=\\\"${LLVM_LIBRARY_DI
 install(TARGETS hipify-clang DESTINATION bin)
 
 if (HIPIFY_CLANG_TESTS)
-    # tests
     find_package(PythonInterp 2.7 REQUIRED EXACT)
 
+    # Populates CUDA_TOOLKIT_ROOT_DIR, which is then applied to the lit config to give the
+    # value of --cuda-path for the test runs.
+    find_package(CUDA REQUIRED)
+
     set(BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 
     configure_file(
diff --git a/hipamd/tests/hipify-clang/axpy.cu b/hipamd/tests/hipify-clang/axpy.cu
index 8472e60209..56a854fd0a 100644
--- a/hipamd/tests/hipify-clang/axpy.cu
+++ b/hipamd/tests/hipify-clang/axpy.cu
@@ -1,4 +1,4 @@
-// RUN: hipify "%s" -o=%t --
+// RUN: hipify "%s" -o=%t -- %cuda_args
 
 #include <iostream>
 
diff --git a/hipamd/tests/hipify-clang/lit.cfg b/hipamd/tests/hipify-clang/lit.cfg
index c57b8ec524..57d9d876b5 100644
--- a/hipamd/tests/hipify-clang/lit.cfg
+++ b/hipamd/tests/hipify-clang/lit.cfg
@@ -46,3 +46,5 @@ if obj_root is not None:
 
 config.substitutions.append(("hipify", obj_root+"/hipify-clang"))
 
+# Clang args for CUDA...
+config.substitutions.append(("%cuda_args", "-x cuda --cuda-path=%s --cuda-gpu-arch=sm_30" % config.cuda_root))
diff --git a/hipamd/tests/hipify-clang/lit.site.cfg.in b/hipamd/tests/hipify-clang/lit.site.cfg.in
index 4511316ac7..4e05710065 100644
--- a/hipamd/tests/hipify-clang/lit.site.cfg.in
+++ b/hipamd/tests/hipify-clang/lit.site.cfg.in
@@ -2,6 +2,7 @@ import sys
 
 config.llvm_tools_dir = "@LLVM_TOOLS_BINARY_DIR@"
 config.obj_root = "@BINARY_DIR@"
+config.cuda_root = "@CUDA_TOOLKIT_ROOT_DIR@"
 
 # Support substitution of the tools and libs dirs with user parameters. This is
 # used when we can't determine the tool dir at configuration time.

From 3868036ea7c80f46e9f305775d1145b883b5d0bb Mon Sep 17 00:00:00 2001
From: Chris Kitching <chriskitching@linux.com>
Date: Wed, 18 Oct 2017 08:27:22 +0100
Subject: [PATCH 03/10] Look for FileCheck for running lit tests, too

Use of grep in `lit` RUN lines is deprecated:
https://llvm.org/docs/TestingGuide.html#writing-new-regression-tests

Using grep leads to really unhelpful failure output (it literally
just says "the test failed"). FileCheck is much more helpful, and
distributed with LLVM on most distros anyway, so this extra
dependency shouldn't prove problematic.
---
 hipamd/hipify-clang/CMakeLists.txt | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/hipamd/hipify-clang/CMakeLists.txt b/hipamd/hipify-clang/CMakeLists.txt
index 18939c0210..e7c46557f8 100644
--- a/hipamd/hipify-clang/CMakeLists.txt
+++ b/hipamd/hipify-clang/CMakeLists.txt
@@ -3,11 +3,12 @@ project(hipify-clang)
 
 option(HIPIFY_CLANG_TESTS "Build the tests for hipify-clang, if lit is installed" ON)
 
-# Disable the tests if `lit` is not installed.
+# Disable the tests if `lit` or `FileCheck` is not installed.
 find_program(LIT_COMMAND lit)
-if (NOT LIT_COMMAND)
+find_program(FILECHECK_COMMAND FileCheck)
+if (NOT LIT_COMMAND OR NOT FILECHECK_COMMAND)
     set(HIPIFY_CLANG_TESTS OFF CACHE INTERNAL "")
-    message(STATUS "hipify-clang's tests are not being built because `lit` is not installed.")
+    message(STATUS "hipify-clang's tests are not being built because `lit` or `FileCheck` could not be found.")
 endif()
 
 set(BUILD_HIPIFY_CLANG 0 CACHE INTERNAL "")

From 74fd64d5c132081b56301e07b1220a608d8d8ac0 Mon Sep 17 00:00:00 2001
From: Chris Kitching <chriskitching@linux.com>
Date: Wed, 18 Oct 2017 08:45:16 +0100
Subject: [PATCH 04/10] Migrate lit test to using FileCheck, so failures are
 readable

It seems the test is already broken, but look how awesome the
error message is now:

/home/chris/HIP/tests/hipify-clang/axpy.cu:31:12: error: expected string not found in input
 // CHECK: hipLaunchKernel(HIP_KERNEL_NAME(axpy), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
           ^
<stdin>:31:2: note: scanning from here
 //
 ^
<stdin>:33:2: note: possible intended match here
 hipLaunchKernelGGL(axpy, dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
 ^
---
 hipamd/tests/hipify-clang/axpy.cu | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/hipamd/tests/hipify-clang/axpy.cu b/hipamd/tests/hipify-clang/axpy.cu
index 56a854fd0a..e0afa02a26 100644
--- a/hipamd/tests/hipify-clang/axpy.cu
+++ b/hipamd/tests/hipify-clang/axpy.cu
@@ -1,9 +1,9 @@
-// RUN: hipify "%s" -o=%t -- %cuda_args
+// RUN: hipify "%s" -o=%t -- %cuda_args && cat %t | sed -Ee 's|//.+|// |g' | FileCheck %s
 
 #include <iostream>
 
 __global__ void axpy(float a, float* x, float* y) {
-  // RUN: sh -c "test `grep -c -F 'y[hipThreadIdx_x] = a * x[hipThreadIdx_x];' %t` -eq 2"
+  // CHECK: y[hipThreadIdx_x] = a * x[hipThreadIdx_x];
   y[threadIdx.x] = a * x[threadIdx.x];
 }
 
@@ -17,21 +17,25 @@ int main(int argc, char* argv[]) {
   // Copy input data to device.
   float* device_x;
   float* device_y;
-  // RUN: sh -c "test `grep -c -F 'hipMalloc(&device_x, kDataLen * sizeof(float));' %t` -eq 2"
+
+  // CHECK: hipMalloc(&device_x, kDataLen * sizeof(float));
   cudaMalloc(&device_x, kDataLen * sizeof(float));
-  // RUN: sh -c "test `grep -c -F 'hipMalloc(&device_y, kDataLen * sizeof(float));' %t` -eq 2"
+
+  // CHECK: hipMalloc(&device_y, kDataLen * sizeof(float));
   cudaMalloc(&device_y, kDataLen * sizeof(float));
-  // RUN: sh -c "test `grep -c -F 'hipMemcpy(device_x, host_x, kDataLen * sizeof(float), hipMemcpyHostToDevice);' %t` -eq 2"
+
+  // CHECK: hipMemcpy(device_x, host_x, kDataLen * sizeof(float), hipMemcpyHostToDevice);
   cudaMemcpy(device_x, host_x, kDataLen * sizeof(float), cudaMemcpyHostToDevice);
 
   // Launch the kernel.
-  // RUN: sh -c "test `grep -c -F 'hipLaunchKernel(HIP_KERNEL_NAME(axpy), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);' %t` -eq 2"
+  // CHECK: hipLaunchKernel(HIP_KERNEL_NAME(axpy), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
   axpy<<<1, kDataLen>>>(a, device_x, device_y);
 
   // Copy output data to host.
-  // RUN: sh -c "test `grep -c -F 'hipDeviceSynchronize();' %t` -eq 2"
+  // CHECK: hipDeviceSynchronize();
   cudaDeviceSynchronize();
-  // RUN: sh -c "test `grep -c -F 'hipMemcpy(host_y, device_y, kDataLen * sizeof(float), hipMemcpyDeviceToHost);' %t` -eq 2"
+
+  // CHECK: hipMemcpy(host_y, device_y, kDataLen * sizeof(float), hipMemcpyDeviceToHost);
   cudaMemcpy(host_y, device_y, kDataLen * sizeof(float), cudaMemcpyDeviceToHost);
 
   // Print the results.
@@ -39,7 +43,7 @@ int main(int argc, char* argv[]) {
     std::cout << "y[" << i << "] = " << host_y[i] << "\n";
   }
 
-  // RUN: sh -c "test `grep -c -F 'hipDeviceReset();' %t` -eq 2"
+  // CHECK: hipDeviceReset();
   cudaDeviceReset();
   return 0;
 }

From 5912f465bdc9bed19e3e6dbf7872e73de7c60383 Mon Sep 17 00:00:00 2001
From: Chris Kitching <chriskitching@linux.com>
Date: Wed, 18 Oct 2017 08:46:36 +0100
Subject: [PATCH 05/10] Adapt `lit` test for the hipLaunchKernelGGL changes
 from before...

---
 hipamd/tests/hipify-clang/axpy.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hipamd/tests/hipify-clang/axpy.cu b/hipamd/tests/hipify-clang/axpy.cu
index e0afa02a26..689c6e96ff 100644
--- a/hipamd/tests/hipify-clang/axpy.cu
+++ b/hipamd/tests/hipify-clang/axpy.cu
@@ -28,7 +28,7 @@ int main(int argc, char* argv[]) {
   cudaMemcpy(device_x, host_x, kDataLen * sizeof(float), cudaMemcpyHostToDevice);
 
   // Launch the kernel.
-  // CHECK: hipLaunchKernel(HIP_KERNEL_NAME(axpy), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
+  // CHECK: hipLaunchKernelGGL(axpy, dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
   axpy<<<1, kDataLen>>>(a, device_x, device_y);
 
   // Copy output data to host.

From c99dcbba8d5e31c64098bb707f892d193e4da322 Mon Sep 17 00:00:00 2001
From: Chris Kitching <chriskitching@linux.com>
Date: Wed, 18 Oct 2017 13:50:00 +0100
Subject: [PATCH 06/10] Introduce a test runner script to simplify invocation

... And to use a standard, highly amusing trick for making
coloured output work.
---
 hipamd/hipify-clang/CMakeLists.txt    |  6 ++++--
 hipamd/tests/hipify-clang/axpy.cu     |  2 +-
 hipamd/tests/hipify-clang/lit.cfg     |  1 +
 hipamd/tests/hipify-clang/run_test.sh | 28 +++++++++++++++++++++++++++
 4 files changed, 34 insertions(+), 3 deletions(-)
 create mode 100755 hipamd/tests/hipify-clang/run_test.sh

diff --git a/hipamd/hipify-clang/CMakeLists.txt b/hipamd/hipify-clang/CMakeLists.txt
index e7c46557f8..2ec96e51a8 100644
--- a/hipamd/hipify-clang/CMakeLists.txt
+++ b/hipamd/hipify-clang/CMakeLists.txt
@@ -6,9 +6,10 @@ option(HIPIFY_CLANG_TESTS "Build the tests for hipify-clang, if lit is installed
 # Disable the tests if `lit` or `FileCheck` is not installed.
 find_program(LIT_COMMAND lit)
 find_program(FILECHECK_COMMAND FileCheck)
-if (NOT LIT_COMMAND OR NOT FILECHECK_COMMAND)
+find_program(SOCAT_COMMAND socat)
+if (NOT LIT_COMMAND OR NOT FILECHECK_COMMAND OR NOT SOCAT_COMMAND)
     set(HIPIFY_CLANG_TESTS OFF CACHE INTERNAL "")
-    message(STATUS "hipify-clang's tests are not being built because `lit` or `FileCheck` could not be found.")
+    message(STATUS "hipify-clang's tests are not being built because `lit`,`FileCheck` or `socat` could not be found.")
 endif()
 
 set(BUILD_HIPIFY_CLANG 0 CACHE INTERNAL "")
@@ -88,6 +89,7 @@ if (HIPIFY_CLANG_TESTS)
     add_lit_testsuite(test-hipify "Running HIPify regression tests"
         ${CMAKE_SOURCE_DIR}/tests/hipify-clang
         PARAMS site_config=${CMAKE_CURRENT_BINARY_DIR}/tests/hipify-clang/lit.site.cfg
+        ARGS -v
         DEPENDS hipify-clang
     )
 
diff --git a/hipamd/tests/hipify-clang/axpy.cu b/hipamd/tests/hipify-clang/axpy.cu
index 689c6e96ff..8c6b0e0d8d 100644
--- a/hipamd/tests/hipify-clang/axpy.cu
+++ b/hipamd/tests/hipify-clang/axpy.cu
@@ -1,4 +1,4 @@
-// RUN: hipify "%s" -o=%t -- %cuda_args && cat %t | sed -Ee 's|//.+|// |g' | FileCheck %s
+// RUN: %run_test hipify "%s" "%t" %cuda_args
 
 #include <iostream>
 
diff --git a/hipamd/tests/hipify-clang/lit.cfg b/hipamd/tests/hipify-clang/lit.cfg
index 57d9d876b5..bb6ac2b407 100644
--- a/hipamd/tests/hipify-clang/lit.cfg
+++ b/hipamd/tests/hipify-clang/lit.cfg
@@ -48,3 +48,4 @@ config.substitutions.append(("hipify", obj_root+"/hipify-clang"))
 
 # Clang args for CUDA...
 config.substitutions.append(("%cuda_args", "-x cuda --cuda-path=%s --cuda-gpu-arch=sm_30" % config.cuda_root))
+config.substitutions.append(("%run_test", config.test_source_root + "/run_test.sh"))
diff --git a/hipamd/tests/hipify-clang/run_test.sh b/hipamd/tests/hipify-clang/run_test.sh
new file mode 100755
index 0000000000..46b2fc066b
--- /dev/null
+++ b/hipamd/tests/hipify-clang/run_test.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+set -o errexit
+
+# Run a single LIT test file in a magical way that preserves colour output, to work around
+# a known flaw in lit.
+
+# Capture lit substitutions
+HIPIFY=$1
+IN_FILE=$2
+TMP_FILE=$3
+shift 3
+
+# Remaining args are the ones to forward to clang proper.
+
+# Time for the classic insane little trick for making colour output work.
+# A self-deleting shell-script that does the thing we want to do...
+TMP_SCRIPT=$(mktemp)
+cat << EOF > $TMP_SCRIPT
+set -o errexit
+set -o xtrace
+rm $TMP_SCRIPT
+$HIPIFY -o=$TMP_FILE $IN_FILE -- $@ && cat $TMP_FILE | sed -Ee 's|//.+|// |g' | FileCheck $IN_FILE
+EOF
+chmod a+x $TMP_SCRIPT
+
+# Run the script via socat, spawning a virtual terminal and propagating exit code, and hence failure.
+socat -du EXEC:$TMP_SCRIPT,pty,stderr STDOUT

From ead79e5bf4e54f0f7b1fda3482c2d1b0a36d8022 Mon Sep 17 00:00:00 2001
From: Chris Kitching <chriskitching@linux.com>
Date: Wed, 18 Oct 2017 13:58:34 +0100
Subject: [PATCH 07/10] Add square.cu to lit testsuite

---
 hipamd/tests/hipify-clang/square.cu | 114 ++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 hipamd/tests/hipify-clang/square.cu

diff --git a/hipamd/tests/hipify-clang/square.cu b/hipamd/tests/hipify-clang/square.cu
new file mode 100644
index 0000000000..e0c72094a8
--- /dev/null
+++ b/hipamd/tests/hipify-clang/square.cu
@@ -0,0 +1,114 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include <stdio.h>
+#include <cuda_runtime.h>
+
+#define CHECK(cmd) \
+{\
+    cudaError_t error  = cmd;\
+    if (error != cudaSuccess) { \
+        fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \
+        exit(EXIT_FAILURE);\
+	  }\
+}
+
+
+/*
+ * Square each element in the array A and write to array C.
+ */
+template <typename T>
+__global__ void
+vector_square(T *C_d, const T *A_d, size_t N)
+{
+    // CHECK: size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    // CHECK: size_t stride = hipBlockDim_x * hipGridDim_x;
+    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x;
+
+    for (size_t i=offset; i<N; i+=stride) {
+        C_d[i] = A_d[i] * A_d[i];
+    }
+}
+
+
+int main(int argc, char *argv[])
+{
+    float *A_d, *C_d;
+    float *A_h, *C_h;
+    size_t N = 1000000;
+    size_t Nbytes = N * sizeof(float);
+
+    // CHECK: hipDeviceProp_t props;
+    cudaDeviceProp props;
+
+    // CHECK: CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/));
+    CHECK(cudaGetDeviceProperties(&props, 0/*deviceID*/));
+    printf ("info: running on device %s\n", props.name);
+
+    printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    A_h = (float*)malloc(Nbytes);
+
+    // CHECK: CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+    CHECK(A_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess );
+    C_h = (float*)malloc(Nbytes);
+
+    // CHECK: CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+    CHECK(C_h == 0 ? cudaErrorMemoryAllocation : cudaSuccess );
+    // Fill with Phi + i
+    for (size_t i=0; i<N; i++)
+    {
+        A_h[i] = 1.618f + i;
+    }
+
+    printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+    // CHECK: CHECK(hipMalloc(&A_d, Nbytes));
+    // CHECK: CHECK(hipMalloc(&C_d, Nbytes));
+    CHECK(cudaMalloc(&A_d, Nbytes));
+    CHECK(cudaMalloc(&C_d, Nbytes));
+
+
+    printf ("info: copy Host2Device\n");
+    // CHECK: CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
+    CHECK ( cudaMemcpy(A_d, A_h, Nbytes, cudaMemcpyHostToDevice));
+
+    const unsigned blocks = 512;
+    const unsigned threadsPerBlock = 256;
+
+    printf ("info: launch 'vector_square' kernel\n");
+    // CHECK: hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
+    vector_square <<<blocks, threadsPerBlock>>> (C_d, A_d, N);
+
+    printf ("info: copy Device2Host\n");
+    // CHECK: CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
+    CHECK ( cudaMemcpy(C_h, C_d, Nbytes, cudaMemcpyDeviceToHost));
+
+    printf ("info: check result\n");
+    for (size_t i=0; i<N; i++)  {
+        if (C_h[i] != A_h[i] * A_h[i]) {
+            // CHECK: CHECK(hipErrorUnknown);
+            CHECK(cudaErrorUnknown);
+        }
+    }
+    printf ("PASSED!\n");
+}

From 2437b319398daa36d32b2e083a39e6c3cd30fe00 Mon Sep 17 00:00:00 2001
From: Chris Kitching <chriskitching@linux.com>
Date: Wed, 18 Oct 2017 14:05:46 +0100
Subject: [PATCH 08/10] Add cudaRegister.cu lit test

---
 hipamd/tests/hipify-clang/cudaRegister.cu | 111 ++++++++++++++++++++++
 1 file changed, 111 insertions(+)
 create mode 100644 hipamd/tests/hipify-clang/cudaRegister.cu

diff --git a/hipamd/tests/hipify-clang/cudaRegister.cu b/hipamd/tests/hipify-clang/cudaRegister.cu
new file mode 100644
index 0000000000..80d17f65b9
--- /dev/null
+++ b/hipamd/tests/hipify-clang/cudaRegister.cu
@@ -0,0 +1,111 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<cuda.h>
+#include<cuda_runtime.h>
+#include<iostream>
+#include<unistd.h>
+#include<stdio.h>
+#include<malloc.h>
+
+#define LEN 1024
+#define SIZE LEN * sizeof(float)
+#define ITER 1024*1024
+
+// CHECK: if(status != hipSuccess) {
+#define check(msg, status){ \
+if(status != cudaSuccess) { \
+	printf("%s failed. \n", #msg); \
+} \
+}
+
+__global__ void Inc1(float *Ad, float *Bd){
+  // CHECK: int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+	int tx = threadIdx.x + blockIdx.x * blockDim.x;
+	if(tx < 1 ){
+		for(int i=0;i<ITER;i++){
+			Ad[tx] = Ad[tx] + 1.0f;
+			for(int j=0;j<256;j++){
+				Bd[tx] = Ad[tx];
+			}
+		}
+	}
+}
+
+__global__ void Inc2(float *Ad, float *Bd){
+  // CHECK: int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+	int tx = threadIdx.x + blockIdx.x * blockDim.x;
+	if(tx < 1024){
+		for(int i=0;i<ITER;i++){
+			Ad[tx] = Ad[tx] + 1.0f;
+			for(int j=0;j<256;j++){
+				Bd[tx] = Ad[tx];
+			}
+		}
+	}
+}
+
+int main(){
+	float *A, *Ad, *Bd;
+	A = new float[LEN];
+	for(int i=0;i<LEN;i++){
+		A[i] = 0.0f;
+	}
+
+	// CHECK: hipError_t status;
+	cudaError_t status;
+
+  // CHECK: status = hipHostRegister(A, SIZE, hipHostRegisterMapped);
+	status = cudaHostRegister(A, SIZE, cudaHostRegisterMapped);
+	check("Registering A",status);
+
+  // CHECK: hipHostGetDevicePointer(&Ad, A, 0);
+	cudaHostGetDevicePointer(&Ad, A, 0);
+
+  // CHECK: hipMalloc((void**) &Bd, SIZE);
+	cudaMalloc((void**) &Bd, SIZE);
+	dim3 dimGrid(LEN/512,1,1);
+	dim3 dimBlock(512,1,1);
+
+  // CHECK: hipLaunchKernelGGL(Inc1, dim3(dimGrid), dim3(dimBlock), 0, 0, Ad, Bd);
+	Inc1<<<dimGrid, dimBlock>>>(Ad, Bd);
+	sleep(3);
+	A[0] = -(ITER*1.0f);
+	std::cout<<"Same cache line before completion: \t"<< A[0]<<std::endl;
+
+  // CHECK: hipDeviceSynchronize();
+	cudaDeviceSynchronize();
+	std::cout<<"Same cache line after completion: \t"<< A[0]<<std::endl;
+
+	for(int i=0;i<LEN;i++){
+		A[i] = 0.0f;
+	}
+
+  // CHECK: hipLaunchKernelGGL(Inc2, dim3(dimGrid), dim3(dimBlock), 0, 0, Ad, Bd);
+	Inc2<<<dimGrid, dimBlock>>>(Ad, Bd);
+	sleep(3);
+	A[0] = -(ITER*1.0f);
+	std::cout<<"Diff cache line before completion: \t"<<A[0]<<std::endl;
+
+  // CHECK: hipDeviceSynchronize();
+	cudaDeviceSynchronize();
+	std::cout<<"Diff cache line after completion: \t"<<A[0]<<std::endl;
+}

From 64d5f07050165e15325a3791540b10efad6cce43 Mon Sep 17 00:00:00 2001
From: Chris Kitching <chriskitching@linux.com>
Date: Wed, 18 Oct 2017 20:41:23 +0100
Subject: [PATCH 09/10] Add the CUDA samples include dir to the path for tests

Means we get to easily steal CUDA examples for tests
---
 hipamd/tests/hipify-clang/lit.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hipamd/tests/hipify-clang/lit.cfg b/hipamd/tests/hipify-clang/lit.cfg
index bb6ac2b407..76b2ca08dc 100644
--- a/hipamd/tests/hipify-clang/lit.cfg
+++ b/hipamd/tests/hipify-clang/lit.cfg
@@ -47,5 +47,5 @@ if obj_root is not None:
 config.substitutions.append(("hipify", obj_root+"/hipify-clang"))
 
 # Clang args for CUDA...
-config.substitutions.append(("%cuda_args", "-x cuda --cuda-path=%s --cuda-gpu-arch=sm_30" % config.cuda_root))
+config.substitutions.append(("%cuda_args", "-x cuda --cuda-path=%s --cuda-gpu-arch=sm_30 -isystem%s/samples/common/inc" % (config.cuda_root, config.cuda_root)))
 config.substitutions.append(("%run_test", config.test_source_root + "/run_test.sh"))

From 2c65d0da37540e27d67cdd2b357f5d5e5a0aa22e Mon Sep 17 00:00:00 2001
From: Chris Kitching <chriskitching@linux.com>
Date: Wed, 18 Oct 2017 21:12:35 +0100
Subject: [PATCH 10/10] Add concurentKernels.cu to the testsuite

---
 hipamd/tests/hipify-clang/concurentKernels.cu | 240 ++++++++++++++++++
 1 file changed, 240 insertions(+)
 create mode 100644 hipamd/tests/hipify-clang/concurentKernels.cu

diff --git a/hipamd/tests/hipify-clang/concurentKernels.cu b/hipamd/tests/hipify-clang/concurentKernels.cu
new file mode 100644
index 0000000000..e369baaf3e
--- /dev/null
+++ b/hipamd/tests/hipify-clang/concurentKernels.cu
@@ -0,0 +1,240 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+/*
+ * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+//
+// This sample demonstrates the use of streams for concurrent execution. It also illustrates how to
+// introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function introduced
+// in CUDA 3.2.
+//
+// Devices of compute capability 1.x will run the kernels one after another
+// Devices of compute capability 2.0 or higher can overlap the kernels
+//
+#include <stdio.h>
+#include <helper_functions.h>
+#include <helper_cuda.h>
+
+// This is a kernel that does no real work but runs at least for a specified number of clocks
+__global__ void clock_block(clock_t *d_o, clock_t clock_count)
+{
+    unsigned int start_clock = (unsigned int) clock();
+
+    clock_t clock_offset = 0;
+
+    while (clock_offset < clock_count)
+    {
+        unsigned int end_clock = (unsigned int) clock();
+
+        // The code below should work like
+        // this (thanks to modular arithmetics):
+        //
+        // clock_offset = (clock_t) (end_clock > start_clock ?
+        //                           end_clock - start_clock :
+        //                           end_clock + (0xffffffffu - start_clock));
+        //
+        // Indeed, let m = 2^32 then
+        // end - start = end + m - start (mod m).
+
+        clock_offset = (clock_t)(end_clock - start_clock);
+    }
+
+    d_o[0] = clock_offset;
+}
+
+
+// Single warp reduction kernel
+__global__ void sum(clock_t *d_clocks, int N)
+{
+    __shared__ clock_t s_clocks[32];
+
+    clock_t my_sum = 0;
+
+    for (int i = threadIdx.x; i < N; i+= blockDim.x)
+    {
+        my_sum += d_clocks[i];
+    }
+
+    s_clocks[threadIdx.x] = my_sum;
+    syncthreads();
+
+    for (int i=16; i>0; i/=2)
+    {
+        if (threadIdx.x < i)
+        {
+            s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
+        }
+
+        syncthreads();
+    }
+
+    d_clocks[0] = s_clocks[0];
+}
+
+int main(int argc, char **argv)
+{
+    int nkernels = 8;               // number of concurrent kernels
+    int nstreams = nkernels + 1;    // use one more stream than concurrent kernel
+    int nbytes = nkernels * sizeof(clock_t);   // number of data bytes
+    float kernel_time = 10; // time the kernel should run in ms
+    float elapsed_time;   // timing variables
+    int cuda_device = 0;
+
+    printf("[%s] - Starting...\n", argv[0]);
+
+    // get number of kernels if overridden on the command line
+    if (checkCmdLineFlag(argc, (const char **)argv, "nkernels"))
+    {
+        nkernels = getCmdLineArgumentInt(argc, (const char **)argv, "nkernels");
+        nstreams = nkernels + 1;
+    }
+
+    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
+    cuda_device = findCudaDevice(argc, (const char **)argv);
+
+    cudaDeviceProp deviceProp;
+    // CHECK: checkCudaErrors(hipGetDevice(&cuda_device));
+    checkCudaErrors(cudaGetDevice(&cuda_device));
+
+    // CHECK: checkCudaErrors(hipGetDeviceProperties(&deviceProp, cuda_device));
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
+
+    if ((deviceProp.concurrentKernels == 0))
+    {
+        printf("> GPU does not support concurrent kernel execution\n");
+        printf("  CUDA kernel runs will be serialized\n");
+    }
+
+    printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
+           deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
+
+    // allocate host memory
+    clock_t *a = 0;                     // pointer to the array data in host memory
+    // CHECK: checkCudaErrors(hipHostMalloc((void **)&a, nbytes));
+    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
+
+    // allocate device memory
+    clock_t *d_a = 0;             // pointers to data and init value in the device memory
+    // CHECK: checkCudaErrors(hipMalloc((void **)&d_a, nbytes));
+    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
+
+    // CHECK: hipStream_t *streams = (hipStream_t *) malloc(nstreams * sizeof(hipStream_t));
+    // allocate and initialize an array of stream handles
+    cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));
+
+    for (int i = 0; i < nstreams; i++)
+    {
+        // CHECK: checkCudaErrors(hipStreamCreate(&(streams[i])));
+        checkCudaErrors(cudaStreamCreate(&(streams[i])));
+    }
+
+    // create CUDA event handles
+    cudaEvent_t start_event, stop_event;
+
+    // CHECK: checkCudaErrors(hipEventCreate(&start_event));
+    // CHECK: checkCudaErrors(hipEventCreate(&stop_event));
+    checkCudaErrors(cudaEventCreate(&start_event));
+    checkCudaErrors(cudaEventCreate(&stop_event));
+
+    // the events are used for synchronization only and hence do not need to record timings
+    // this also makes events not introduce global sync points when recorded which is critical to get overlap
+
+    // CHECK: hipEvent_t *kernelEvent;
+    // CHECK: kernelEvent = (hipEvent_t *) malloc(nkernels * sizeof(hipEvent_t));
+    cudaEvent_t *kernelEvent;
+    kernelEvent = (cudaEvent_t *) malloc(nkernels * sizeof(cudaEvent_t));
+
+    for (int i = 0; i < nkernels; i++)
+    {
+        // CHECK: checkCudaErrors(hipEventCreateWithFlags(&(kernelEvent[i]), hipEventDisableTiming));
+        checkCudaErrors(cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
+    }
+
+    //////////////////////////////////////////////////////////////////////
+    // time execution with nkernels streams
+    clock_t total_clocks = 0;
+#if defined(__arm__) || defined(__aarch64__)
+    // the kernel takes more time than the channel reset time on arm archs, so to prevent hangs reduce time_clocks.
+    clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 1000));
+#else
+    clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
+#endif
+
+    // CHECK: hipEventRecord(start_event, 0);
+    cudaEventRecord(start_event, 0);
+
+    // queue nkernels in separate streams and record when they are done
+    for (int i=0; i<nkernels; ++i)
+    {
+        // CHECK: hipLaunchKernelGGL(clock_block, dim3(1), dim3(1), 0, streams[i], &d_a[i], time_clocks);
+        clock_block<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
+        total_clocks += time_clocks;
+
+        // CHECK: checkCudaErrors(hipEventRecord(kernelEvent[i], streams[i]));
+        checkCudaErrors(cudaEventRecord(kernelEvent[i], streams[i]));
+
+        // make the last stream wait for the kernel event to be recorded
+        // CHECK: checkCudaErrors(hipStreamWaitEvent(streams[nstreams-1], kernelEvent[i],0));
+        checkCudaErrors(cudaStreamWaitEvent(streams[nstreams-1], kernelEvent[i],0));
+    }
+
+    // queue a sum kernel and a copy back to host in the last stream.
+    // the commands in this stream get dispatched as soon as all the kernel events have been recorded
+    // CHECK: hipLaunchKernelGGL(sum, dim3(1), dim3(32), 0, streams[nstreams-1], d_a, nkernels);
+    // CHECK: checkCudaErrors(hipMemcpyAsync(a, d_a, sizeof(clock_t), hipMemcpyDeviceToHost, streams[nstreams-1]));
+    sum<<<1,32,0,streams[nstreams-1]>>>(d_a, nkernels);
+    checkCudaErrors(cudaMemcpyAsync(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost, streams[nstreams-1]));
+
+    // at this point the CPU has dispatched all work for the GPU and can continue processing other tasks in parallel
+
+    // in this sample we just wait until the GPU is done
+    // CHECK: checkCudaErrors(hipEventRecord(stop_event, 0));
+    // CHECK: checkCudaErrors(hipEventSynchronize(stop_event));
+    // CHECK: checkCudaErrors(hipEventElapsedTime(&elapsed_time, start_event, stop_event));
+    checkCudaErrors(cudaEventRecord(stop_event, 0));
+    checkCudaErrors(cudaEventSynchronize(stop_event));
+    checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
+
+    printf("Expected time for serial execution of %d kernels = %.3fs\n", nkernels, nkernels * kernel_time/1000.0f);
+    printf("Expected time for concurrent execution of %d kernels = %.3fs\n", nkernels, kernel_time/1000.0f);
+    printf("Measured time for sample = %.3fs\n", elapsed_time/1000.0f);
+
+    bool bTestResult  = (a[0] > total_clocks);
+
+    // release resources
+    for (int i = 0; i < nkernels; i++)
+    {
+        // CHECK: hipStreamDestroy(streams[i]);
+        // CHECK: hipEventDestroy(kernelEvent[i]);
+        cudaStreamDestroy(streams[i]);
+        cudaEventDestroy(kernelEvent[i]);
+    }
+
+    free(streams);
+    free(kernelEvent);
+
+    // CHECK: hipEventDestroy(start_event);
+    // CHECK: hipEventDestroy(stop_event);
+    // CHECK: hipHostFree(a);
+    // CHECK: hipFree(d_a);
+    cudaEventDestroy(start_event);
+    cudaEventDestroy(stop_event);
+    cudaFreeHost(a);
+    cudaFree(d_a);
+
+    if (!bTestResult)
+    {
+        printf("Test failed!\n");
+        exit(EXIT_FAILURE);
+    }
+
+    printf("Test passed\n");
+    exit(EXIT_SUCCESS);
+}