From cebb070d30b754bc0011754228302fdbcf3598f7 Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Tue, 19 Dec 2017 16:06:14 +0530
Subject: [PATCH 01/24] Implement hipStreamAddCallback

Change-Id: Ib851e4d86ba9c8406ca37b88162ea483ccbc9d36
---
 src/hip_hcc.cpp                               | 30 +++++++++
 src/hip_hcc_internal.h                        | 20 ++++++
 src/hip_stream.cpp                            | 36 +++++-----
 .../stream/hipStreamAddCallback.cpp           | 66 +++++++++++++------
 4 files changed, 114 insertions(+), 38 deletions(-)
diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index f15a0eb1d8..0ca170152b 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -37,6 +37,7 @@ THE SOFTWARE.
 #include <vector>
 #include <algorithm>
 #include <atomic>
+#include <mutex>
 
 #include <hc.hpp>
 #include <hc_am.hpp>
@@ -1409,9 +1410,38 @@ void ihipInit()
     tprintf(DB_SYNC, "pid=%u %-30s g_numLogicalThreads=%u\n", getpid(), "<ihipInit>", g_numLogicalThreads);
 }
 
+hipError_t ihipStreamSynchronize(hipStream_t stream)
+{
+    hipError_t e = hipSuccess;
 
+    if (stream == hipStreamNull) {
+        ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
+        ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true/*syncToHost*/);
+    } else {
+        // note this does not synchornize with the NULL stream:
+        stream->locked_wait();
+        e = hipSuccess;
+    }
 
+    return e;
+}
 
+void ihipStreamCallbackHandler(ihipStreamCallback_t *cb)
+{
+    hipError_t e = hipSuccess;
+
+    // Notify hipStreamAddCallback that callback handler thread is active
+    std::lock_guard<std::mutex> guard(cb->_mtx);
+    cb->_ready = true;
+
+    // Synchronize stream
+    tprintf(DB_SYNC, "ihipStreamCallbackHandler wait on stream %s\n", ToString(cb->_stream).c_str());
+    e = ihipStreamSynchronize(cb->_stream);
+
+    // Call registered callback function
+    cb->_callback(cb->_stream, e, cb->_userData);
+    delete cb;
+}
 
 //---
 // Get the stream to use for a command submission.
diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h
index 4891f54fee..601b66f343 100644
--- a/src/hip_hcc_internal.h
+++ b/src/hip_hcc_internal.h
@@ -622,6 +622,24 @@ private: // Data
 };
 
 
+//----
+// Internal structure for stream callback handler
+class ihipStreamCallback_t {
+public:
+    ihipStreamCallback_t(hipStream_t stream, hipStreamCallback_t callback, void *userData) :
+        _stream(stream),
+        _callback(callback),
+        _userData(userData)
+        {
+            _ready = false;
+        };
+    hipStream_t                 _stream;
+    hipStreamCallback_t         _callback;
+    void*                       _userData;
+    std::mutex                  _mtx;
+    bool                        _ready;
+};
+
 
 //----
 // Internal event structure:
@@ -931,6 +949,8 @@ ihipCtx_t * ihipGetPrimaryCtx(unsigned deviceIndex);
 
 
 hipStream_t ihipSyncAndResolveStream(hipStream_t);
+hipError_t ihipStreamSynchronize(hipStream_t stream);
+void ihipStreamCallbackHandler(ihipStreamCallback_t *cb);
 
 // Stream printf functions:
 inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s)
diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp
index dab31dad62..94fc436b75 100644
--- a/src/hip_stream.cpp
+++ b/src/hip_stream.cpp
@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
+#include <thread>
+#include <mutex>
 #include "hip/hip_runtime.h"
 #include "hip_hcc_internal.h"
 #include "trace_helper.h"
@@ -147,20 +149,8 @@ hipError_t hipStreamSynchronize(hipStream_t stream)
 {
     HIP_INIT_SPECIAL_API(TRACE_SYNC, stream);
 
-    hipError_t e = hipSuccess;
-
-    if (stream == hipStreamNull) {
-        ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
-        ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true/*syncToHost*/);
-    } else {
-		// note this does not synchornize with the NULL stream:
-        stream->locked_wait();
-        e = hipSuccess;
-    }
-
-
-    return ihipLogStatus(e);
-};
+    return ihipLogStatus(ihipStreamSynchronize(stream));
+}
 
 
 //---
@@ -216,8 +206,20 @@ hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback
 {
     HIP_INIT_API(stream, callback, userData, flags);
     hipError_t e = hipSuccess;
-    //--- explicitly synchronize stream to add callback routines
-    hipStreamSynchronize(stream);
-    callback(stream, e, userData);
+
+    // Create a thread in detached mode to handle callback
+    ihipStreamCallback_t *cb = new ihipStreamCallback_t(stream, callback, userData);
+    std::thread (ihipStreamCallbackHandler, cb).detach();
+
+    // Wait for thread to be ready
+    cb->_mtx.lock();
+    while(cb->_ready != true)
+    {
+        cb->_mtx.unlock();
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+        cb->_mtx.lock();
+    }
+    cb->_mtx.unlock();
+
     return ihipLogStatus(e);
 }
diff --git a/tests/src/runtimeApi/stream/hipStreamAddCallback.cpp b/tests/src/runtimeApi/stream/hipStreamAddCallback.cpp
index 32a2793479..692d090509 100644
--- a/tests/src/runtimeApi/stream/hipStreamAddCallback.cpp
+++ b/tests/src/runtimeApi/stream/hipStreamAddCallback.cpp
@@ -23,8 +23,7 @@ THE SOFTWARE.
  * HIT_END
  */
 
-// Test under-development. Call hipStreamAddCallback function and see if it works as expected.
-
+#include <stdio.h>
 #include "hip/hip_runtime.h"
 #include "test_common.h"
 
@@ -32,32 +31,57 @@ THE SOFTWARE.
 #define HIPRT_CB
 #endif
 
-class CallbackClass
+__global__ void vector_square(float *C_d, float *A_d, size_t N)
 {
-public:
-    static void HIPRT_CB Callback(hipStream_t stream, hipError_t status, void *userData);
+    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x ;
 
-private:
-    void callbackFunc(hipError_t status);
-};
-
-void HIPRT_CB CallbackClass::Callback(hipStream_t stream, hipError_t status, void *userData)
-{
-    CallbackClass* obj = (CallbackClass*) userData;
-    obj->callbackFunc(status);
+    for (size_t i=offset; i<N; i+=stride) {
+        C_d[i] = A_d[i] * A_d[i];
+    }
 }
 
-void CallbackClass::callbackFunc(hipError_t status)
+float *A_h, *C_h;
+size_t N = 1000000;
+
+static void HIPRT_CB Callback(hipStream_t stream, hipError_t status, void *userData)
 {
-     HIPASSERT(status==hipSuccess);
+    for (size_t i=0; i<N; i++) {
+        if (C_h[i] != A_h[i] * A_h[i]) {
+            HIPCHECK(hipErrorUnknown);
+        }
+    }
+    printf ("PASSED!\n");
 }
 
-int main(){
+int main(int argc, char *argv[])
+{
+    float *A_d, *C_d;
+    size_t Nbytes = N * sizeof(float);
+
+    A_h = (float*)malloc(Nbytes);
+    HIPCHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+    C_h = (float*)malloc(Nbytes);
+    HIPCHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+
+    // Fill with Phi + i
+    for (size_t i=0; i<N; i++) 
+    {
+        A_h[i] = 1.618f + i; 
+    }
+
+    HIPCHECK(hipMalloc(&A_d, Nbytes));
+    HIPCHECK(hipMalloc(&C_d, Nbytes));
+
     hipStream_t mystream;
-    HIPCHECK(hipStreamCreate(&mystream));
-    CallbackClass* obj = new CallbackClass;
-    HIPCHECK(hipStreamAddCallback(mystream, CallbackClass::Callback, obj, 0));
-    HIPCHECK(hipStreamAddCallback(NULL, CallbackClass::Callback, obj, 0));
+    HIPCHECK(hipStreamCreateWithFlags(&mystream, hipStreamNonBlocking));
 
-	passed();
+    HIPCHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, mystream));
+
+    const unsigned blocks = 512;
+    const unsigned threadsPerBlock = 256;
+    hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0, mystream, C_d, A_d, N);
+
+    HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
+    HIPCHECK(hipStreamAddCallback(mystream, Callback, NULL, 0));
 }

From d9572d7fe7bb3ee556162138d46af09a6024e60e Mon Sep 17 00:00:00 2001
From: Peng <peng.sun@amd.com>
Date: Sat, 13 Jan 2018 12:33:27 -0600
Subject: [PATCH 02/24] Update hip_debugging.md on typo of chicken bits

---
 docs/markdown/hip_debugging.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/markdown/hip_debugging.md b/docs/markdown/hip_debugging.md
index e7e058d17a..bf877d894e 100644
--- a/docs/markdown/hip_debugging.md
+++ b/docs/markdown/hip_debugging.md
@@ -74,7 +74,7 @@ HIP provides 3 environment variables in the HIP_*_BLOCKING family.  These introd
 - HIP_API_BLOCKING : Forces hipMemcpyAsync and hipMemsetAsync to be host-synchronous, meaning they will wait for the requested operation to complete before returning to the caller.
 
 These options cause HCC to serialize.  Useful if you have libraries or code which is calling HCC kernels directly rather than using HIP.  
-- HCC_SERIALZIE_KERNELS : 0x1=pre-serialize before each kernel launch, 0x2=post-serialize after each kernel launch., 0x3= pre- and post- serialize.
+- HCC_SERIALIZE_KERNEL : 0x1=pre-serialize before each kernel launch, 0x2=post-serialize after each kernel launch., 0x3= pre- and post- serialize.
 - HCC_SERIALIZE_COPY    : 0x1=pre-serialize before each async copy, 0x2=post-serialize after each async copy., 0x3= pre- and post- serialize.
 
 - HSA_ENABLE_SDMA=0     : Causes host-to-device and device-to-host copies to use compute shader blit kernels rather than the dedicated DMA copy engines.  Compute shader copies have low latency (typically < 5us) and can achieve approximately 80% of the bandwidth of the DMA copy engine.  This flag is useful to isolate issues with the hardware copy engines.

From 487a430b5a3679ee3d0ee02e0ce6f22acd474ec0 Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Tue, 16 Jan 2018 11:44:19 +0530
Subject: [PATCH 03/24] Added support for -

- hipMemcpyFromArray
- hipMemcpyAtoH
- hipMemcpyHtoA
---
 include/hip/hcc_detail/hip_runtime_api.h  | 49 +++++++++++++++++++
 include/hip/nvcc_detail/hip_runtime_api.h | 12 +++++
 src/hip_memory.cpp                        | 59 +++++++++++++++++++++++
 3 files changed, 120 insertions(+)

diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h
index 9d0757f83a..7f159572d7 100644
--- a/include/hip/hcc_detail/hip_runtime_api.h
+++ b/include/hip/hcc_detail/hip_runtime_api.h
@@ -1420,7 +1420,56 @@ hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, con
 hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset,
                             const void* src, size_t count, hipMemcpyKind kind);
 
+/**
+ *  @brief Copies data between host and device.
+ *
+ *  @param[in]   dst       Destination memory address
+ *  @param[in]   srcArray  Source memory address
+ *  @param[in]   woffset   Source starting X offset
+ *  @param[in]   hOffset   Source starting Y offset
+ *  @param[in]   count     Size in bytes to copy
+ *  @param[in]   kind      Type of transfer
+ *  @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync
+ */
+hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, size_t hOffset,
+         size_t count, hipMemcpyKind kind);
 
+/**
+ *  @brief Copies data between host and device.
+ *
+ *  @param[in]   dst       Destination memory address
+ *  @param[in]   srcArray  Source array
+ *  @param[in]   srcoffset Offset in bytes of source array
+ *  @param[in]   count     Size of memory copy in bytes
+ *  @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync
+ */
+hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset, size_t count);
+
+/**
+ *  @brief Copies data between host and device.
+ *
+ *  @param[in]   dstArray   Destination memory address
+ *  @param[in]   dstOffset  Offset in bytes of destination array
+ *  @param[in]   srcHost    Source host pointer
+ *  @param[in]   count      Size of memory copy in bytes
+ *  @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync
+ */
+hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost, size_t count);
+
+/**
+ *  @brief Copies data between host and device.
+ *
+ *  @param[in]   p   3D memory copy parameters
+ *  @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync
+ */
 hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p);
 
 // doxygen end Memory
diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h
index dbd6d8b300..902e3620fa 100644
--- a/include/hip/nvcc_detail/hip_runtime_api.h
+++ b/include/hip/nvcc_detail/hip_runtime_api.h
@@ -483,6 +483,18 @@ inline static hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t
   return hipCUDAErrorTohipError(cudaMemcpyToArray(dst, wOffset, hOffset, src, count, hipMemcpyKindToCudaMemcpyKind(kind)));
 }
 
+inline static hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, size_t hOffset, size_t count, hipMemcpyKind kind) {
+  return hipCUDAErrorTohipError(cudaMemcpyFromArray(dst, srcArray, wOffset, hOffset, count, hipMemcpyKindToCudaMemcpyKind(kind)));
+}
+
+inline static hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset, size_t count) {
+  return hipCUResultTohipError(cuMemcpyAtoH(dst, (CUarray)srcArray, srcOffset, count));
+}
+
+inline static hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost, size_t count) {
+  return hipCUResultTohipError(cuMemcpyHtoA((CUarray)dstArray, dstOffset, srcHost, count));
+}
+
 inline static hipError_t hipDeviceSynchronize() {
     return hipCUDAErrorTohipError(cudaDeviceSynchronize());
 }
diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp
index 77526cf9ac..0fb13abeb6 100644
--- a/src/hip_memory.cpp
+++ b/src/hip_memory.cpp
@@ -1411,6 +1411,65 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset,
     return ihipLogStatus(e);
 }
 
+hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, size_t hOffset,
+         size_t count, hipMemcpyKind kind) {
+
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, srcArray, wOffset, hOffset, count, kind);
+
+    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+
+    hc::completion_future marker;
+
+    hipError_t e = hipSuccess;
+
+    try {
+        stream->locked_copySync((char *)dst, (char*)srcArray->data + wOffset, count, kind);
+    }
+    catch (ihipException &ex) {
+        e = ex._code;
+    }
+
+    return ihipLogStatus(e);
+}
+
+hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost, size_t count)
+{
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dstArray, dstOffset, srcHost, count);
+
+    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+
+    hc::completion_future marker;
+
+    hipError_t e = hipSuccess;
+    try {
+        stream->locked_copySync((char *)dstArray->data + dstOffset, srcHost, count, hipMemcpyHostToDevice);
+    } catch (ihipException &ex) {
+        e = ex._code;
+    }
+
+    return ihipLogStatus(e);
+}
+
+hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset, size_t count)
+{
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, srcArray, srcOffset, count);
+
+    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+
+    hc::completion_future marker;
+
+    hipError_t e = hipSuccess;
+
+    try {
+        stream->locked_copySync((char *)dst, (char*)srcArray->data + srcOffset, count, hipMemcpyDeviceToHost);
+    }
+    catch (ihipException &ex) {
+        e = ex._code;
+    }
+
+    return ihipLogStatus(e);
+}
+
 hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p)
 {
     HIP_INIT_SPECIAL_API((TRACE_MCMD), p);

From 368db8bf6c66333b989d961b26930fb3da542376 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Tue, 23 Jan 2018 21:43:18 +0300
Subject: [PATCH 04/24] [HIPIFY][tests][win] Fix run_test.bat

All checks should not occur in input file for FileCheck. The issue found on CHECK-NOT.
Change removes all lit checks in the hipified file based on regexp, and the resulted stdout is fed as stdin for FileCheck.
---
 tests/hipify-clang/run_test.bat | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/hipify-clang/run_test.bat b/tests/hipify-clang/run_test.bat
index 6eefb7e46e..b4858db807 100644
--- a/tests/hipify-clang/run_test.bat
+++ b/tests/hipify-clang/run_test.bat
@@ -1,4 +1,4 @@
-@echo off
+rem @echo off
 setlocal
 
 for %%i in (FileCheck.exe) do set FILE_CHECK=%%~$PATH:i
@@ -14,5 +14,6 @@ set clang_args=%4%clang_args%
 
 %HIPIFY% -o=%TMP_FILE% %IN_FILE% -- %clang_args%
 if errorlevel 1 (echo      Error: hipify-clang.exe failed with exit code: %errorlevel% && exit /b %errorlevel%)
-%FILE_CHECK% %IN_FILE% -input-file=%TMP_FILE%
+
+findstr /v /r /c:"[ ]*//[ ]*[CHECK*|RUN]" %TMP_FILE% | %FILE_CHECK% %IN_FILE%
 if errorlevel 1 (echo      Error: FileCheck.exe failed with exit code: %errorlevel% && exit /b %errorlevel%)

From c528f4f9c5101f4350c7f31dc141f32d3d8cacaf Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Tue, 23 Jan 2018 21:46:27 +0300
Subject: [PATCH 05/24] [HIPIFY][tests][win] Uncomment @echo off

---
 tests/hipify-clang/run_test.bat | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/hipify-clang/run_test.bat b/tests/hipify-clang/run_test.bat
index b4858db807..d8c8d74cf0 100644
--- a/tests/hipify-clang/run_test.bat
+++ b/tests/hipify-clang/run_test.bat
@@ -1,4 +1,4 @@
-rem @echo off
+@echo off
 setlocal
 
 for %%i in (FileCheck.exe) do set FILE_CHECK=%%~$PATH:i

From 77f807b597d788d3cc05752175790261d5697c49 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Tue, 23 Jan 2018 23:06:55 +0300
Subject: [PATCH 06/24] [HIPIFY][fix] Fix PragmaDirective

File location have to be verified, otherwise location of the first found '#pragma once' in any included header even system will be erroneously handled, which might lead to attempt to including hip_runtime.h in it.
---
 hipify-clang/src/HipifyAction.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/hipify-clang/src/HipifyAction.cpp b/hipify-clang/src/HipifyAction.cpp
index 7cd5b3d402..7d9ddebf51 100644
--- a/hipify-clang/src/HipifyAction.cpp
+++ b/hipify-clang/src/HipifyAction.cpp
@@ -198,8 +198,13 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
 }
 
 void HipifyAction::PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) {
-    if (pragmaOnce) { return; }
+    if (pragmaOnce) {
+        return;
+    }
     clang::SourceManager& SM = getCompilerInstance().getSourceManager();
+    if (!SM.isWrittenInMainFile(Loc)) {
+        return;
+    }
     clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
     const clang::Token tok = PP.LookAhead(0);
     StringRef Text(SM.getCharacterData(tok.getLocation()), tok.getLength());

From 600d5d7c06f622d6f895fe79fe950f8d0453b4a8 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Tue, 23 Jan 2018 23:43:36 +0300
Subject: [PATCH 07/24] [HIPIFY][fix] CUDA and cuBLAS main headers correct
 handling

---
 hipify-clang/src/CUDA2HipMap.cpp      |  8 +++---
 hipify-clang/src/HipifyAction.cpp     | 35 +++++++++++++++++++--------
 hipify-clang/src/HipifyAction.h       |  5 ++--
 tests/hipify-clang/headers_test_01.cu |  4 ++-
 tests/hipify-clang/headers_test_02.cu | 10 +++++---
 tests/hipify-clang/headers_test_06.cu |  8 ++++++
 tests/hipify-clang/headers_test_07.cu |  8 ++++++
 tests/hipify-clang/headers_test_08.cu | 14 +++++++++++
 8 files changed, 71 insertions(+), 21 deletions(-)
 create mode 100644 tests/hipify-clang/headers_test_06.cu
 create mode 100644 tests/hipify-clang/headers_test_07.cu
 create mode 100644 tests/hipify-clang/headers_test_08.cu

diff --git a/hipify-clang/src/CUDA2HipMap.cpp b/hipify-clang/src/CUDA2HipMap.cpp
index 1f9c6287ed..6bcc0b38c4 100644
--- a/hipify-clang/src/CUDA2HipMap.cpp
+++ b/hipify-clang/src/CUDA2HipMap.cpp
@@ -379,12 +379,12 @@ const std::map <llvm::StringRef, hipCounter> CUDA_INCLUDE_MAP{
     {"vector_types.h",       {"hip/hip_vector_types.h",   CONV_INCLUDE,             API_RUNTIME}},
 
     // CUBLAS includes
-    {"cublas.h",    {"hipblas.h", CONV_INCLUDE, API_BLAS}},
-    {"cublas_v2.h", {"hipblas.h", CONV_INCLUDE, API_BLAS}},
+    {"cublas.h",             {"hipblas.h",                CONV_INCLUDE_CUDA_MAIN_H, API_BLAS}},
+    {"cublas_v2.h",          {"hipblas.h",                CONV_INCLUDE_CUDA_MAIN_H, API_BLAS}},
 
     // CURAND includes
-    {"curand.h",             {"hiprand.h",        CONV_INCLUDE, API_RAND}},
-    {"curand_kernel.h",      {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
+    {"curand.h",             {"hiprand.h",                CONV_INCLUDE,             API_RAND}},
+    {"curand_kernel.h",      {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
 
     // HIP includes
     // TODO: uncomment this when hip/cudacommon.h will be renamed to hip/hipcommon.h
diff --git a/hipify-clang/src/HipifyAction.cpp b/hipify-clang/src/HipifyAction.cpp
index 7cd5b3d402..9ac0eacbd1 100644
--- a/hipify-clang/src/HipifyAction.cpp
+++ b/hipify-clang/src/HipifyAction.cpp
@@ -149,23 +149,38 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
     if (!SM.isWrittenInMainFile(hash_loc)) {
         return;
     }
+    if (!firstHeader) {
+        firstHeader = true;
+        firstHeaderLoc = hash_loc;
+    }
 
     const auto found = CUDA_INCLUDE_MAP.find(file_name);
     if (found == CUDA_INCLUDE_MAP.end()) {
-        if (!firstNotMainHeader) {
-            firstNotMainHeader = true;
-            firstNotMainHeaderLoc = hash_loc;
-        }
         return;
     }
 
     // Special-casing to avoid duplication of the hip_runtime include.
     bool secondMainInclude = false;
-    if (found->second.hipName == "hip/hip_runtime.h") {
-        if (insertedRuntimeHeader) {
-            secondMainInclude = true;
+    if (found->second.countType == CONV_INCLUDE_CUDA_MAIN_H) {
+        switch (found->second.countApiType) {
+            case API_DRIVER:
+            case API_RUNTIME:
+                if (insertedRuntimeHeader) {
+                    secondMainInclude = true;
+                    break;
+                }
+                insertedRuntimeHeader = true;
+                break;
+            case API_BLAS:
+                if (insertedBLASHeader) {
+                    secondMainInclude = true;
+                    break;
+                }
+                insertedBLASHeader = true;
+                break;
+            default:
+                break;
         }
-        insertedRuntimeHeader = true;
     }
 
     Statistics::current().incrementCounter(found->second, file_name.str());
@@ -356,8 +371,8 @@ void HipifyAction::EndSourceFileAction() {
         clang::SourceLocation sl;
         if (pragmaOnce) {
             sl = pragmaOnceLoc;
-        } else if (firstNotMainHeader) {
-            sl = firstNotMainHeaderLoc;
+        } else if (firstHeader) {
+            sl = firstHeaderLoc;
         } else {
             sl = SM.getLocForStartOfFile(SM.getMainFileID());
         }
diff --git a/hipify-clang/src/HipifyAction.h b/hipify-clang/src/HipifyAction.h
index a269a37117..42622c1e01 100644
--- a/hipify-clang/src/HipifyAction.h
+++ b/hipify-clang/src/HipifyAction.h
@@ -23,9 +23,10 @@ private:
     // not, we insert it at the top of the file when we finish processing it.
     // This approach means we do the best it's possible to do w.r.t preserving the user's include order.
     bool insertedRuntimeHeader = false;
-    bool firstNotMainHeader = false;
+    bool insertedBLASHeader = false;
+    bool firstHeader = false;
     bool pragmaOnce = false;
-    clang::SourceLocation firstNotMainHeaderLoc;
+    clang::SourceLocation firstHeaderLoc;
     clang::SourceLocation pragmaOnceLoc;
 
     /**
diff --git a/tests/hipify-clang/headers_test_01.cu b/tests/hipify-clang/headers_test_01.cu
index c39ef80d8f..3747c339e8 100644
--- a/tests/hipify-clang/headers_test_01.cu
+++ b/tests/hipify-clang/headers_test_01.cu
@@ -1,6 +1,8 @@
 // RUN: %run_test hipify "%s" "%t" %cuda_args
 
 // CHECK: #include <hip/hip_runtime.h>
+// CHECK-NOT: #include <cuda_runtime.h>
+// CHECK: #include <stdio.h>
 #include <cuda.h>
-// CHECK-NOT: #include<cuda_runtime.h>
 #include <cuda_runtime.h>
+#include <stdio.h>
diff --git a/tests/hipify-clang/headers_test_02.cu b/tests/hipify-clang/headers_test_02.cu
index 90d412f797..57308efd59 100644
--- a/tests/hipify-clang/headers_test_02.cu
+++ b/tests/hipify-clang/headers_test_02.cu
@@ -1,6 +1,8 @@
 // RUN: %run_test hipify "%s" "%t" %cuda_args
 
-// CHECK: #include <hip/hip_runtime.h>
-#include <cuda_runtime.h>
-// CHECK-NOT: #include<cuda.h>
-#include <cuda.h>
+// CHECK: #include "hip/hip_runtime.h"
+// CHECK-NOT: #include "cuda_runtime.h"
+// CHECK: #include <stdio.h>
+#include "cuda.h"
+#include "cuda_runtime.h"
+#include <stdio.h>
diff --git a/tests/hipify-clang/headers_test_06.cu b/tests/hipify-clang/headers_test_06.cu
new file mode 100644
index 0000000000..bce73c42df
--- /dev/null
+++ b/tests/hipify-clang/headers_test_06.cu
@@ -0,0 +1,8 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// CHECK: #include <hipblas.h>
+// CHECK-NOT: #include <cublas_v2.h>
+// CHECK: #include <stdio.h>
+#include <cublas.h>
+#include <cublas_v2.h>
+#include <stdio.h>
diff --git a/tests/hipify-clang/headers_test_07.cu b/tests/hipify-clang/headers_test_07.cu
new file mode 100644
index 0000000000..4237e1eb72
--- /dev/null
+++ b/tests/hipify-clang/headers_test_07.cu
@@ -0,0 +1,8 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// CHECK: #include "hipblas.h"
+// CHECK-NOT: #include "cublas.h"
+// CHECK: #include <stdio.h>
+#include "cublas_v2.h"
+#include "cublas.h"
+#include <stdio.h>
diff --git a/tests/hipify-clang/headers_test_08.cu b/tests/hipify-clang/headers_test_08.cu
new file mode 100644
index 0000000000..ad54871bd8
--- /dev/null
+++ b/tests/hipify-clang/headers_test_08.cu
@@ -0,0 +1,14 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// CHECK: #include <hip/hip_runtime.h>
+// CHECK-NOT: #include <cuda_runtime.h>
+// CHECK: #include <iostream>
+// CHECK: #include "hipblas.h"
+// CHECK-NOT: #include "cublas.h"
+// CHECK: #include <stdio.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <iostream>
+#include "cublas_v2.h"
+#include "cublas.h"
+#include <stdio.h>

From f9416a0c49737f0ac19f42565ab6e93a5ea178f1 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Wed, 24 Jan 2018 20:13:23 +0300
Subject: [PATCH 08/24] [HIPIFY][tests][win] Make cudaRegister.cu building on
 Windows as well

---
 tests/hipify-clang/cudaRegister.cu | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/hipify-clang/cudaRegister.cu b/tests/hipify-clang/cudaRegister.cu
index 79d21707c2..43b4345337 100644
--- a/tests/hipify-clang/cudaRegister.cu
+++ b/tests/hipify-clang/cudaRegister.cu
@@ -22,7 +22,12 @@ THE SOFTWARE.
 #include<cuda.h>
 #include<cuda_runtime.h>
 #include<iostream>
-#include<unistd.h>
+#ifdef _WIN32
+#include <windows.h>
+#define sleep(x) Sleep(x)
+#else
+#include <unistd.h>
+#endif
 #include<stdio.h>
 #include<malloc.h>
 
@@ -33,7 +38,7 @@ THE SOFTWARE.
 // CHECK: if(status != hipSuccess) {
 #define check(msg, status){ \
 if(status != cudaSuccess) { \
-	printf("%s failed. \n", #msg); \
+  printf("%s failed. \n", #msg); \
 } \
 }
 

From 0497424978de459d817432efbafb73c2df544b46 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Wed, 24 Jan 2018 17:00:57 -0600
Subject: [PATCH 09/24] Fixing rocblas build failure

with ::Bundled_code_header constructor

Disabling hipPrintfKernel test from CI
---
 Jenkinsfile                                   | 3 ++-
 include/hip/hcc_detail/code_object_bundle.hpp | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 4909e666f0..6142b94d95 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -160,6 +160,7 @@ def docker_build_inside_image( def build_image, String inside_args, String platf
     }
 
     // Cap the maximum amount of testing, in case of hangs
+    // Excluding hipPrintfKernel test from automation; variable fails on CI test machines
     timeout(time: 1, unit: 'HOURS')
     {
       stage("${platform} unit testing")
@@ -169,7 +170,7 @@ def docker_build_inside_image( def build_image, String inside_args, String platf
             cd ${build_dir_rel}
             make install -j\$(nproc)
             make build_tests -i -j\$(nproc)
-            make test
+            ctest -E hipPrintfKernel
           """
         // If unit tests output a junit or xunit file in the future, jenkins can parse that file
         // to display test results on the dashboard
diff --git a/include/hip/hcc_detail/code_object_bundle.hpp b/include/hip/hcc_detail/code_object_bundle.hpp
index 72f9d35c73..2bec0017db 100644
--- a/include/hip/hcc_detail/code_object_bundle.hpp
+++ b/include/hip/hcc_detail/code_object_bundle.hpp
@@ -150,8 +150,8 @@ namespace hip_impl
     };
 
     // CREATORS
-    template<typename I>
-    Bundled_code_header::Bundled_code_header(I f, I l) : Bundled_code_header{}
+    template<typename RandomAccessIterator>
+    Bundled_code_header::Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l) : Bundled_code_header{}
     {
         read(f, l, *this);
     }

From 8acc8365fa9a424e76c5b1d6d34b121c23ed8de9 Mon Sep 17 00:00:00 2001
From: Siu Chi Chan <siuchi.chan@amd.com>
Date: Thu, 25 Jan 2018 16:51:29 +0000
Subject: [PATCH 10/24] use assign rather than insert

---
 include/hip/hcc_detail/code_object_bundle.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/hip/hcc_detail/code_object_bundle.hpp b/include/hip/hcc_detail/code_object_bundle.hpp
index 72f9d35c73..1b2dfc1c7c 100644
--- a/include/hip/hcc_detail/code_object_bundle.hpp
+++ b/include/hip/hcc_detail/code_object_bundle.hpp
@@ -88,7 +88,7 @@ namespace hip_impl
                     std::copy_n(it, sizeof(y.cbuf), y.cbuf);
                     it += sizeof(y.cbuf);
 
-                    y.triple.insert(y.triple.cend(), it, it + y.triple_sz);
+                    y.triple.assign(it, it + y.triple_sz);
 
                     std::copy_n(
                         f + y.offset, y.bundle_sz, std::back_inserter(y.blob));
@@ -155,4 +155,4 @@ namespace hip_impl
     {
         read(f, l, *this);
     }
-} // Namespace hip_impl.
\ No newline at end of file
+} // Namespace hip_impl.

From 98f3fe3939148599367f24faf27564ffdf926230 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Thu, 25 Jan 2018 10:52:56 -0600
Subject: [PATCH 11/24] Remove archiving of RPM

We should archive RPM's from proper centos/fedora machines so that we
get the proper dependencies right
---
 Jenkinsfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 6142b94d95..6e5f7bc8e5 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -151,7 +151,7 @@ def docker_build_inside_image( def build_image, String inside_args, String platf
       // The rm command needs to run as sudo because the test steps below create files owned by root
       sh  """#!/usr/bin/env bash
           set -x
-          sudo rm -rf ${build_dir_rel}
+          rm -rf ${build_dir_rel}
           mkdir -p ${build_dir_rel}
           cd ${build_dir_rel}
           cmake -DCMAKE_BUILD_TYPE=${build_config} -DCMAKE_INSTALL_PREFIX=staging ${optional_configure} ${source_hip_abs}
@@ -194,7 +194,7 @@ def docker_build_inside_image( def build_image, String inside_args, String platf
         if( platform.toLowerCase( ).startsWith( 'hcc-ctu' ) )
         {
           archiveArtifacts artifacts: "${build_dir_rel}/*.deb", fingerprint: true
-          archiveArtifacts artifacts: "${build_dir_rel}/*.rpm", fingerprint: true
+          // archiveArtifacts artifacts: "${build_dir_rel}/*.rpm", fingerprint: true
         }
       }
     }

From 9860d5de20116d88943ed6512ebd9ea95d87a22b Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Fri, 26 Jan 2018 07:02:49 +0530
Subject: [PATCH 12/24] Disable md2html conversion in hip doc package

---
 packaging/hip_doc.txt | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/packaging/hip_doc.txt b/packaging/hip_doc.txt
index daef7810b4..5987c2c128 100644
--- a/packaging/hip_doc.txt
+++ b/packaging/hip_doc.txt
@@ -8,12 +8,12 @@ if(DOXYGEN_EXE)
     install(DIRECTORY RuntimeAPI/html DESTINATION docs/docs/RuntimeAPI)
 endif()
 
-find_program(GRIP_EXE grip)
-if(GRIP_EXE)
-    add_custom_target(convert_md_to_html ALL
-                  COMMAND @hip_SOURCE_DIR@/packaging/convert_md_to_html.sh @hip_SOURCE_DIR@ ${PROJECT_BINARY_DIR}/md2html)
-    install(DIRECTORY md2html/ DESTINATION docs)
-endif()
+#find_program(GRIP_EXE grip)
+#if(GRIP_EXE)
+#    add_custom_target(convert_md_to_html ALL
+#                  COMMAND @hip_SOURCE_DIR@/packaging/convert_md_to_html.sh @hip_SOURCE_DIR@ ${PROJECT_BINARY_DIR}/md2html)
+#    install(DIRECTORY md2html/ DESTINATION docs)
+#endif()
 
 #############################
 # Packaging steps

From 02e23c4d87b0b69615650879e623d69320fd1e1e Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Mon, 29 Jan 2018 18:33:47 +0300
Subject: [PATCH 13/24] [HIPIFY] InclusionDirective refactoring

Due to support of cuRAND headers.

+ compound test on all headers is added;
+ missing entities are added with updating the doc;
+ a couple cuRAND tests are added (https://github.com/ROCmSoftwarePlatform/rocRAND/tree/master/benchmark):
  - the following CUDA entities are still unsupported by hipRAND:
      curandMakeMTGP32Constants
      curandMakeMTGP32KernelState
      curandGetDirectionVectors32
      curandDirectionVectorSet_t
      CURAND_DIRECTION_VECTORS_32_JOEKUO6
      curandStateSobol64_t
      curandStateScrambledSobol64_t
      curandGenerateLongLong
  - and the following - by HIP:
      cudaRuntimeGetVersion
  - those entities are handled by CHECK-NOT directive for now.
---
 docs/markdown/CURAND_API_supported_by_HIP.md  |   2 +
 hipify-clang/src/CUDA2HipMap.cpp              |  45 +-
 hipify-clang/src/HipifyAction.cpp             |  68 +-
 hipify-clang/src/HipifyAction.h               |   5 +
 hipify-clang/src/Statistics.cpp               |   4 +-
 hipify-clang/src/Statistics.h                 |   4 +-
 .../cuRAND/benchmark_curand_generate.cpp      | 393 ++++++++++
 .../cuRAND/benchmark_curand_kernel.cpp        | 669 ++++++++++++++++++
 tests/hipify-clang/cuRAND/cmdparser.hpp       | 513 ++++++++++++++
 tests/hipify-clang/headers_test_09.cu         |  93 +++
 tests/hipify-clang/lit.cfg                    |   2 +
 11 files changed, 1756 insertions(+), 42 deletions(-)
 create mode 100644 tests/hipify-clang/cuRAND/benchmark_curand_generate.cpp
 create mode 100644 tests/hipify-clang/cuRAND/benchmark_curand_kernel.cpp
 create mode 100644 tests/hipify-clang/cuRAND/cmdparser.hpp
 create mode 100644 tests/hipify-clang/headers_test_09.cu

diff --git a/docs/markdown/CURAND_API_supported_by_HIP.md b/docs/markdown/CURAND_API_supported_by_HIP.md
index 900cfadc03..aae43af272 100644
--- a/docs/markdown/CURAND_API_supported_by_HIP.md
+++ b/docs/markdown/CURAND_API_supported_by_HIP.md
@@ -154,6 +154,8 @@
 | `curand_normal2_double`                                   | `hiprand_normal2_double`                   |
 | `curand_normal4`                                          | `hiprand_normal4`                          |
 | `curand_normal4_double`                                   | `hiprand_normal4_double`                   |
+| `curand_uniform`                                          | `hiprand_uniform`                          |
+| `curand_uniform_double`                                   | `hiprand_uniform_double`                   |
 | `curand_uniform2_double`                                  | `hiprand_uniform2_double`                  |
 | `curand_uniform4`                                         | `hiprand_uniform4`                         |
 | `curand_uniform4_double`                                  | `hiprand_uniform4_double`                  |
diff --git a/hipify-clang/src/CUDA2HipMap.cpp b/hipify-clang/src/CUDA2HipMap.cpp
index 6bcc0b38c4..7001deb2e7 100644
--- a/hipify-clang/src/CUDA2HipMap.cpp
+++ b/hipify-clang/src/CUDA2HipMap.cpp
@@ -367,24 +367,39 @@ const std::map<llvm::StringRef, hipCounter> CUDA_TYPE_NAME_MAP{
 /// Maps cuda header names to hip header names.
 const std::map <llvm::StringRef, hipCounter> CUDA_INCLUDE_MAP{
     // CUDA includes
-    {"cuda.h",               {"hip/hip_runtime.h",        CONV_INCLUDE_CUDA_MAIN_H, API_DRIVER}},
-    {"cuda_runtime.h",       {"hip/hip_runtime.h",        CONV_INCLUDE_CUDA_MAIN_H, API_RUNTIME}},
-    {"cuda_runtime_api.h",   {"hip/hip_runtime_api.h",    CONV_INCLUDE,             API_RUNTIME}},
-    {"channel_descriptor.h", {"hip/channel_descriptor.h", CONV_INCLUDE,             API_RUNTIME}},
-    {"device_functions.h",   {"hip/device_functions.h",   CONV_INCLUDE,             API_RUNTIME}},
-    {"driver_types.h",       {"hip/driver_types.h",       CONV_INCLUDE,             API_RUNTIME}},
-    {"cuComplex.h",          {"hip/hip_complex.h",        CONV_INCLUDE,             API_RUNTIME}},
-    {"cuda_fp16.h",          {"hip/hip_fp16.h",           CONV_INCLUDE,             API_RUNTIME}},
-    {"cuda_texture_types.h", {"hip/hip_texture_types.h",  CONV_INCLUDE,             API_RUNTIME}},
-    {"vector_types.h",       {"hip/hip_vector_types.h",   CONV_INCLUDE,             API_RUNTIME}},
+    {"cuda.h",                    {"hip/hip_runtime.h",        CONV_INCLUDE_CUDA_MAIN_H, API_DRIVER}},
+    {"cuda_runtime.h",            {"hip/hip_runtime.h",        CONV_INCLUDE_CUDA_MAIN_H, API_RUNTIME}},
+    {"cuda_runtime_api.h",        {"hip/hip_runtime_api.h",    CONV_INCLUDE,             API_RUNTIME}},
+    {"channel_descriptor.h",      {"hip/channel_descriptor.h", CONV_INCLUDE,             API_RUNTIME}},
+    {"device_functions.h",        {"hip/device_functions.h",   CONV_INCLUDE,             API_RUNTIME}},
+    {"driver_types.h",            {"hip/driver_types.h",       CONV_INCLUDE,             API_RUNTIME}},
+    {"cuComplex.h",               {"hip/hip_complex.h",        CONV_INCLUDE,             API_RUNTIME}},
+    {"cuda_fp16.h",               {"hip/hip_fp16.h",           CONV_INCLUDE,             API_RUNTIME}},
+    {"cuda_texture_types.h",      {"hip/hip_texture_types.h",  CONV_INCLUDE,             API_RUNTIME}},
+    {"vector_types.h",            {"hip/hip_vector_types.h",   CONV_INCLUDE,             API_RUNTIME}},
 
     // CUBLAS includes
-    {"cublas.h",             {"hipblas.h",                CONV_INCLUDE_CUDA_MAIN_H, API_BLAS}},
-    {"cublas_v2.h",          {"hipblas.h",                CONV_INCLUDE_CUDA_MAIN_H, API_BLAS}},
+    {"cublas.h",                  {"hipblas.h",                CONV_INCLUDE_CUDA_MAIN_H, API_BLAS}},
+    {"cublas_v2.h",               {"hipblas.h",                CONV_INCLUDE_CUDA_MAIN_H, API_BLAS}},
 
     // CURAND includes
-    {"curand.h",             {"hiprand.h",                CONV_INCLUDE,             API_RAND}},
-    {"curand_kernel.h",      {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand.h",                  {"hiprand.h",                CONV_INCLUDE_CUDA_MAIN_H, API_RAND}},
+    {"curand_kernel.h",           {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_discrete.h",         {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_discrete2.h",        {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_globals.h",          {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_lognormal.h",        {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_mrg32k3a.h",         {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_mtgp32.h",           {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_mtgp32_host.h",      {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_mtgp32_kernel.h",    {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_mtgp32dc_p_11213.h", {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_normal.h",           {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_normal_static.h",    {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_philox4x32_x.h",     {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_poisson.h",          {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_precalc.h",          {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_uniform.h",          {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
 
     // HIP includes
     // TODO: uncomment this when hip/cudacommon.h will be renamed to hip/hipcommon.h
@@ -2852,6 +2867,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_IDENTIFIER_MAP{
     {"curand_normal2_double",                         {"hiprand_normal2_double",                         CONV_DEVICE_FUNC, API_RAND}},
     {"curand_normal4",                                {"hiprand_normal4",                                CONV_DEVICE_FUNC, API_RAND}},
     {"curand_normal4_double",                         {"hiprand_normal4_double",                         CONV_DEVICE_FUNC, API_RAND}},
+    {"curand_uniform",                                {"hiprand_uniform",                                CONV_DEVICE_FUNC, API_RAND}},
+    {"curand_uniform_double",                         {"hiprand_uniform_double",                         CONV_DEVICE_FUNC, API_RAND}},
     {"curand_uniform2_double",                        {"hiprand_uniform2_double",                        CONV_DEVICE_FUNC, API_RAND}},
     {"curand_uniform4",                               {"hiprand_uniform4",                               CONV_DEVICE_FUNC, API_RAND}},
     {"curand_uniform4_double",                        {"hiprand_uniform4_double",                        CONV_DEVICE_FUNC, API_RAND}},
diff --git a/hipify-clang/src/HipifyAction.cpp b/hipify-clang/src/HipifyAction.cpp
index 3459ea21f9..76efe9bf85 100644
--- a/hipify-clang/src/HipifyAction.cpp
+++ b/hipify-clang/src/HipifyAction.cpp
@@ -137,6 +137,48 @@ std::string stringifyZeroDefaultedArg(clang::SourceManager& SM, const clang::Exp
 
 } // anonymous namespace
 
+bool HipifyAction::Exclude(const hipCounter & hipToken) {
+    switch (hipToken.type) {
+        case CONV_INCLUDE_CUDA_MAIN_H:
+            switch (hipToken.apiType) {
+                case API_DRIVER:
+                case API_RUNTIME:
+                    if (insertedRuntimeHeader) { return true; }
+                    insertedRuntimeHeader = true;
+                    return false;
+                case API_BLAS:
+                  if (insertedBLASHeader) { return true; }
+                    insertedBLASHeader = true;
+                    return false;
+                case API_RAND:
+                    if (hipToken.hipName == "hiprand_kernel.h") {
+                        if (insertedRAND_kernelHeader) { return true; }
+                        insertedRAND_kernelHeader = true;
+                        return false;
+                    } else if (hipToken.hipName == "hiprand.h") {
+                        if (insertedRANDHeader) { return true; }
+                        insertedRANDHeader = true;
+                        return false;
+                    }
+                default:
+                    return false;
+            }
+            return false;
+        case CONV_INCLUDE:
+            switch (hipToken.apiType) {
+                case API_RAND:
+                    if (insertedRAND_kernelHeader) { return true; }
+                    insertedRAND_kernelHeader = true;
+                    return false;
+                default:
+                    return false;
+            }
+            return false;
+        default:
+            return false;
+    }
+    return false;
+}
 
 void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
                                       const clang::Token&,
@@ -159,29 +201,7 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
         return;
     }
 
-    // Special-casing to avoid duplication of the hip_runtime include.
-    bool secondMainInclude = false;
-    if (found->second.countType == CONV_INCLUDE_CUDA_MAIN_H) {
-        switch (found->second.countApiType) {
-            case API_DRIVER:
-            case API_RUNTIME:
-                if (insertedRuntimeHeader) {
-                    secondMainInclude = true;
-                    break;
-                }
-                insertedRuntimeHeader = true;
-                break;
-            case API_BLAS:
-                if (insertedBLASHeader) {
-                    secondMainInclude = true;
-                    break;
-                }
-                insertedBLASHeader = true;
-                break;
-            default:
-                break;
-        }
-    }
+    bool exclude = Exclude(found->second);
 
     Statistics::current().incrementCounter(found->second, file_name.str());
 
@@ -195,7 +215,7 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
     clang::StringRef newInclude;
 
     // Keep the same include type that the user gave.
-    if (!secondMainInclude) {
+    if (!exclude) {
         clang::SmallString<128> includeBuffer;
         if (is_angled) {
             newInclude = llvm::Twine("<" + found->second.hipName + ">").toStringRef(includeBuffer);
diff --git a/hipify-clang/src/HipifyAction.h b/hipify-clang/src/HipifyAction.h
index 42622c1e01..8c2195b28d 100644
--- a/hipify-clang/src/HipifyAction.h
+++ b/hipify-clang/src/HipifyAction.h
@@ -6,6 +6,7 @@
 #include "clang/Tooling/Core/Replacement.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "ReplacementsFrontendActionFactory.h"
+#include "Statistics.h"
 
 namespace ct = clang::tooling;
 
@@ -24,6 +25,8 @@ private:
     // This approach means we do the best it's possible to do w.r.t preserving the user's include order.
     bool insertedRuntimeHeader = false;
     bool insertedBLASHeader = false;
+    bool insertedRANDHeader = false;
+    bool insertedRAND_kernelHeader = false;
     bool firstHeader = false;
     bool pragmaOnce = false;
     clang::SourceLocation firstHeaderLoc;
@@ -90,4 +93,6 @@ protected:
     void run(const clang::ast_matchers::MatchFinder::MatchResult& Result) override;
 
     std::unique_ptr<clang::ASTConsumer> CreateASTConsumer(clang::CompilerInstance &CI, llvm::StringRef InFile) override;
+
+    bool Exclude(const hipCounter & hipToken);
 };
diff --git a/hipify-clang/src/Statistics.cpp b/hipify-clang/src/Statistics.cpp
index e2b3632066..4c5664a6b0 100644
--- a/hipify-clang/src/Statistics.cpp
+++ b/hipify-clang/src/Statistics.cpp
@@ -53,8 +53,8 @@ void printStat(std::ostream *csv, llvm::raw_ostream* printOut, const std::string
 
 void StatCounter::incrementCounter(const hipCounter& counter, std::string name) {
     counters[name]++;
-    apiCounters[(int) counter.countApiType]++;
-    convTypeCounters[(int) counter.countType]++;
+    apiCounters[(int) counter.apiType]++;
+    convTypeCounters[(int) counter.type]++;
 }
 
 void StatCounter::add(const StatCounter& other) {
diff --git a/hipify-clang/src/Statistics.h b/hipify-clang/src/Statistics.h
index f160ca7383..81be7b09a8 100644
--- a/hipify-clang/src/Statistics.h
+++ b/hipify-clang/src/Statistics.h
@@ -67,8 +67,8 @@ extern const char *apiNames[NUM_API_TYPES];
 
 struct hipCounter {
     llvm::StringRef hipName;
-    ConvTypes countType;
-    ApiTypes countApiType;
+    ConvTypes type;
+    ApiTypes apiType;
     bool unsupported;
 };
 
diff --git a/tests/hipify-clang/cuRAND/benchmark_curand_generate.cpp b/tests/hipify-clang/cuRAND/benchmark_curand_generate.cpp
new file mode 100644
index 0000000000..8d284c82c6
--- /dev/null
+++ b/tests/hipify-clang/cuRAND/benchmark_curand_generate.cpp
@@ -0,0 +1,393 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <string>
+#include <chrono>
+#include <numeric>
+#include <utility>
+#include <algorithm>
+
+#include "cmdparser.hpp"
+// CHECK: #include <hip/hip_runtime.h>
+#include <cuda_runtime.h>
+// CHECK: #include <hiprand.h>
+#include <curand.h>
+
+// CHECK: if((x)!=hipSuccess) {
+#define CUDA_CALL(x) do { if((x)!=cudaSuccess) { \
+    printf("Error at %s:%d\n",__FILE__,__LINE__);\
+    exit(EXIT_FAILURE);}} while(0)
+// CHECK: if((x)!=HIPRAND_STATUS_SUCCESS) {
+#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
+    printf("Error at %s:%d\n",__FILE__,__LINE__);\
+    exit(EXIT_FAILURE);}} while(0)
+
+#ifndef DEFAULT_RAND_N
+const size_t DEFAULT_RAND_N = 1024 * 1024 * 128;
+#endif
+
+// CHECK: typedef hiprandRngType_t rng_type_t;
+typedef curandRngType rng_type_t;
+
+// CHECK: using generate_func_type = std::function<hiprandStatus_t(hiprandGenerator_t, T *, size_t)>;
+template<typename T>
+using generate_func_type = std::function<curandStatus_t(curandGenerator_t, T *, size_t)>;
+
+template<typename T>
+void run_benchmark(const cli::Parser& parser,
+                   const rng_type_t rng_type,
+                   generate_func_type<T> generate_func)
+{
+    const size_t size = parser.get<size_t>("size");
+    const size_t trials = parser.get<size_t>("trials");
+
+    T * data;
+    // CHECK: CUDA_CALL(hipMalloc((void **)&data, size * sizeof(T)));
+    CUDA_CALL(cudaMalloc((void **)&data, size * sizeof(T)));
+
+    // CHECK: hiprandGenerator_t generator;
+    // CHECK: CURAND_CALL(hiprandCreateGenerator(&generator, rng_type));
+    curandGenerator_t generator;
+    CURAND_CALL(curandCreateGenerator(&generator, rng_type));
+
+    const size_t dimensions = parser.get<size_t>("dimensions");
+    // CHECK: hiprandStatus_t status = hiprandSetQuasiRandomGeneratorDimensions(generator, dimensions);
+    // CHECK: if (status != HIPRAND_STATUS_TYPE_ERROR)
+    curandStatus_t status = curandSetQuasiRandomGeneratorDimensions(generator, dimensions);
+    if (status != CURAND_STATUS_TYPE_ERROR) // If the RNG is not quasi-random
+    {
+        CURAND_CALL(status);
+    }
+
+    // Warm-up
+    for (size_t i = 0; i < 5; i++)
+    {
+        CURAND_CALL(generate_func(generator, data, size));
+    }
+    // CHECK: CUDA_CALL(hipDeviceSynchronize());
+    CUDA_CALL(cudaDeviceSynchronize());
+
+    // Measurement
+    auto start = std::chrono::high_resolution_clock::now();
+    for (size_t i = 0; i < trials; i++)
+    {
+        CURAND_CALL(generate_func(generator, data, size));
+    }
+    // CHECK: CUDA_CALL(hipDeviceSynchronize());
+    CUDA_CALL(cudaDeviceSynchronize());
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double, std::milli> elapsed = end - start;
+
+    std::cout << std::fixed << std::setprecision(3)
+              << "      "
+              << "Throughput = "
+              << std::setw(8) << (trials * size * sizeof(T)) /
+                    (elapsed.count() / 1e3 * (1 << 30))
+              << " GB/s, Samples = "
+              << std::setw(8) << (trials * size) /
+                    (elapsed.count() / 1e3 * (1 << 30))
+              << " GSample/s, AvgTime (1 trial) = "
+              << std::setw(8) << elapsed.count() / trials
+              << " ms, Time (all) = "
+              << std::setw(8) << elapsed.count()
+              << " ms, Size = " << size
+              << std::endl;
+    // CHECK: CURAND_CALL(hiprandDestroyGenerator(generator));
+    // CHECK: CUDA_CALL(hipFree(data));
+    CURAND_CALL(curandDestroyGenerator(generator));
+    CUDA_CALL(cudaFree(data));
+}
+
+void run_benchmarks(const cli::Parser& parser,
+                    const rng_type_t rng_type,
+                    const std::string& distribution)
+{
+    if (distribution == "uniform-uint")
+    {
+        // CHECK: if (rng_type != HIPRAND_RNG_QUASI_SOBOL64 &&
+        // CHECK: rng_type != HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL64)
+        if (rng_type != CURAND_RNG_QUASI_SOBOL64 &&
+            rng_type != CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
+        {
+            run_benchmark<unsigned int>(parser, rng_type,
+                // CHECK: [](hiprandGenerator_t gen, unsigned int * data, size_t size) {
+                // CHECK: return hiprandGenerate(gen, data, size);
+                [](curandGenerator_t gen, unsigned int * data, size_t size) {
+                    return curandGenerate(gen, data, size);
+                }
+            );
+        }
+    }
+    if (distribution == "uniform-long-long")
+    {
+        // CHECK: if (rng_type == HIPRAND_RNG_QUASI_SOBOL64 ||
+        // CHECK: rng_type == HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL64)
+        if (rng_type == CURAND_RNG_QUASI_SOBOL64 ||
+            rng_type == CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
+        {
+            run_benchmark<unsigned long long>(parser, rng_type,
+                // CHECK: [](hiprandGenerator_t gen, unsigned long long * data, size_t size) {
+                [](curandGenerator_t gen, unsigned long long * data, size_t size) {
+                    // curandGenerateLongLong is yet unsupported by HIP
+                    // CHECK-NOT: return hiprandGenerateLongLong(gen, data, size);
+                    return curandGenerateLongLong(gen, data, size);
+                }
+            );
+        }
+    }
+    if (distribution == "uniform-float")
+    {
+        run_benchmark<float>(parser, rng_type,
+            // CHECK: [](hiprandGenerator_t gen, float * data, size_t size) {
+            // CHECK: return hiprandGenerateUniform(gen, data, size);
+            [](curandGenerator_t gen, float * data, size_t size) {
+                return curandGenerateUniform(gen, data, size);
+            }
+        );
+    }
+    if (distribution == "uniform-double")
+    {
+        run_benchmark<double>(parser, rng_type,
+            // CHECK: [](hiprandGenerator_t gen, double * data, size_t size) {
+            // CHECK: return hiprandGenerateUniformDouble(gen, data, size);
+            [](curandGenerator_t gen, double * data, size_t size) {
+                return curandGenerateUniformDouble(gen, data, size);
+            }
+        );
+    }
+    if (distribution == "normal-float")
+    {
+        run_benchmark<float>(parser, rng_type,
+            // CHECK: [](hiprandGenerator_t gen, float * data, size_t size) {
+            // CHECK: return hiprandGenerateNormal(gen, data, size, 0.0f, 1.0f);
+            [](curandGenerator_t gen, float * data, size_t size) {
+                return curandGenerateNormal(gen, data, size, 0.0f, 1.0f);
+            }
+        );
+    }
+    if (distribution == "normal-double")
+    {
+        run_benchmark<double>(parser, rng_type,
+            // CHECK: [](hiprandGenerator_t gen, double * data, size_t size) {
+            // CHECK: return hiprandGenerateNormalDouble(gen, data, size, 0.0, 1.0);
+            [](curandGenerator_t gen, double * data, size_t size) {
+                return curandGenerateNormalDouble(gen, data, size, 0.0, 1.0);
+            }
+        );
+    }
+    if (distribution == "log-normal-float")
+    {
+        run_benchmark<float>(parser, rng_type,
+            // CHECK: [](hiprandGenerator_t gen, float * data, size_t size) {
+            // CHECK: return hiprandGenerateLogNormal(gen, data, size, 0.0f, 1.0f);
+            [](curandGenerator_t gen, float * data, size_t size) {
+                return curandGenerateLogNormal(gen, data, size, 0.0f, 1.0f);
+            }
+        );
+    }
+    if (distribution == "log-normal-double")
+    {
+        run_benchmark<double>(parser, rng_type,
+            // CHECK: [](hiprandGenerator_t gen, double * data, size_t size) {
+            // CHECK: return hiprandGenerateLogNormalDouble(gen, data, size, 0.0, 1.0);
+            [](curandGenerator_t gen, double * data, size_t size) {
+                return curandGenerateLogNormalDouble(gen, data, size, 0.0, 1.0);
+            }
+        );
+    }
+    if (distribution == "poisson")
+    {
+        const auto lambdas = parser.get<std::vector<double>>("lambda");
+        for (double lambda : lambdas)
+        {
+            std::cout << "    " << "lambda "
+                 << std::fixed << std::setprecision(1) << lambda << std::endl;
+            run_benchmark<unsigned int>(parser, rng_type,
+                // CHECK: [lambda](hiprandGenerator_t gen, unsigned int * data, size_t size) {
+                // CHECK: return hiprandGeneratePoisson(gen, data, size, lambda);
+                [lambda](curandGenerator_t gen, unsigned int * data, size_t size) {
+                    return curandGeneratePoisson(gen, data, size, lambda);
+                }
+            );
+        }
+    }
+}
+
+const std::vector<std::string> all_engines = {
+    "xorwow",
+    "mrg32k3a",
+    "mtgp32",
+    // "mt19937",
+    "philox",
+    "sobol32",
+    // "scrambled_sobol32",
+    // "sobol64",
+    // "scrambled_sobol64",
+};
+
+const std::vector<std::string> all_distributions = {
+    "uniform-uint",
+    "uniform-long-long",
+    "uniform-float",
+    "uniform-double",
+    "normal-float",
+    "normal-double",
+    "log-normal-float",
+    "log-normal-double",
+    "poisson"
+};
+
+int main(int argc, char *argv[])
+{
+    cli::Parser parser(argc, argv);
+
+    const std::string distribution_desc =
+        "space-separated list of distributions:" +
+        std::accumulate(all_distributions.begin(), all_distributions.end(), std::string(),
+            [](std::string a, std::string b) {
+                return a + "\n      " + b;
+            }
+        ) +
+        "\n      or all";
+    const std::string engine_desc =
+        "space-separated list of random number engines:" +
+        std::accumulate(all_engines.begin(), all_engines.end(), std::string(),
+            [](std::string a, std::string b) {
+                return a + "\n      " + b;
+            }
+        ) +
+        "\n      or all";
+
+    parser.set_optional<size_t>("size", "size", DEFAULT_RAND_N, "number of values");
+    parser.set_optional<size_t>("dimensions", "dimensions", 1, "number of dimensions of quasi-random values");
+    parser.set_optional<size_t>("trials", "trials", 20, "number of trials");
+    parser.set_optional<std::vector<std::string>>("dis", "dis", {"uniform-uint"}, distribution_desc.c_str());
+    parser.set_optional<std::vector<std::string>>("engine", "engine", {"philox"}, engine_desc.c_str());
+    parser.set_optional<std::vector<double>>("lambda", "lambda", {10.0}, "space-separated list of lambdas of Poisson distribution");
+    parser.run_and_exit_if_error();
+
+    std::vector<std::string> engines;
+    {
+        auto es = parser.get<std::vector<std::string>>("engine");
+        if (std::find(es.begin(), es.end(), "all") != es.end())
+        {
+            engines = all_engines;
+        }
+        else
+        {
+            for (auto e : all_engines)
+            {
+                if (std::find(es.begin(), es.end(), e) != es.end())
+                    engines.push_back(e);
+            }
+        }
+    }
+
+    std::vector<std::string> distributions;
+    {
+        auto ds = parser.get<std::vector<std::string>>("dis");
+        if (std::find(ds.begin(), ds.end(), "all") != ds.end())
+        {
+            distributions = all_distributions;
+        }
+        else
+        {
+            for (auto d : all_distributions)
+            {
+                if (std::find(ds.begin(), ds.end(), d) != ds.end())
+                    distributions.push_back(d);
+            }
+        }
+    }
+
+    int version;
+    // CHECK: CURAND_CALL(hiprandGetVersion(&version));
+    CURAND_CALL(curandGetVersion(&version));
+    int runtime_version;
+    // cudaRuntimeGetVersion is yet unsupported by HIP
+    // CHECK-NOT: CUDA_CALL(hipRuntimeGetVersion(&runtime_version));
+    CUDA_CALL(cudaRuntimeGetVersion(&runtime_version));
+    int device_id;
+    // CHECK: CUDA_CALL(hipGetDevice(&device_id));
+    // CHECK: hipDeviceProp_t props;
+    // CHECK: CUDA_CALL(hipGetDeviceProperties(&props, device_id));
+    CUDA_CALL(cudaGetDevice(&device_id));
+    cudaDeviceProp props;
+    CUDA_CALL(cudaGetDeviceProperties(&props, device_id));
+
+    std::cout << "cuRAND: " << version << " ";
+    std::cout << "Runtime: " << runtime_version << " ";
+    std::cout << "Device: " << props.name;
+    std::cout << std::endl << std::endl;
+
+    for (auto engine : engines)
+    {
+        // CHECK: rng_type_t rng_type = HIPRAND_RNG_PSEUDO_XORWOW;
+        // CHECK: rng_type = HIPRAND_RNG_PSEUDO_XORWOW;
+        // CHECK: rng_type = HIPRAND_RNG_PSEUDO_MRG32K3A;
+        // CHECK: rng_type = HIPRAND_RNG_PSEUDO_MTGP32;
+        // CHECK: rng_type = HIPRAND_RNG_PSEUDO_MT19937;
+        // CHECK: rng_type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10;
+        // CHECK: rng_type = HIPRAND_RNG_QUASI_SOBOL32;
+        // CHECK: rng_type = HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL32;
+        // CHECK: rng_type = HIPRAND_RNG_QUASI_SOBOL64;
+        // CHECK: rng_type = HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL64;
+        rng_type_t rng_type = CURAND_RNG_PSEUDO_XORWOW;
+        if (engine == "xorwow")
+            rng_type = CURAND_RNG_PSEUDO_XORWOW;
+        else if (engine == "mrg32k3a")
+            rng_type = CURAND_RNG_PSEUDO_MRG32K3A;
+        else if (engine == "mtgp32")
+            rng_type = CURAND_RNG_PSEUDO_MTGP32;
+        else if (engine == "mt19937")
+            rng_type = CURAND_RNG_PSEUDO_MT19937;
+        else if (engine == "philox")
+            rng_type = CURAND_RNG_PSEUDO_PHILOX4_32_10;
+        else if (engine == "sobol32")
+            rng_type = CURAND_RNG_QUASI_SOBOL32;
+        else if (engine == "scrambled_sobol32")
+            rng_type = CURAND_RNG_QUASI_SCRAMBLED_SOBOL32;
+        else if (engine == "sobol64")
+            rng_type = CURAND_RNG_QUASI_SOBOL64;
+        else if (engine == "scrambled_sobol64")
+            rng_type = CURAND_RNG_QUASI_SCRAMBLED_SOBOL64;
+        else
+        {
+            std::cout << "Wrong engine name" << std::endl;
+            exit(1);
+        }
+
+        std::cout << engine << ":" << std::endl;
+
+        for (auto distribution : distributions)
+        {
+            std::cout << "  " << distribution << ":" << std::endl;
+            run_benchmarks(parser, rng_type, distribution);
+        }
+        std::cout << std::endl;
+    }
+
+    return 0;
+}
diff --git a/tests/hipify-clang/cuRAND/benchmark_curand_kernel.cpp b/tests/hipify-clang/cuRAND/benchmark_curand_kernel.cpp
new file mode 100644
index 0000000000..2c77f25331
--- /dev/null
+++ b/tests/hipify-clang/cuRAND/benchmark_curand_kernel.cpp
@@ -0,0 +1,669 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <string>
+#include <chrono>
+#include <numeric>
+#include <utility>
+#include <type_traits>
+#include <algorithm>
+
+#include "cmdparser.hpp"
+// CHECK: #include <hip/hip_runtime.h>
+#include <cuda_runtime.h>
+// CHECK: #include <hiprand.h>
+#include <curand.h>
+// CHECK: #include <hiprand_kernel.h>
+#include <curand_kernel.h>
+// CHECK-NOT: #include <curand_mtgp32_host.h>
+// CHECK-NOT: #include <curand_mtgp32dc_p_11213.h>
+#include <curand_mtgp32_host.h>
+#include <curand_mtgp32dc_p_11213.h>
+
+// CHECK: hipError_t error = (x);
+// CHECK: if(error!=hipSuccess) {
+#define CUDA_CALL(x) do { \
+    cudaError_t error = (x);\
+    if(error!=cudaSuccess) { \
+    printf("Error %d at %s:%d\n",error,__FILE__,__LINE__);\
+    exit(EXIT_FAILURE);}} while(0)
+#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
+    printf("Error at %s:%d\n",__FILE__,__LINE__);\
+    exit(EXIT_FAILURE);}} while(0)
+
+#ifndef DEFAULT_RAND_N
+const size_t DEFAULT_RAND_N = 1024 * 1024 * 128;
+#endif
+
+size_t next_power2(size_t x)
+{
+    size_t power = 1;
+    while (power < x)
+    {
+        power *= 2;
+    }
+    return power;
+}
+
+template<typename GeneratorState>
+__global__
+void init_kernel(GeneratorState * states,
+                 const unsigned long long seed,
+                 const unsigned long long offset)
+{
+    const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
+    GeneratorState state;
+    // CHECK: hiprand_init(seed, state_id, offset, &state);
+    curand_init(seed, state_id, offset, &state);
+    states[state_id] = state;
+}
+
+template<typename GeneratorState, typename T, typename GenerateFunc, typename Extra>
+__global__
+void generate_kernel(GeneratorState * states,
+                     T * data,
+                     const size_t size,
+                     const GenerateFunc& generate_func,
+                     const Extra extra)
+{
+    const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int stride = gridDim.x * blockDim.x;
+
+    GeneratorState state = states[state_id];
+    unsigned int index = state_id;
+    while(index < size)
+    {
+        data[index] = generate_func(&state, extra);
+        index += stride;
+    }
+    states[state_id] = state;
+}
+
+template<typename GeneratorState>
+struct runner
+{
+    GeneratorState * states;
+
+    runner(const size_t dimensions,
+           const size_t blocks,
+           const size_t threads,
+           const unsigned long long seed,
+           const unsigned long long offset)
+    {
+        const size_t states_size = blocks * threads;
+        // CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(GeneratorState)));
+        CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(GeneratorState)));
+        // CHECK: hipLaunchKernelGGL(init_kernel, dim3(blocks), dim3(threads), 0, 0, states, seed, offset);
+        init_kernel<<<blocks, threads>>>(states, seed, offset);
+        // CHECK: CUDA_CALL(hipPeekAtLastError());
+        // CHECK: CUDA_CALL(hipDeviceSynchronize());
+        CUDA_CALL(cudaPeekAtLastError());
+        CUDA_CALL(cudaDeviceSynchronize());
+    }
+
+    ~runner()
+    {
+        CUDA_CALL(cudaFree(states));
+    }
+
+    template<typename T, typename GenerateFunc, typename Extra>
+    void generate(const size_t blocks,
+                  const size_t threads,
+                  T * data,
+                  const size_t size,
+                  const GenerateFunc& generate_func,
+                  const Extra extra)
+    {
+        // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(blocks), dim3(threads), 0, 0, states, data, size, generate_func, extra);
+        generate_kernel<<<blocks, threads>>>(states, data, size, generate_func, extra);
+    }
+};
+
+// CHECK: void generate_kernel(hiprandStateMtgp32_t * states,
+template<typename T, typename GenerateFunc, typename Extra>
+__global__
+void generate_kernel(curandStateMtgp32_t * states,
+                     T * data,
+                     const size_t size,
+                     const GenerateFunc& generate_func,
+                     const Extra extra)
+{
+    const unsigned int state_id = blockIdx.x;
+    const unsigned int thread_id = threadIdx.x;
+    unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int stride = gridDim.x * blockDim.x;
+    // CHECK: __shared__ hiprandStateMtgp32_t state;
+    __shared__ curandStateMtgp32_t state;
+
+    if (thread_id == 0)
+        state = states[state_id];
+    __syncthreads();
+
+    const size_t r = size%blockDim.x;
+    const size_t size_rounded_up = r == 0 ? size : size + (blockDim.x - r);
+    while(index < size_rounded_up)
+    {
+        auto value = generate_func(&state, extra);
+        if(index < size)
+            data[index] = value;
+        index += stride;
+    }
+    __syncthreads();
+
+    if (thread_id == 0)
+        states[state_id] = state;
+}
+
+// CHECK: struct runner<hiprandStateMtgp32_t>
+template<>
+struct runner<curandStateMtgp32_t>
+{
+    // CHECK: hiprandStateMtgp32_t * states;
+    curandStateMtgp32_t * states;
+    mtgp32_kernel_params_t * d_param;
+
+    runner(const size_t dimensions,
+           const size_t blocks,
+           const size_t threads,
+           const unsigned long long seed,
+           const unsigned long long offset)
+    {
+        const size_t states_size = std::min((size_t)200, blocks);
+        // CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(hiprandStateMtgp32_t)));
+        CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(curandStateMtgp32_t)));
+        // CHECK: CUDA_CALL(hipMalloc((void **)&d_param, sizeof(mtgp32_kernel_params)));
+        CUDA_CALL(cudaMalloc((void **)&d_param, sizeof(mtgp32_kernel_params)));
+        // curandMakeMTGP32Constants is yet unsupported by HIP
+        // CHECK-NOT: CURAND_CALL(hiprandMakeMTGP32Constants(mtgp32dc_params_fast_11213, d_param));
+        CURAND_CALL(curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, d_param));
+        // curandMakeMTGP32KernelState is yet unsupported by HIP
+        // CHECK-NOT: CURAND_CALL(hiprandMakeMTGP32KernelState(states, mtgp32dc_params_fast_11213, d_param, states_size, seed));
+        CURAND_CALL(curandMakeMTGP32KernelState(states, mtgp32dc_params_fast_11213, d_param, states_size, seed));
+    }
+
+    ~runner()
+    {
+        // CHECK: CUDA_CALL(hipFree(states));
+        // CHECK: CUDA_CALL(hipFree(d_param));
+        CUDA_CALL(cudaFree(states));
+        CUDA_CALL(cudaFree(d_param));
+    }
+
+    template<typename T, typename GenerateFunc, typename Extra>
+    void generate(const size_t blocks,
+                  const size_t threads,
+                  T * data,
+                  const size_t size,
+                  const GenerateFunc& generate_func,
+                  const Extra extra)
+    {
+        // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(std::min((size_t)200, blocks)), dim3(256), 0, 0, states, data, size, generate_func, extra);
+        generate_kernel<<<std::min((size_t)200, blocks), 256>>>(states, data, size, generate_func, extra);
+    }
+};
+
+// CHECK: void init_kernel(hiprandStateSobol32_t * states,
+template<typename Directions>
+__global__
+void init_kernel(curandStateSobol32_t * states,
+                 const Directions directions,
+                 const unsigned long long offset)
+{
+    const unsigned int dimension = blockIdx.y;
+    const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
+    // CHECK: hiprandStateSobol32_t state;
+    // CHECK: hiprand_init(directions[dimension], offset + state_id, &state);
+    curandStateSobol32_t state;
+    curand_init(directions[dimension], offset + state_id, &state);
+    states[gridDim.x * blockDim.x * dimension + state_id] = state;
+}
+
+// CHECK: void generate_kernel(hiprandStateSobol32_t * states,
+template<typename T, typename GenerateFunc, typename Extra>
+__global__
+void generate_kernel(curandStateSobol32_t * states,
+                     T * data,
+                     const size_t size,
+                     const GenerateFunc& generate_func,
+                     const Extra extra)
+{
+    const unsigned int dimension = blockIdx.y;
+    const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int stride = gridDim.x * blockDim.x;
+    // CHECK: hiprandStateSobol32_t state = states[gridDim.x * blockDim.x * dimension + state_id];
+    curandStateSobol32_t state = states[gridDim.x * blockDim.x * dimension + state_id];
+    const unsigned int offset = dimension * size;
+    unsigned int index = state_id;
+    while(index < size)
+    {
+        data[offset + index] = generate_func(&state, extra);
+        skipahead(stride - 1, &state);
+        index += stride;
+    }
+    state = states[gridDim.x * blockDim.x * dimension + state_id];
+    skipahead(static_cast<unsigned int>(size), &state);
+    states[gridDim.x * blockDim.x * dimension + state_id] = state;
+}
+
+// CHECK: struct runner<hiprandStateSobol32_t>
+template<>
+struct runner<curandStateSobol32_t>
+{
+    // CHECK: hiprandStateSobol32_t * states;
+    curandStateSobol32_t * states;
+    size_t dimensions;
+
+    runner(const size_t dimensions,
+           const size_t blocks,
+           const size_t threads,
+           const unsigned long long seed,
+           const unsigned long long offset)
+    {
+        this->dimensions = dimensions;
+        // CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(hiprandStateSobol32_t)));
+        const size_t states_size = blocks * threads * dimensions;
+        CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(curandStateSobol32_t)));
+        // CHECK: hiprandDirectionVectors32_t * directions;
+        curandDirectionVectors32_t * directions;
+        // CHECK: const size_t size = dimensions * sizeof(hiprandDirectionVectors32_t);
+        const size_t size = dimensions * sizeof(curandDirectionVectors32_t);
+        // CHECK: CUDA_CALL(hipMalloc((void **)&directions, size));
+        CUDA_CALL(cudaMalloc((void **)&directions, size));
+        // CHECK: hiprandDirectionVectors32_t * h_directions;
+        curandDirectionVectors32_t * h_directions;
+        // hiprandGetDirectionVectors32 and HIPRAND_DIRECTION_VECTORS_32_JOEKUO6 (of hiprandDirectionVectorSet_t) are yet unsupported by HIP
+        // CHECK-NOT: CURAND_CALL(hiprandGetDirectionVectors32(&h_directions, HIPRAND_DIRECTION_VECTORS_32_JOEKUO6));
+        CURAND_CALL(curandGetDirectionVectors32(&h_directions, CURAND_DIRECTION_VECTORS_32_JOEKUO6));
+        // CHECK: CUDA_CALL(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice));
+        CUDA_CALL(cudaMemcpy(directions, h_directions, size, cudaMemcpyHostToDevice));
+
+        const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
+        // CHECK: hipLaunchKernelGGL(init_kernel, dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, directions, offset);
+        init_kernel<<<dim3(blocks_x, dimensions), threads>>>(states, directions, offset);
+        // CHECK: CUDA_CALL(hipPeekAtLastError());
+        // CHECK: CUDA_CALL(hipDeviceSynchronize());
+        CUDA_CALL(cudaPeekAtLastError());
+        CUDA_CALL(cudaDeviceSynchronize());
+        // CHECK: CUDA_CALL(hipFree(directions));
+        CUDA_CALL(cudaFree(directions));
+    }
+
+    ~runner()
+    {
+        // CHECK: CUDA_CALL(hipFree(states));
+        CUDA_CALL(cudaFree(states));
+    }
+
+    template<typename T, typename GenerateFunc, typename Extra>
+    void generate(const size_t blocks,
+                  const size_t threads,
+                  T * data,
+                  const size_t size,
+                  const GenerateFunc& generate_func,
+                  const Extra extra)
+    {
+        const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
+        // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, data, size / dimensions, generate_func, extra);
+        generate_kernel<<<dim3(blocks_x, dimensions), threads>>>(states, data, size / dimensions, generate_func, extra);
+    }
+};
+
+template<typename T, typename GeneratorState, typename GenerateFunc, typename Extra>
+void run_benchmark(const cli::Parser& parser,
+                   const GenerateFunc& generate_func,
+                   const Extra extra)
+{
+    const size_t size = parser.get<size_t>("size");
+    const size_t dimensions = parser.get<size_t>("dimensions");
+    const size_t trials = parser.get<size_t>("trials");
+
+    const size_t blocks = parser.get<size_t>("blocks");
+    const size_t threads = parser.get<size_t>("threads");
+
+    T * data;
+    // CHECK: CUDA_CALL(hipMalloc((void **)&data, size * sizeof(T)));
+    CUDA_CALL(cudaMalloc((void **)&data, size * sizeof(T)));
+
+    runner<GeneratorState> r(dimensions, blocks, threads, 12345ULL, 6789ULL);
+
+    // Warm-up
+    for (size_t i = 0; i < 5; i++)
+    {
+        r.generate(blocks, threads, data, size, generate_func, extra);
+        // CHECK: CUDA_CALL(hipPeekAtLastError());
+        // CHECK: CUDA_CALL(hipDeviceSynchronize());
+        CUDA_CALL(cudaPeekAtLastError());
+        CUDA_CALL(cudaDeviceSynchronize());
+    }
+    // CHECK: CUDA_CALL(hipDeviceSynchronize());
+    CUDA_CALL(cudaDeviceSynchronize());
+
+    // Measurement
+    auto start = std::chrono::high_resolution_clock::now();
+    for (size_t i = 0; i < trials; i++)
+    {
+        r.generate(blocks, threads, data, size, generate_func, extra);
+    }
+    // CHECK: CUDA_CALL(hipPeekAtLastError());
+    // CHECK: CUDA_CALL(hipDeviceSynchronize());
+    CUDA_CALL(cudaPeekAtLastError());
+    CUDA_CALL(cudaDeviceSynchronize());
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double, std::milli> elapsed = end - start;
+
+    std::cout << std::fixed << std::setprecision(3)
+              << "      "
+              << "Throughput = "
+              << std::setw(8) << (trials * size * sizeof(T)) /
+                    (elapsed.count() / 1e3 * (1 << 30))
+              << " GB/s, Samples = "
+              << std::setw(8) << (trials * size) /
+                    (elapsed.count() / 1e3 * (1 << 30))
+              << " GSample/s, AvgTime (1 trial) = "
+              << std::setw(8) << elapsed.count() / trials
+              << " ms, Time (all) = "
+              << std::setw(8) << elapsed.count()
+              << " ms, Size = " << size
+              << std::endl;
+    // CHECK: CUDA_CALL(hipFree(data));
+    CUDA_CALL(cudaFree(data));
+}
+
+template<typename GeneratorState>
+void run_benchmarks(const cli::Parser& parser,
+                    const std::string& distribution)
+{
+    if (distribution == "uniform-uint")
+    {
+        // curandStateSobol64_t and curandStateScrambledSobol64_t are yet unsupported by HIP
+        // CHECK-NOT: if (!std::is_same<GeneratorState, hiprandStateSobol64_t>::value &&
+        // CHECK-NOT: !std::is_same<GeneratorState, hiprandStateScrambledSobol64_t>::value)
+        if (!std::is_same<GeneratorState, curandStateSobol64_t>::value &&
+            !std::is_same<GeneratorState, curandStateScrambledSobol64_t>::value)
+        {
+            run_benchmark<unsigned int, GeneratorState>(parser,
+                [] __device__ (GeneratorState * state, int) {
+                    // CHECK: return hiprand(state);
+                    return curand(state);
+                }, 0
+            );
+        }
+    }
+    if (distribution == "uniform-long-long")
+    {
+        // curandStateSobol64_t and curandStateScrambledSobol64_t are yet unsupported by HIP
+        // CHECK-NOT: if (!std::is_same<GeneratorState, hiprandStateSobol64_t>::value &&
+        // CHECK-NOT: !std::is_same<GeneratorState, hiprandStateScrambledSobol64_t>::value)
+        if (std::is_same<GeneratorState, curandStateSobol64_t>::value ||
+            std::is_same<GeneratorState, curandStateScrambledSobol64_t>::value)
+        {
+            run_benchmark<unsigned long long, GeneratorState>(parser,
+                [] __device__ (GeneratorState * state, int) {
+                    // CHECK: return hiprand(state);
+                    return curand(state);
+                }, 0
+            );
+        }
+    }
+    if (distribution == "uniform-float")
+    {
+        run_benchmark<float, GeneratorState>(parser,
+            [] __device__ (GeneratorState * state, int) {
+                // CHECK: return hiprand_uniform(state);
+                return curand_uniform(state);
+            }, 0
+        );
+    }
+    if (distribution == "uniform-double")
+    {
+        run_benchmark<double, GeneratorState>(parser,
+            [] __device__ (GeneratorState * state, int) {
+                // CHECK: return hiprand_uniform_double(state);
+                return curand_uniform_double(state);
+            }, 0
+        );
+    }
+    if (distribution == "normal-float")
+    {
+        run_benchmark<float, GeneratorState>(parser,
+            [] __device__ (GeneratorState * state, int) {
+                // CHECK: return hiprand_normal(state);
+                return curand_normal(state);
+            }, 0
+        );
+    }
+    if (distribution == "normal-double")
+    {
+        run_benchmark<double, GeneratorState>(parser,
+            [] __device__ (GeneratorState * state, int) {
+                // CHECK: return hiprand_normal_double(state);
+                return curand_normal_double(state);
+            }, 0
+        );
+    }
+    if (distribution == "log-normal-float")
+    {
+        run_benchmark<float, GeneratorState>(parser,
+            [] __device__ (GeneratorState * state, int) {
+                // CHECK: return hiprand_log_normal(state, 0.0f, 1.0f);
+                return curand_log_normal(state, 0.0f, 1.0f);
+            }, 0
+        );
+    }
+    if (distribution == "log-normal-double")
+    {
+        run_benchmark<double, GeneratorState>(parser,
+            [] __device__ (GeneratorState * state, int) {
+                // CHECK: return hiprand_log_normal_double(state, 0.0, 1.0);
+                return curand_log_normal_double(state, 0.0, 1.0);
+            }, 0
+        );
+    }
+    if (distribution == "poisson")
+    {
+        const auto lambdas = parser.get<std::vector<double>>("lambda");
+        for (double lambda : lambdas)
+        {
+            std::cout << "    " << "lambda "
+                 << std::fixed << std::setprecision(1) << lambda << std::endl;
+            run_benchmark<unsigned int, GeneratorState>(parser,
+                [] __device__ (GeneratorState * state, double lambda) {
+                    // CHECK: return hiprand_poisson(state, lambda);
+                    return curand_poisson(state, lambda);
+                }, lambda
+            );
+        }
+    }
+    if (distribution == "discrete-poisson")
+    {
+        const auto lambdas = parser.get<std::vector<double>>("lambda");
+        for (double lambda : lambdas)
+        {
+            std::cout << "    " << "lambda "
+                 << std::fixed << std::setprecision(1) << lambda << std::endl;
+            // CHECK: hiprandDiscreteDistribution_t discrete_distribution;
+            curandDiscreteDistribution_t discrete_distribution;
+            // CHECK: CURAND_CALL(hiprandCreatePoissonDistribution(lambda, &discrete_distribution));
+            CURAND_CALL(curandCreatePoissonDistribution(lambda, &discrete_distribution));
+            run_benchmark<unsigned int, GeneratorState>(parser,
+                // CHECK: [] __device__ (GeneratorState * state, hiprandDiscreteDistribution_t discrete_distribution) {
+                [] __device__ (GeneratorState * state, curandDiscreteDistribution_t discrete_distribution) {
+                    // CHECK: return hiprand_discrete4(state, discrete_distribution);
+                    return curand_discrete(state, discrete_distribution);
+                }, discrete_distribution
+            );
+            // CHECK: CURAND_CALL(hiprandDestroyDistribution(discrete_distribution));
+            CURAND_CALL(curandDestroyDistribution(discrete_distribution));
+        }
+    }
+}
+
+const std::vector<std::string> all_engines = {
+    "xorwow",
+    "mrg32k3a",
+    "mtgp32",
+    // "mt19937",
+    "philox",
+    "sobol32",
+    // "scrambled_sobol32",
+    // "sobol64",
+    // "scrambled_sobol64",
+};
+
+const std::vector<std::string> all_distributions = {
+    "uniform-uint",
+    // "uniform-long-long",
+    "uniform-float",
+    "uniform-double",
+    "normal-float",
+    "normal-double",
+    "log-normal-float",
+    "log-normal-double",
+    "poisson",
+    "discrete-poisson",
+};
+
+int main(int argc, char *argv[])
+{
+    cli::Parser parser(argc, argv);
+
+    const std::string distribution_desc =
+        "space-separated list of distributions:" +
+        std::accumulate(all_distributions.begin(), all_distributions.end(), std::string(),
+            [](std::string a, std::string b) {
+                return a + "\n      " + b;
+            }
+        ) +
+        "\n      or all";
+    const std::string engine_desc =
+        "space-separated list of random number engines:" +
+        std::accumulate(all_engines.begin(), all_engines.end(), std::string(),
+            [](std::string a, std::string b) {
+                return a + "\n      " + b;
+            }
+        ) +
+        "\n      or all";
+
+    parser.set_optional<size_t>("size", "size", DEFAULT_RAND_N, "number of values");
+    parser.set_optional<size_t>("dimensions", "dimensions", 1, "number of dimensions of quasi-random values");
+    parser.set_optional<size_t>("trials", "trials", 20, "number of trials");
+    parser.set_optional<size_t>("blocks", "blocks", 256, "number of blocks");
+    parser.set_optional<size_t>("threads", "threads", 256, "number of threads in each block");
+    parser.set_optional<std::vector<std::string>>("dis", "dis", {"uniform-uint"}, distribution_desc.c_str());
+    parser.set_optional<std::vector<std::string>>("engine", "engine", {"philox"}, engine_desc.c_str());
+    parser.set_optional<std::vector<double>>("lambda", "lambda", {10.0}, "space-separated list of lambdas of Poisson distribution");
+    parser.run_and_exit_if_error();
+
+    std::vector<std::string> engines;
+    {
+        auto es = parser.get<std::vector<std::string>>("engine");
+        if (std::find(es.begin(), es.end(), "all") != es.end())
+        {
+            engines = all_engines;
+        }
+        else
+        {
+            for (auto e : all_engines)
+            {
+                if (std::find(es.begin(), es.end(), e) != es.end())
+                    engines.push_back(e);
+            }
+        }
+    }
+
+    std::vector<std::string> distributions;
+    {
+        auto ds = parser.get<std::vector<std::string>>("dis");
+        if (std::find(ds.begin(), ds.end(), "all") != ds.end())
+        {
+            distributions = all_distributions;
+        }
+        else
+        {
+            for (auto d : all_distributions)
+            {
+                if (std::find(ds.begin(), ds.end(), d) != ds.end())
+                    distributions.push_back(d);
+            }
+        }
+    }
+
+    int version;
+    // CHECK: CURAND_CALL(hiprandGetVersion(&version));
+    CURAND_CALL(curandGetVersion(&version));
+    int runtime_version;
+    // cudaRuntimeGetVersion is yet unsupported by HIP
+    // CHECK-NOT: CUDA_CALL(hipRuntimeGetVersion(&runtime_version));
+    CUDA_CALL(cudaRuntimeGetVersion(&runtime_version));
+    int device_id;
+    // CHECK: CUDA_CALL(hipGetDevice(&device_id));
+    // CHECK: hipDeviceProp_t props;
+    // CHECK: CUDA_CALL(hipGetDeviceProperties(&props, device_id));
+    CUDA_CALL(cudaGetDevice(&device_id));
+    cudaDeviceProp props;
+    CUDA_CALL(cudaGetDeviceProperties(&props, device_id));
+
+    std::cout << "cuRAND: " << version << " ";
+    std::cout << "Runtime: " << runtime_version << " ";
+    std::cout << "Device: " << props.name;
+    std::cout << std::endl << std::endl;
+
+    for (auto engine : engines)
+    {
+        std::cout << engine << ":" << std::endl;
+        for (auto distribution : distributions)
+        {
+            std::cout << "  " << distribution << ":" << std::endl;
+            const std::string plot_name = engine + "-" + distribution;
+            if (engine == "xorwow")
+            {
+                // CHECK: run_benchmarks<hiprandStateXORWOW_t>(parser, distribution);
+                run_benchmarks<curandStateXORWOW_t>(parser, distribution);
+            }
+            else if (engine == "mrg32k3a")
+            {
+                // CHECK: run_benchmarks<hiprandStateMRG32k3a_t>(parser, distribution);
+                run_benchmarks<curandStateMRG32k3a_t>(parser, distribution);
+            }
+            else if (engine == "philox")
+            {
+                // CHECK: run_benchmarks<hiprandStatePhilox4_32_10_t>(parser, distribution);
+                run_benchmarks<curandStatePhilox4_32_10_t>(parser, distribution);
+            }
+            else if (engine == "sobol32")
+            {
+                // CHECK: run_benchmarks<hiprandStateSobol32_t>(parser, distribution);
+                run_benchmarks<curandStateSobol32_t>(parser, distribution);
+            }
+            else if (engine == "mtgp32")
+            {
+                // CHECK: run_benchmarks<hiprandStateMtgp32_t>(parser, distribution);
+                run_benchmarks<curandStateMtgp32_t>(parser, distribution);
+            }
+        }
+    }
+
+    return 0;
+}
diff --git a/tests/hipify-clang/cuRAND/cmdparser.hpp b/tests/hipify-clang/cuRAND/cmdparser.hpp
new file mode 100644
index 0000000000..364612ebb1
--- /dev/null
+++ b/tests/hipify-clang/cuRAND/cmdparser.hpp
@@ -0,0 +1,513 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2015 - 2016 Florian Rappl
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+/*
+  This file is part of the C++ CmdParser utility.
+  Copyright (c) 2015 - 2016 Florian Rappl
+*/
+
+#pragma once
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include <sstream>
+#include <functional>
+
+namespace cli {
+    struct CallbackArgs {
+        const std::vector<std::string>& arguments;
+        std::ostream& output;
+        std::ostream& error;
+    };
+    class Parser {
+    private:
+        class CmdBase {
+        public:
+            explicit CmdBase(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant, bool variadic) :
+                name(name),
+                command(name.size() > 0 ? "-" + name : ""),
+                alternative(alternative.size() > 0 ? "--" + alternative : ""),
+                description(description),
+                required(required),
+                handled(false),
+                arguments({}),
+                dominant(dominant),
+                variadic(variadic) {
+            }
+
+            virtual ~CmdBase() {
+            }
+
+            std::string name;
+            std::string command;
+            std::string alternative;
+            std::string description;
+            bool required;
+            bool handled;
+            std::vector<std::string> arguments;
+            bool const dominant;
+            bool const variadic;
+
+            virtual std::string print_value() const = 0;
+            virtual bool parse(std::ostream& output, std::ostream& error) = 0;
+
+            bool is(const std::string& given) const {
+                return given == command || given == alternative;
+            }
+        };
+
+        template<typename T>
+        struct ArgumentCountChecker
+        {
+            static constexpr bool Variadic = false;
+        };
+
+        template<typename T>
+        struct ArgumentCountChecker<std::vector<T>>
+        {
+            static constexpr bool Variadic = true;
+        };
+
+        template<typename T>
+        class CmdFunction final : public CmdBase {
+        public:
+            explicit CmdFunction(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) :
+                CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker<T>::Variadic) {
+            }
+
+            virtual bool parse(std::ostream& output, std::ostream& error) {
+                try {
+                    CallbackArgs args { arguments, output, error };
+                    value = callback(args);
+                    return true;
+                } catch (...) {
+                    return false;
+                }
+            }
+
+            virtual std::string print_value() const {
+                return "";
+            }
+
+            std::function<T(CallbackArgs&)> callback;
+            T value;
+        };
+
+        template<typename T>
+        class CmdArgument final : public CmdBase {
+        public:
+            explicit CmdArgument(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) :
+                CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker<T>::Variadic) {
+            }
+
+            virtual bool parse(std::ostream&, std::ostream&) {
+                try {
+                    value = Parser::parse(arguments, value);
+                    return true;
+                } catch (...) {
+                    return false;
+                }
+            }
+
+            virtual std::string print_value() const {
+                return stringify(value);
+            }
+
+            T value;
+        };
+
+        static int parse(const std::vector<std::string>& elements, const int&) {
+            if (elements.size() != 1)
+                throw std::bad_cast();
+
+            return std::stoi(elements[0]);
+        }
+
+        static bool parse(const std::vector<std::string>& elements, const bool& defval) {
+            if (elements.size() != 0)
+                throw std::runtime_error("A boolean command line parameter cannot have any arguments.");
+
+            return !defval;
+        }
+
+        static double parse(const std::vector<std::string>& elements, const double&) {
+            if (elements.size() != 1)
+                throw std::bad_cast();
+
+            return std::stod(elements[0]);
+        }
+
+        static float parse(const std::vector<std::string>& elements, const float&) {
+            if (elements.size() != 1)
+                throw std::bad_cast();
+
+            return std::stof(elements[0]);
+        }
+
+        static long double parse(const std::vector<std::string>& elements, const long double&) {
+            if (elements.size() != 1)
+                throw std::bad_cast();
+
+            return std::stold(elements[0]);
+        }
+
+        static unsigned int parse(const std::vector<std::string>& elements, const unsigned int&) {
+            if (elements.size() != 1)
+                throw std::bad_cast();
+
+            return static_cast<unsigned int>(std::stoul(elements[0]));
+        }
+
+        static unsigned long parse(const std::vector<std::string>& elements, const unsigned long&) {
+            if (elements.size() != 1)
+                throw std::bad_cast();
+
+            return std::stoul(elements[0]);
+        }
+
+        static unsigned long long parse(const std::vector<std::string>& elements, const unsigned long long&) {
+          if (elements.size() != 1)
+            throw std::bad_cast();
+
+          return std::stoull(elements[0]);
+        }
+
+        static long parse(const std::vector<std::string>& elements, const long&) {
+            if (elements.size() != 1)
+                throw std::bad_cast();
+
+            return std::stol(elements[0]);
+        }
+
+        static std::string parse(const std::vector<std::string>& elements, const std::string&) {
+            if (elements.size() != 1)
+                throw std::bad_cast();
+
+            return elements[0];
+        }
+
+        template<class T>
+        static std::vector<T> parse(const std::vector<std::string>& elements, const std::vector<T>&) {
+            const T defval = T();
+            std::vector<T> values { };
+            std::vector<std::string> buffer(1);
+
+            for (const auto& element : elements) {
+                buffer[0] = element;
+                values.push_back(parse(buffer, defval));
+            }
+
+            return values;
+        }
+
+        template<class T>
+        static std::string stringify(const T& value) {
+            return std::to_string(value);
+        }
+
+        template<class T>
+        static std::string stringify(const std::vector<T>& values) {
+            std::stringstream ss { };
+            ss << "[ ";
+
+            for (const auto& value : values) {
+                ss << stringify(value) << " ";
+            }
+
+            ss << "]";
+            return ss.str();
+        }
+
+        static std::string stringify(const std::string& str) {
+            return str;
+        }
+
+    public:
+        explicit Parser(int argc, const char** argv) :
+            _appname(argv[0]) {
+            for (int i = 1; i < argc; ++i) {
+                _arguments.push_back(argv[i]);
+            }
+            enable_help();
+        }
+
+        explicit Parser(int argc, char** argv) :
+            _appname(argv[0]) {
+            for (int i = 1; i < argc; ++i) {
+                _arguments.push_back(argv[i]);
+            }
+            enable_help();
+        }
+
+        ~Parser() {
+            for (int i = 0, n = _commands.size(); i < n; ++i) {
+                delete _commands[i];
+            }
+        }
+
+        bool has_help() const {
+            for (const auto command : _commands) {
+                if (command->name == "h" && command->alternative == "--help") {
+                    return true;
+                }
+            }
+
+            return false;
+        }
+
+        void enable_help() {
+            set_callback("h", "help", std::function<bool(CallbackArgs&)>([this](CallbackArgs& args){
+                args.output << this->usage();
+                exit(0);
+                return false;
+            }), "", true);
+        }
+
+        void disable_help() {
+            for (auto command = _commands.begin(); command != _commands.end(); ++command) {
+                if ((*command)->name == "h" && (*command)->alternative == "--help") {
+                    _commands.erase(command);
+                    break;
+                }
+            }
+        }
+
+        template<typename T>
+        void set_default(bool is_required, const std::string& description = "") {
+            auto command = new CmdArgument<T> { "", "", description, is_required, false };
+            _commands.push_back(command);
+        }
+
+        template<typename T>
+        void set_required(const std::string& name, const std::string& alternative, const std::string& description = "", bool dominant = false) {
+            auto command = new CmdArgument<T> { name, alternative, description, true, dominant };
+            _commands.push_back(command);
+        }
+
+        template<typename T>
+        void set_optional(const std::string& name, const std::string& alternative, T defaultValue, const std::string& description = "", bool dominant = false) {
+            auto command = new CmdArgument<T> { name, alternative, description, false, dominant };
+            command->value = defaultValue;
+            _commands.push_back(command);
+        }
+
+        template<typename T>
+        void set_callback(const std::string& name, const std::string& alternative, std::function<T(CallbackArgs&)> callback, const std::string& description = "", bool dominant = false) {
+            auto command = new CmdFunction<T> { name, alternative, description, false, dominant };
+            command->callback = callback;
+            _commands.push_back(command);
+        }
+
+        inline void run_and_exit_if_error() {
+            if (run() == false) {
+                exit(1);
+            }
+        }
+
+        inline bool run() {
+            return run(std::cout, std::cerr);
+        }
+
+        inline bool run(std::ostream& output) {
+            return run(output, std::cerr);
+        }
+
+        bool run(std::ostream& output, std::ostream& error) {
+            if (_arguments.size() > 0) {
+                auto current = find_default();
+
+                for (int i = 0, n = _arguments.size(); i < n; ++i) {
+                    auto isarg = _arguments[i].size() > 0 && _arguments[i][0] == '-';
+                    auto associated = isarg ? find(_arguments[i]) : nullptr;
+
+                    if (associated != nullptr) {
+                        current = associated;
+                        associated->handled = true;
+                    } else if (current == nullptr) {
+                        error << no_default();
+                        return false;
+                    } else {
+                        current->arguments.push_back(_arguments[i]);
+                        current->handled = true;
+                        if (!current->variadic)
+                        {
+                            // If the current command is not variadic, then no more arguments
+                            // should be added to it. In this case, switch back to the default
+                            // command.
+                            current = find_default();
+                        }
+                    }
+                }
+            }
+
+            // First, parse dominant arguments since they succeed even if required
+            // arguments are missing.
+            for (auto command : _commands) {
+                if (command->handled && command->dominant && !command->parse(output, error)) {
+                    error << howto_use(command);
+                    return false;
+                }
+            }
+
+            // Next, check for any missing arguments.
+            for (auto command : _commands) {
+                if (command->required && !command->handled) {
+                    error << howto_required(command);
+                    return false;
+                }
+            }
+
+            // Finally, parse all remaining arguments.
+            for (auto command : _commands) {
+                if (command->handled && !command->dominant && !command->parse(output, error)) {
+                    error << howto_use(command);
+                    return false;
+                }
+            }
+
+            return true;
+        }
+
+        template<typename T>
+        T get(const std::string& name) const {
+            for (const auto& command : _commands) {
+                if (command->name == name) {
+                    auto cmd = dynamic_cast<CmdArgument<T>*>(command);
+
+                    if (cmd == nullptr) {
+                        throw std::runtime_error("Invalid usage of the parameter " + name + " detected.");
+                    }
+
+                    return cmd->value;
+                }
+            }
+
+            throw std::runtime_error("The parameter " + name + " could not be found.");
+        }
+
+        template<typename T>
+        T get_if(const std::string& name, std::function<T(T)> callback) const {
+            auto value = get<T>(name);
+            return callback(value);
+        }
+
+        int requirements() const {
+            int count = 0;
+
+            for (const auto& command : _commands) {
+                if (command->required) {
+                    ++count;
+                }
+            }
+
+            return count;
+        }
+
+        int commands() const {
+            return static_cast<int>(_commands.size());
+        }
+
+        inline const std::string& app_name() const {
+            return _appname;
+        }
+
+    protected:
+        CmdBase* find(const std::string& name) {
+            for (auto command : _commands) {
+                if (command->is(name)) {
+                    return command;
+                }
+            }
+
+            return nullptr;
+        }
+
+        CmdBase* find_default() {
+            for (auto command : _commands) {
+                if (command->name == "") {
+                    return command;
+                }
+            }
+
+            return nullptr;
+        }
+
+        std::string usage() const {
+            std::stringstream ss { };
+            ss << "Available parameters:\n\n";
+
+            for (const auto& command : _commands) {
+                ss << "  " << command->command << "\t" << command->alternative;
+
+                if (command->required == true) {
+                    ss << "\t(required)";
+                }
+
+                ss << "\n   " << command->description;
+
+                if (command->required == false) {
+                    ss << "\n   " << "This parameter is optional. The default value is '" + command->print_value() << "'.";
+                }
+
+                ss << "\n\n";
+            }
+
+            return ss.str();
+        }
+
+        void print_help(std::stringstream& ss) const {
+            if (has_help()) {
+                ss << "For more help use --help or -h.\n";
+            }
+        }
+
+        std::string howto_required(CmdBase* command) const {
+            std::stringstream ss { };
+            ss << "The parameter " << command->name << " is required.\n";
+            ss << command->description << '\n';
+            print_help(ss);
+            return ss.str();
+        }
+
+        std::string howto_use(CmdBase* command) const {
+            std::stringstream ss { };
+            ss << "The parameter " << command->name << " has invalid arguments.\n";
+            ss << command->description << '\n';
+            print_help(ss);
+            return ss.str();
+        }
+
+        std::string no_default() const {
+            std::stringstream ss { };
+            ss << "No default parameter has been specified.\n";
+            ss << "The given argument must be used with a parameter.\n";
+            print_help(ss);
+            return ss.str();
+        }
+
+    private:
+        const std::string _appname;
+        std::vector<std::string> _arguments;
+        std::vector<CmdBase*> _commands;
+    };
+}
diff --git a/tests/hipify-clang/headers_test_09.cu b/tests/hipify-clang/headers_test_09.cu
new file mode 100644
index 0000000000..048ac2e9a9
--- /dev/null
+++ b/tests/hipify-clang/headers_test_09.cu
@@ -0,0 +1,93 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// CHECK: #include <hip/hip_runtime.h>
+// CHECK: #include <memory>
+
+// CHECK-NOT: #include <cuda_runtime.h>
+// CHECK-NOT: #include <hip/hip_runtime.h>
+
+// CHECK: #include "hip/hip_runtime_api.h"
+// CHECK: #include "hip/channel_descriptor.h"
+// CHECK: #include "hip/device_functions.h"
+// CHECK: #include "hip/driver_types.h"
+// CHECK: #include "hip/hip_complex.h"
+// CHECK: #include "hip/hip_fp16.h"
+// CHECK: #include "hip/hip_texture_types.h"
+// CHECK: #include "hip/hip_vector_types.h"
+
+// CHECK: #include <iostream>
+
+// CHECK: #include "hipblas.h"
+// CHECK-NOT: #include "cublas.h"
+
+// CHECK: #include <stdio.h>
+
+// CHECK: #include "hiprand.h"
+// CHECK: #include "hiprand_kernel.h"
+
+// CHECK: #include <algorithm>
+
+// CHECK-NOT: #include "hiprand.h"
+// CHECK-NOT: #include "hiprand_kernel.h"
+// CHECK-NOT: #include "curand_discrete.h"
+// CHECK-NOT: #include "curand_discrete2.h"
+// CHECK-NOT: #include "curand_globals.h"
+// CHECK-NOT: #include "curand_lognormal.h"
+// CHECK-NOT: #include "curand_mrg32k3a.h"
+// CHECK-NOT: #include "curand_mtgp32.h"
+// CHECK-NOT: #include "curand_mtgp32_host.h"
+// CHECK-NOT: #include "curand_mtgp32_kernel.h"
+// CHECK-NOT: #include "curand_mtgp32dc_p_11213.h"
+// CHECK-NOT: #include "curand_normal.h"
+// CHECK-NOT: #include "curand_normal_static.h"
+// CHECK-NOT: #include "curand_philox4x32_x.h"
+// CHECK-NOT: #include "curand_poisson.h"
+// CHECK-NOT: #include "curand_precalc.h"
+// CHECK-NOT: #include "curand_uniform.h"
+
+// CHECK: #include <string>
+
+#include <cuda.h>
+
+#include <memory>
+
+#include <cuda_runtime.h>
+
+#include "cuda_runtime_api.h"
+#include "channel_descriptor.h"
+#include "device_functions.h"
+#include "driver_types.h"
+#include "cuComplex.h"
+#include "cuda_fp16.h"
+#include "cuda_texture_types.h"
+#include "vector_types.h"
+
+#include <iostream>
+
+#include "cublas_v2.h"
+#include "cublas.h"
+
+#include <stdio.h>
+
+#include "curand.h"
+#include "curand_kernel.h"
+
+#include <algorithm>
+
+#include "curand_discrete.h"
+#include "curand_discrete2.h"
+#include "curand_globals.h"
+#include "curand_lognormal.h"
+#include "curand_mrg32k3a.h"
+#include "curand_mtgp32.h"
+#include "curand_mtgp32_host.h"
+#include "curand_mtgp32_kernel.h"
+#include "curand_mtgp32dc_p_11213.h"
+#include "curand_normal.h"
+#include "curand_normal_static.h"
+#include "curand_philox4x32_x.h"
+#include "curand_poisson.h"
+#include "curand_precalc.h"
+#include "curand_uniform.h"
+
+#include <string>
diff --git a/tests/hipify-clang/lit.cfg b/tests/hipify-clang/lit.cfg
index 12b1410cee..104cee8311 100644
--- a/tests/hipify-clang/lit.cfg
+++ b/tests/hipify-clang/lit.cfg
@@ -21,6 +21,8 @@ config.test_format = lit.formats.ShTest()
 # test_source_root: The root path where tests are located.
 config.test_source_root = os.path.dirname(__file__)
 
+config.excludes = ['cmdparser.hpp']
+
 # test_exec_root: The path where tests are located (default is the test suite root).
 #config.test_exec_root = config.test_source_root
 

From 5bce9ea5ef8a916412e80515f6533cc2d2a7085c Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Tue, 30 Jan 2018 12:32:46 +0530
Subject: [PATCH 14/24] Fix hipStreamAddCallback testcase for nvcc

Change-Id: Ieec4b8d7933d8d68394d21d27132da206111efc8
---
 tests/src/runtimeApi/stream/hipStreamAddCallback.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/src/runtimeApi/stream/hipStreamAddCallback.cpp b/tests/src/runtimeApi/stream/hipStreamAddCallback.cpp
index 692d090509..0f9f32a253 100644
--- a/tests/src/runtimeApi/stream/hipStreamAddCallback.cpp
+++ b/tests/src/runtimeApi/stream/hipStreamAddCallback.cpp
@@ -24,6 +24,7 @@ THE SOFTWARE.
  */
 
 #include <stdio.h>
+#include <unistd.h>
 #include "hip/hip_runtime.h"
 #include "test_common.h"
 
@@ -42,16 +43,17 @@ __global__ void vector_square(float *C_d, float *A_d, size_t N)
 }
 
 float *A_h, *C_h;
-size_t N = 1000000;
+bool cbDone = false;
 
 static void HIPRT_CB Callback(hipStream_t stream, hipError_t status, void *userData)
 {
     for (size_t i=0; i<N; i++) {
         if (C_h[i] != A_h[i] * A_h[i]) {
-            HIPCHECK(hipErrorUnknown);
+            warn("Data mismatch %zu", i);
         }
     }
     printf ("PASSED!\n");
+    cbDone = true;
 }
 
 int main(int argc, char *argv[])
@@ -84,4 +86,6 @@ int main(int argc, char *argv[])
 
     HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
     HIPCHECK(hipStreamAddCallback(mystream, Callback, NULL, 0));
+
+    while(!cbDone) sleep(1);
 }

From 24ab820a11210ad9e4c0d29dee87e357b30ed4e2 Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Tue, 30 Jan 2018 18:06:31 +0530
Subject: [PATCH 15/24] Fixed host allocated globals address lookup for host
 usage

Fixed texture driver APIs failure
---
 include/hip/hcc_detail/program_state.hpp | 6 ++----
 src/hip_memory.cpp                       | 1 +
 src/hip_module.cpp                       | 3 +--
 src/program_state.cpp                    | 9 ++++-----
 4 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/include/hip/hcc_detail/program_state.hpp b/include/hip/hcc_detail/program_state.hpp
index 02e2f1e524..f7de214f10 100644
--- a/include/hip/hcc_detail/program_state.hpp
+++ b/include/hip/hcc_detail/program_state.hpp
@@ -69,18 +69,16 @@ namespace hip_impl
         }
     };
 
-    using RAII_global = std::unique_ptr<void, decltype(hsa_amd_memory_unlock)*>;
-
     const std::unordered_map<
         hsa_agent_t, std::vector<hsa_executable_t>>& executables();
     const std::unordered_map<
         std::uintptr_t,
         std::vector<std::pair<hsa_agent_t, Kernel_descriptor>>>& functions();
     const std::unordered_map<std::uintptr_t, std::string>& function_names();
-    std::unordered_map<std::string, RAII_global>& globals();
+    std::unordered_map<std::string, void*>& globals();
 
     hsa_executable_t load_executable(
         const std::string& file,
         hsa_executable_t executable,
         hsa_agent_t agent);
-} // Namespace hip_impl.
\ No newline at end of file
+} // Namespace hip_impl.
diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp
index 77526cf9ac..e1016f4af4 100644
--- a/src/hip_memory.cpp
+++ b/src/hip_memory.cpp
@@ -453,6 +453,7 @@ hipError_t hipArrayCreate ( hipArray** array, const HIP_ARRAY_DESCRIPTOR* pAlloc
         array[0]->width = pAllocateArray->width;
         array[0]->height = pAllocateArray->height;
         array[0]->isDrv = true;
+        array[0]->textureType = hipTextureType2D;
         void ** ptr = &array[0]->data;
 	    if (ctx) {
             const unsigned am_flags = 0;
diff --git a/src/hip_module.cpp b/src/hip_module.cpp
index 45a44b3666..d173a2f295 100644
--- a/src/hip_module.cpp
+++ b/src/hip_module.cpp
@@ -568,7 +568,6 @@ hipError_t hipModuleGetTexRef(
     const auto it = globals().find(name);
     if (it == globals().end()) return ihipLogStatus(hipErrorInvalidValue);
 
-    *texRef = static_cast<textureReference*>(it->second.get());
-
+    *texRef = reinterpret_cast<textureReference*>(it->second);
     return ihipLogStatus(hipSuccess);
 }
diff --git a/src/program_state.cpp b/src/program_state.cpp
index e867887da2..35785dcad5 100644
--- a/src/program_state.cpp
+++ b/src/program_state.cpp
@@ -169,7 +169,7 @@ namespace
             lock_guard<mutex> lck{mtx};
 
             if (globals().find(x) != globals().cend()) return;
-
+            globals().emplace(x, (void*)(it1->second.first));
             void* p = nullptr;
             hsa_amd_memory_lock(
                 reinterpret_cast<void*>(it1->second.first),
@@ -181,7 +181,6 @@ namespace
             hsa_executable_agent_global_variable_define(
                 executable, agent, x.c_str(), p);
 
-            globals().emplace(x, RAII_global{p, hsa_amd_memory_unlock});
         }
     }
 
@@ -462,9 +461,9 @@ namespace hip_impl
         return r;
     }
 
-    unordered_map<string, RAII_global>& globals()
+    unordered_map<string, void*>& globals()
     {
-        static unordered_map<string, RAII_global> r;
+        static unordered_map<string, void*> r;
         static once_flag f;
         call_once(f, []() { r.reserve(symbol_addresses().size()); });
 
@@ -491,4 +490,4 @@ namespace hip_impl
 
         return executable;
     }
-} // Namespace hip_impl.
\ No newline at end of file
+} // Namespace hip_impl.

From 9d814bef31eb3239e98299fb8b2cb836eb671063 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Tue, 30 Jan 2018 18:55:09 +0300
Subject: [PATCH 16/24] [HIPIFY][tests] fix typo

---
 hipify-clang/src/CUDA2HipMap.cpp                      | 2 +-
 tests/hipify-clang/cuRAND/benchmark_curand_kernel.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/hipify-clang/src/CUDA2HipMap.cpp b/hipify-clang/src/CUDA2HipMap.cpp
index 7001deb2e7..35dab03430 100644
--- a/hipify-clang/src/CUDA2HipMap.cpp
+++ b/hipify-clang/src/CUDA2HipMap.cpp
@@ -2872,7 +2872,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_IDENTIFIER_MAP{
     {"curand_uniform2_double",                        {"hiprand_uniform2_double",                        CONV_DEVICE_FUNC, API_RAND}},
     {"curand_uniform4",                               {"hiprand_uniform4",                               CONV_DEVICE_FUNC, API_RAND}},
     {"curand_uniform4_double",                        {"hiprand_uniform4_double",                        CONV_DEVICE_FUNC, API_RAND}},
-    {"curand_discrete",                               {"hiprand_discrete4",                              CONV_DEVICE_FUNC, API_RAND}},
+    {"curand_discrete",                               {"hiprand_discrete",                              CONV_DEVICE_FUNC, API_RAND}},
     {"curand_discrete4",                              {"hiprand_discrete4",                              CONV_DEVICE_FUNC, API_RAND}},
     {"curand_poisson",                                {"hiprand_poisson",                                CONV_DEVICE_FUNC, API_RAND}},
     {"curand_poisson4",                               {"hiprand_poisson4",                               CONV_DEVICE_FUNC, API_RAND}},
diff --git a/tests/hipify-clang/cuRAND/benchmark_curand_kernel.cpp b/tests/hipify-clang/cuRAND/benchmark_curand_kernel.cpp
index 2c77f25331..bdcf512993 100644
--- a/tests/hipify-clang/cuRAND/benchmark_curand_kernel.cpp
+++ b/tests/hipify-clang/cuRAND/benchmark_curand_kernel.cpp
@@ -510,7 +510,7 @@ void run_benchmarks(const cli::Parser& parser,
             run_benchmark<unsigned int, GeneratorState>(parser,
                 // CHECK: [] __device__ (GeneratorState * state, hiprandDiscreteDistribution_t discrete_distribution) {
                 [] __device__ (GeneratorState * state, curandDiscreteDistribution_t discrete_distribution) {
-                    // CHECK: return hiprand_discrete4(state, discrete_distribution);
+                    // CHECK: return hiprand_discrete(state, discrete_distribution);
                     return curand_discrete(state, discrete_distribution);
                 }, discrete_distribution
             );

From 26095fe7c1e3b1ade993fca138d35d84e2b47a86 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Tue, 30 Jan 2018 19:50:18 +0300
Subject: [PATCH 17/24] [HIPIFY][tests] add poisson-api-example test

+ add missing types
+ doc update
---
 docs/markdown/CURAND_API_supported_by_HIP.md  |   2 +
 hipify-clang/src/CUDA2HipMap.cpp              |   4 +-
 .../cuRAND/poisson_api_example.cu             | 417 ++++++++++++++++++
 3 files changed, 422 insertions(+), 1 deletion(-)
 create mode 100644 tests/hipify-clang/cuRAND/poisson_api_example.cu

diff --git a/docs/markdown/CURAND_API_supported_by_HIP.md b/docs/markdown/CURAND_API_supported_by_HIP.md
index aae43af272..c35eeb26b6 100644
--- a/docs/markdown/CURAND_API_supported_by_HIP.md
+++ b/docs/markdown/CURAND_API_supported_by_HIP.md
@@ -101,6 +101,8 @@
 | struct       | `curandStateMRG32k3a_t`                                       | `hiprandStateMRG32k3a_t`                                   |
 | struct       | `curandStatePhilox4_32_10_t`                                  | `hiprandStatePhilox4_32_10_t`                              |
 | struct       | `curandStateXORWOW_t`                                         | `hiprandStateXORWOW_t`                                     |
+| struct       | `curandState_t`                                               | `hiprandState_t`                                           |
+| struct       | `curandState`                                                 | `hiprandState_t`                                           |
 
 ## **2. Host API Functions**
 
diff --git a/hipify-clang/src/CUDA2HipMap.cpp b/hipify-clang/src/CUDA2HipMap.cpp
index 35dab03430..47358802e9 100644
--- a/hipify-clang/src/CUDA2HipMap.cpp
+++ b/hipify-clang/src/CUDA2HipMap.cpp
@@ -362,6 +362,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_TYPE_NAME_MAP{
     {"curandStateMRG32k3a_t",         {"hiprandStateMRG32k3a_t",         CONV_TYPE, API_RAND}},
     {"curandStatePhilox4_32_10_t",    {"hiprandStatePhilox4_32_10_t",    CONV_TYPE, API_RAND}},
     {"curandStateXORWOW_t",           {"hiprandStateXORWOW_t",           CONV_TYPE, API_RAND}},
+    {"curandState_t",                 {"hiprandState_t",                 CONV_TYPE, API_RAND}},
+    {"curandState",                   {"hiprandState_t",                 CONV_TYPE, API_RAND}},
 };
 
 /// Maps cuda header names to hip header names.
@@ -2872,7 +2874,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_IDENTIFIER_MAP{
     {"curand_uniform2_double",                        {"hiprand_uniform2_double",                        CONV_DEVICE_FUNC, API_RAND}},
     {"curand_uniform4",                               {"hiprand_uniform4",                               CONV_DEVICE_FUNC, API_RAND}},
     {"curand_uniform4_double",                        {"hiprand_uniform4_double",                        CONV_DEVICE_FUNC, API_RAND}},
-    {"curand_discrete",                               {"hiprand_discrete",                              CONV_DEVICE_FUNC, API_RAND}},
+    {"curand_discrete",                               {"hiprand_discrete",                               CONV_DEVICE_FUNC, API_RAND}},
     {"curand_discrete4",                              {"hiprand_discrete4",                              CONV_DEVICE_FUNC, API_RAND}},
     {"curand_poisson",                                {"hiprand_poisson",                                CONV_DEVICE_FUNC, API_RAND}},
     {"curand_poisson4",                               {"hiprand_poisson4",                               CONV_DEVICE_FUNC, API_RAND}},
diff --git a/tests/hipify-clang/cuRAND/poisson_api_example.cu b/tests/hipify-clang/cuRAND/poisson_api_example.cu
new file mode 100644
index 0000000000..d4cfd90e1f
--- /dev/null
+++ b/tests/hipify-clang/cuRAND/poisson_api_example.cu
@@ -0,0 +1,417 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// Taken from: http://docs.nvidia.com/cuda/curand/device-api-overview.html#poisson-api-example
+/*
+ * This program uses CURAND library for Poisson distribution
+ * to simulate queues in store for 16 hours. It shows the
+ * difference of using 3 different APIs:
+ * - HOST API -arrival of customers is described by Poisson(4)
+ * - SIMPLE DEVICE API -arrival of customers is described by
+ *     Poisson(4*(sin(x/100)+1)), where x is number of minutes
+ *     from store opening time.
+ * - ROBUST DEVICE API -arrival of customers is described by:
+ *     - Poisson(2) for first 3 hours.
+ *     - Poisson(1) for second 3 hours.
+ *     - Poisson(3) after 6 hours.  
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+// CHECK: #include <hip/hip_runtime.h>
+#include <cuda.h>
+// CHECK: #include <hiprand_kernel.h>
+#include <curand_kernel.h>
+// CHECK: #include <hiprand.h>
+#include <curand.h>
+
+// CHECK: #define CUDA_CALL(x) do { if((x) != hipSuccess) {
+#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
+    printf("Error at %s:%d\n",__FILE__,__LINE__); \
+    return EXIT_FAILURE;}} while(0)
+// CHECK: #define CURAND_CALL(x) do { if((x)!=HIPRAND_STATUS_SUCCESS) {
+#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
+    printf("Error at %s:%d\n",__FILE__,__LINE__);\
+    return EXIT_FAILURE;}} while(0)
+
+
+#define HOURS 16
+#define OPENING_HOUR 7
+#define CLOSING_HOUR (OPENING_HOUR + HOURS)
+
+#define access_2D(type, ptr, row, column, pitch)\
+    *((type*)((char*)ptr + (row) * pitch) + column)
+
+enum API_TYPE {
+    HOST_API = 0,
+    SIMPLE_DEVICE_API = 1,
+    ROBUST_DEVICE_API = 2,
+};
+
+/* global variables */
+API_TYPE api;
+int report_break;
+int cashiers_load_h[HOURS];
+__constant__ int cashiers_load[HOURS];
+// CHECK: __global__ void setup_kernel(hiprandState_t *state)
+__global__ void setup_kernel(curandState *state)
+{
+    int id = threadIdx.x + blockIdx.x * blockDim.x;
+    /* Each thread gets same seed, a different sequence 
+       number, no offset */
+    // CHECK: hiprand_init(1234, id, 0, &state[id]);
+    curand_init(1234, id, 0, &state[id]);
+}
+
+__inline__ __device__
+void update_queue(int id, int min, unsigned int new_customers,
+                  unsigned int &queue_length,
+                  unsigned int *queue_lengths, size_t pitch)
+{
+    int balance;
+    balance = new_customers - 2 * cashiers_load[(min-1)/60];
+    if (balance + (int)queue_length <= 0){
+        queue_length = 0;
+    }else{
+        queue_length += balance;
+    }
+    /* Store results */
+    access_2D(unsigned int, queue_lengths, min-1, id, pitch)
+        = queue_length;
+}
+
+// CHECK: __global__ void simple_device_API_kernel(hiprandState_t *state,
+__global__ void simple_device_API_kernel(curandState *state, 
+                    unsigned int *queue_lengths, size_t pitch)
+{
+    int id = threadIdx.x + blockIdx.x * blockDim.x;
+    unsigned int new_customers;
+    unsigned int queue_length = 0;
+    /* Copy state to local memory for efficiency */
+    // CHECK: hiprandState_t localState = state[id];
+    curandState localState = state[id];
+    /* Simulate queue in time */
+    for(int min = 1; min <= 60 * HOURS; min++) {
+        /* Draw number of new customers depending on API */
+        // CHECK: new_customers = hiprand_poisson(&localState,
+        new_customers = curand_poisson(&localState,
+                                4*(sin((float)min/100.0)+1));
+        /* Update queue */
+        update_queue(id, min, new_customers, queue_length,
+                     queue_lengths, pitch);       
+    }
+    /* Copy state back to global memory */
+    state[id] = localState;
+}
+
+
+__global__ void host_API_kernel(unsigned int *poisson_numbers,
+                    unsigned int *queue_lengths, size_t pitch)
+{
+    int id = threadIdx.x + blockIdx.x * blockDim.x;
+    unsigned int new_customers;
+    unsigned int queue_length = 0;
+    /* Simulate queue in time */
+    for(int min = 1; min <= 60 * HOURS; min++) {
+        /* Get random number from global memory */
+        new_customers = poisson_numbers
+                    [blockDim.x * gridDim.x * (min -1) + id];
+        /* Update queue */
+        update_queue(id, min, new_customers, queue_length,
+                     queue_lengths, pitch);
+    }
+}
+// CHECK: __global__ void robust_device_API_kernel(hiprandState_t *state,
+// CHECK: hiprandDiscreteDistribution_t poisson_1,
+// CHECK: hiprandDiscreteDistribution_t poisson_2,
+// CHECK: hiprandDiscreteDistribution_t poisson_3,
+__global__ void robust_device_API_kernel(curandState *state,
+                   curandDiscreteDistribution_t poisson_1,
+                   curandDiscreteDistribution_t poisson_2,
+                   curandDiscreteDistribution_t poisson_3,
+                   unsigned int *queue_lengths, size_t pitch)
+{
+    int id = threadIdx.x + blockIdx.x * 64;
+    unsigned int new_customers;
+    unsigned int queue_length = 0;
+    /* Copy state to local memory for efficiency */
+    // CHECK: hiprandState_t localState = state[id];
+    curandState localState = state[id];
+    /* Simulate queue in time */
+    /* first 3 hours */
+    for(int min = 1; min <= 60 * 3; min++) {
+        /* draw number of new customers depending on API */
+        new_customers =
+                    // CHECK: hiprand_discrete(&localState, poisson_2);
+                    curand_discrete(&localState, poisson_2);
+        /* Update queue */
+        update_queue(id, min, new_customers, queue_length,
+                                    queue_lengths, pitch);
+    }
+    /* second 3 hours */
+    for(int min = 60 * 3 + 1; min <= 60 * 6; min++) {
+        /* draw number of new customers depending on API */
+        new_customers =
+                    // CHECK: hiprand_discrete(&localState, poisson_1);
+                    curand_discrete(&localState, poisson_1);
+        /* Update queue */
+        update_queue(id, min, new_customers, queue_length,
+                                    queue_lengths, pitch);       
+    }
+    /* after 6 hours */
+    for(int min = 60 * 6 + 1; min <= 60 * HOURS; min++) {
+        /* draw number of new customers depending on API */
+        new_customers =
+                    // CHECK: hiprand_discrete(&localState, poisson_3);
+                    curand_discrete(&localState, poisson_3);
+        /* Update queue */
+        update_queue(id, min, new_customers, queue_length,
+                                    queue_lengths, pitch);       
+    }
+    /* Copy state back to global memory */
+    state[id] = localState;
+}
+
+/* Set time intervals between reports */
+void report_settings()
+{
+    do{
+        printf("Set time intervals between queue reports");
+        printf("(in minutes > 0)\n");
+        if (scanf("%d", &report_break) == 0) continue;
+    }while(report_break <= 0);
+}
+
+
+/* Set number of cashiers each hour */
+void add_cachiers(int *cashiers_load)
+{
+    int i, min, max, begin, end;
+    printf("Cashier serves 2 customers per minute...\n");
+    for (i = 0; i < HOURS; i++){
+        cashiers_load_h[i] = 0;
+    }
+    while (true){
+        printf("Adding cashier...\n");
+        min = OPENING_HOUR;
+        max = CLOSING_HOUR-1;
+        do{
+            printf("Set hour that cahier comes (%d-%d)",
+                                                min, max);
+            printf(" [type 0 to finish adding cashiers]\n");
+            if (scanf("%d", &begin) == 0) continue;
+        }while (begin > max || (begin < min && begin != 0));
+        if (begin == 0) break;
+        min = begin+1;
+        max = CLOSING_HOUR;
+        do{
+            printf("Set hour that cahier leaves (%d-%d)",
+                                                min, max);
+            printf(" [type 0 to finish adding cashiers]\n");
+            if (scanf("%d", &end) == 0) continue;
+        }while (end > max || (end < min && end != 0));
+        if (end == 0) break;
+        for (i = begin - OPENING_HOUR;
+             i < end   - OPENING_HOUR; i++){
+            cashiers_load_h[i]++;
+        }
+    }
+    for (i = OPENING_HOUR; i < CLOSING_HOUR; i++){
+        printf("\n%2d:00 - %2d:00     %d cashier",
+                i, i+1, cashiers_load_h[i-OPENING_HOUR]);
+        if (cashiers_load[i-OPENING_HOUR] != 1) printf("s");
+    }
+    printf("\n");
+}
+
+/* Set API type */
+API_TYPE set_API_type()
+{
+    printf("Choose API type:\n");
+    int choose;
+    do{
+        printf("type 1 for HOST API\n");
+        printf("type 2 for SIMPLE DEVICE API\n");
+        printf("type 3 for ROBUST DEVICE API\n");
+        if (scanf("%d", &choose) == 0) continue;
+    }while( choose < 1 || choose > 3);
+    switch(choose){
+        case 1: return HOST_API;
+        case 2: return SIMPLE_DEVICE_API;
+        case 3: return ROBUST_DEVICE_API;
+        default:
+            fprintf(stderr, "wrong API\n");
+            return HOST_API;
+    }
+}
+
+void settings()
+{
+    add_cachiers(cashiers_load);
+    // CHECK: hipMemcpyToSymbol("cashiers_load", cashiers_load_h,
+    // CHECK: HOURS * sizeof(int), 0, hipMemcpyHostToDevice);
+    cudaMemcpyToSymbol("cashiers_load", cashiers_load_h,
+            HOURS * sizeof(int), 0, cudaMemcpyHostToDevice);
+    report_settings();
+    api = set_API_type();
+}
+
+void print_statistics(unsigned int *hostResults, size_t pitch)
+{
+    int min, i, hour, minute;
+    unsigned int sum;
+    for(min = report_break; min <= 60 * HOURS;
+                            min += report_break) {
+        sum = 0;
+        for(i = 0; i < 64 * 64; i++) {
+            sum += access_2D(unsigned int, hostResults,
+                                        min-1, i, pitch);
+        }
+        hour = OPENING_HOUR + min/60;
+        minute = min%60;
+        printf("%2d:%02d   # of waiting customers = %10.4g |",
+                    hour, minute, (float)sum/(64.0 * 64.0));
+        printf("  # of cashiers = %d  |  ",
+                    cashiers_load_h[(min-1)/60]);
+        printf("# of new customers/min ~= ");
+        switch (api){
+            case HOST_API:
+                printf("%2.2f\n", 4.0);
+                break;
+            case SIMPLE_DEVICE_API:
+                printf("%2.2f\n",
+                            4*(sin((float)min/100.0)+1));
+                break;
+            case ROBUST_DEVICE_API:
+                if (min <= 3 * 60){
+                    printf("%2.2f\n", 2.0);
+                }else{
+                    if (min <= 6 * 60){
+                        printf("%2.2f\n", 1.0);
+                    }else{
+                        printf("%2.2f\n", 3.0);
+                    }
+                }
+                break;
+            default:
+                fprintf(stderr, "Wrong API\n");
+        }
+    }
+}
+
+
+int main(int argc, char *argv[])
+{
+    int n;
+    size_t pitch;
+    // CHECK: hiprandState_t *devStates;
+    curandState *devStates;
+    unsigned int *devResults, *hostResults;
+    unsigned int *poisson_numbers_d;
+    // CHECK: hiprandDiscreteDistribution_t poisson_1, poisson_2;
+    // CHECK: hiprandDiscreteDistribution_t poisson_3;
+    // CHECK: hiprandGenerator_t gen;
+    curandDiscreteDistribution_t poisson_1, poisson_2;
+    curandDiscreteDistribution_t poisson_3;
+    curandGenerator_t gen;
+
+    /* Setting cashiers, report and API */
+    settings();
+
+    /* Allocate space for results on device */
+    // CHECK: CUDA_CALL(hipMallocPitch((void **)&devResults, &pitch,
+    CUDA_CALL(cudaMallocPitch((void **)&devResults, &pitch,
+                64 * 64 * sizeof(unsigned int), 60 * HOURS));
+
+    /* Allocate space for results on host */
+    hostResults = (unsigned int *)calloc(pitch * 60 * HOURS,
+                sizeof(unsigned int));
+
+    /* Allocate space for prng states on device */
+    // CHECK: CUDA_CALL(hipMalloc((void **)&devStates, 64 * 64 *
+    // CHECK: sizeof(hiprandState_t)));
+    CUDA_CALL(cudaMalloc((void **)&devStates, 64 * 64 * 
+              sizeof(curandState)));
+
+    /* Setup prng states */
+    if (api != HOST_API){
+        // CHECK: hipLaunchKernelGGL(setup_kernel, dim3(64), dim3(64), 0, 0, devStates);
+        setup_kernel<<<64, 64>>>(devStates);
+    }
+    /* Simulate queue  */
+    switch (api){
+        case HOST_API:
+            /* Create pseudo-random number generator */
+            // CHECK: CURAND_CALL(hiprandCreateGenerator(&gen,
+                                // CHECK: HIPRAND_RNG_PSEUDO_DEFAULT));
+            CURAND_CALL(curandCreateGenerator(&gen,
+                                CURAND_RNG_PSEUDO_DEFAULT));
+            /* Set seed */
+            // CHECK: CURAND_CALL(hiprandSetPseudoRandomGeneratorSeed(
+            CURAND_CALL(curandSetPseudoRandomGeneratorSeed(
+                                            gen, 1234ULL));
+            /* compute n */
+            n = 64 * 64 * HOURS * 60;
+            /* Allocate n unsigned ints on device */
+            // CHECK: CUDA_CALL(hipMalloc((void **)&poisson_numbers_d,
+            CUDA_CALL(cudaMalloc((void **)&poisson_numbers_d,
+                                n * sizeof(unsigned int)));
+            /* Generate n unsigned ints on device */
+            // CHECK: CURAND_CALL(hiprandGeneratePoisson(gen,
+            CURAND_CALL(curandGeneratePoisson(gen,
+                                poisson_numbers_d, n, 4.0));
+            // CHECK: hipLaunchKernelGGL(host_API_kernel, dim3(64), dim3(64), 0, 0, poisson_numbers_d,
+            host_API_kernel<<<64, 64>>>(poisson_numbers_d,
+                                        devResults, pitch);
+            /* Cleanup */
+            // CHECK: CURAND_CALL(hiprandDestroyGenerator(gen));
+            CURAND_CALL(curandDestroyGenerator(gen));
+            break;
+        case SIMPLE_DEVICE_API:
+            // CHECK: hipLaunchKernelGGL(simple_device_API_kernel, dim3(64), dim3(64), 0, 0, devStates,
+            simple_device_API_kernel<<<64, 64>>>(devStates,
+                                        devResults, pitch);
+            break;
+        case ROBUST_DEVICE_API:
+            /* Create histograms for Poisson(1) */
+            // CHECK: CURAND_CALL(hiprandCreatePoissonDistribution(1.0,
+            CURAND_CALL(curandCreatePoissonDistribution(1.0,
+                                                &poisson_1));
+            /* Create histograms for Poisson(2) */
+            // CHECK: CURAND_CALL(hiprandCreatePoissonDistribution(2.0,
+            CURAND_CALL(curandCreatePoissonDistribution(2.0,
+                                                &poisson_2));
+            /* Create histograms for Poisson(3) */
+            // CHECK: CURAND_CALL(hiprandCreatePoissonDistribution(3.0,
+            CURAND_CALL(curandCreatePoissonDistribution(3.0,
+                                                &poisson_3));
+            // CHECK: hipLaunchKernelGGL(robust_device_API_kernel, dim3(64), dim3(64), 0, 0, devStates,
+            robust_device_API_kernel<<<64, 64>>>(devStates,
+                            poisson_1, poisson_2, poisson_3,
+                            devResults, pitch);
+            /* Cleanup */
+            // CHECK: CURAND_CALL(hiprandDestroyDistribution(poisson_1));
+            // CHECK: CURAND_CALL(hiprandDestroyDistribution(poisson_2));
+            // CHECK: CURAND_CALL(hiprandDestroyDistribution(poisson_3));
+            CURAND_CALL(curandDestroyDistribution(poisson_1));
+            CURAND_CALL(curandDestroyDistribution(poisson_2));
+            CURAND_CALL(curandDestroyDistribution(poisson_3));
+            break;
+        default:
+            fprintf(stderr, "Wrong API\n");
+    }
+    /* Copy device memory to host */
+    // CHECK: CUDA_CALL(hipMemcpy2D(hostResults, pitch, devResults,
+    // CHECK: 60 * HOURS, hipMemcpyDeviceToHost));
+    CUDA_CALL(cudaMemcpy2D(hostResults, pitch, devResults,
+                pitch, 64 * 64 * sizeof(unsigned int),
+                60 * HOURS, cudaMemcpyDeviceToHost));
+    /* Show result */
+    print_statistics(hostResults, pitch);
+    /* Cleanup */
+    // CHECK: CUDA_CALL(hipFree(devStates));
+    // CHECK: CUDA_CALL(hipFree(devResults));
+    CUDA_CALL(cudaFree(devStates));
+    CUDA_CALL(cudaFree(devResults));
+    free(hostResults);
+    return EXIT_SUCCESS;
+}

From e0b91d32dd40acbceea570b6b871e0c2ba8ae496 Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Wed, 31 Jan 2018 10:48:17 +0530
Subject: [PATCH 18/24] hipStreamAddCallback.cpp: Replace unistd sleep with
 sleep_for

Change-Id: I7a5d40a1acd8be76a0f175bcfa731ad89fb88d81
---
 tests/src/runtimeApi/stream/hipStreamAddCallback.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/src/runtimeApi/stream/hipStreamAddCallback.cpp b/tests/src/runtimeApi/stream/hipStreamAddCallback.cpp
index 0f9f32a253..02912e14c6 100644
--- a/tests/src/runtimeApi/stream/hipStreamAddCallback.cpp
+++ b/tests/src/runtimeApi/stream/hipStreamAddCallback.cpp
@@ -18,13 +18,14 @@ THE SOFTWARE.
 */
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
  * RUN: %t
  * HIT_END
  */
 
 #include <stdio.h>
-#include <unistd.h>
+#include <thread>
+#include <chrono>
 #include "hip/hip_runtime.h"
 #include "test_common.h"
 
@@ -87,5 +88,6 @@ int main(int argc, char *argv[])
     HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
     HIPCHECK(hipStreamAddCallback(mystream, Callback, NULL, 0));
 
-    while(!cbDone) sleep(1);
+    while(!cbDone)
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
 }

From 3c058379d84de2806ebc99ea8d39bd9c06b6a478 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Thu, 1 Feb 2018 17:07:48 +0300
Subject: [PATCH 19/24] [HIPIFY][tests] Add intro.cu test

---
 tests/hipify-clang/intro.cu | 174 ++++++++++++++++++++++++++++++++++++
 1 file changed, 174 insertions(+)
 create mode 100644 tests/hipify-clang/intro.cu

diff --git a/tests/hipify-clang/intro.cu b/tests/hipify-clang/intro.cu
new file mode 100644
index 0000000000..4b9c5c0da7
--- /dev/null
+++ b/tests/hipify-clang/intro.cu
@@ -0,0 +1,174 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+// CHECK: #include <hip/hip_runtime.h>
+#include <cuda.h>
+
+#define K_THREADS 64
+#define K_INDEX() ((gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x + threadIdx.x)
+#define RND() ((rand() & 0x7FFF) / float(0x8000))
+#define ERRORCHECK() cErrorCheck(__FILE__, __LINE__)
+
+// CHECK: hipEvent_t t##_start, t##_end;            \
+// CHECK: hipEventCreate(&t##_start);               \
+// CHECK: hipEventCreate(&t##_end);
+#define TIMER_CREATE(t)                      \
+  cudaEvent_t t##_start, t##_end;            \
+  cudaEventCreate(&t##_start);               \
+  cudaEventCreate(&t##_end);
+
+// CHECK: hipEventRecord(t##_start);                   \
+// CHECK: hipEventSynchronize(t##_start);
+#define TIMER_START(t)                          \
+  cudaEventRecord(t##_start);                   \
+  cudaEventSynchronize(t##_start);              \
+
+// CHECK: hipEventRecord(t##_start);                                 \
+// CHECK: hipEventSynchronize(t##_start);                            \
+// CHECK: hipEventRecord(t##_end);                                   \
+// CHECK: hipEventSynchronize(t##_end);                              \
+// CHECK: hipEventElapsedTime(&t, t##_start, t##_end);
+#define TIMER_END(t)                                          \
+  cudaEventRecord(t##_start);                                 \
+  cudaEventSynchronize(t##_start);                            \
+  cudaEventRecord(t##_end);                                   \
+  cudaEventSynchronize(t##_end);                              \
+  cudaEventElapsedTime(&t, t##_start, t##_end);               
+
+
+inline void cErrorCheck(const char *file, int line) {
+// CHECK: hipDeviceSynchronize();
+// CHECK: hipError_t err = hipGetLastError();
+// CHECK: if (err != hipSuccess) {
+// CHECK: printf("Error: %s\n", hipGetErrorString(err));
+  cudaThreadSynchronize();
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("Error: %s\n", cudaGetErrorString(err));
+    printf(" @ %s: %d\n", file, line);
+    exit(-1);
+  }
+}
+
+inline dim3 K_GRID(int n, int threads = K_THREADS) {
+  int blocks = (int)ceilf(sqrtf((float)n/threads));
+  dim3 grid(blocks, blocks);
+  return grid;
+}
+
+typedef struct data  {
+  int n;
+  float4 *r, *v, *f;
+} data;
+
+data cpu, gpu;
+
+#define N 20
+
+__global__ void repulsion(data gpu);
+__global__ void integration(data gpu);
+
+
+int main() {
+  printf("Cuda Test 1\n");
+
+  int count = 0;
+  // CHECK: hipGetDeviceCount(&count);
+  cudaGetDeviceCount(&count);
+  printf(" %d CUDA devices found\n", count);
+  if(!count) {
+    ::exit(EXIT_FAILURE);
+  }
+  // CHECK: hipFree(0);
+  cudaFree(0);
+
+  cpu.n = N;
+
+  cpu.r = (float4*)malloc(N * sizeof(float4));
+  cpu.v = (float4*)malloc(N * sizeof(float4));
+  cpu.f = (float4*)malloc(N * sizeof(float4));
+
+  for(int i = 0; i < N; ++i) {
+    cpu.v[i] = make_float4(0,0,0,0);
+    cpu.r[i] = make_float4(RND(), RND(), RND(), 0);
+    cpu.f[i] = make_float4(0,0.01,0,0);
+  }
+
+  gpu = cpu;
+  // CHECK: hipMalloc(&gpu.r, N * sizeof(float4));
+  // CHECK: hipMalloc(&gpu.v, N * sizeof(float4));
+  // CHECK: hipMalloc(&gpu.f, N * sizeof(float4));
+  cudaMalloc(&gpu.r, N * sizeof(float4));
+  cudaMalloc(&gpu.v, N * sizeof(float4));
+  cudaMalloc(&gpu.f, N * sizeof(float4));
+  // CHECK: hipMemcpy(gpu.r, cpu.r, cpu.n * sizeof(float4), hipMemcpyHostToDevice);
+  // CHECK: hipMemcpy(gpu.v, cpu.v, cpu.n * sizeof(float4), hipMemcpyHostToDevice);
+  // CHECK: hipMemcpy(gpu.f, cpu.f, cpu.n * sizeof(float4), hipMemcpyHostToDevice);
+  cudaMemcpy(gpu.r, cpu.r, cpu.n * sizeof(float4), cudaMemcpyHostToDevice);
+  cudaMemcpy(gpu.v, cpu.v, cpu.n * sizeof(float4), cudaMemcpyHostToDevice);
+  cudaMemcpy(gpu.f, cpu.f, cpu.n * sizeof(float4), cudaMemcpyHostToDevice);
+
+  ERRORCHECK();
+  float rep;
+  TIMER_CREATE(rep);
+  TIMER_START(rep);
+  // CHECK: hipLaunchKernelGGL(integration, dim3(K_GRID(cpu.n)), dim3(K_THREADS), 0, 0, gpu);
+  integration <<< K_GRID(cpu.n), K_THREADS >>>(gpu);
+
+  TIMER_END(rep);
+  printf("Took: %f ms\n", rep);
+  ERRORCHECK();
+  // CHECK: hipMemcpy(cpu.r, gpu.r, cpu.n * sizeof(float4), hipMemcpyDeviceToHost);
+  // CHECK: hipMemcpy(cpu.v, gpu.v, cpu.n * sizeof(float4), hipMemcpyDeviceToHost);
+  // CHECK: hipMemcpy(cpu.f, gpu.f, cpu.n * sizeof(float4), hipMemcpyDeviceToHost);
+  cudaMemcpy(cpu.r, gpu.r, cpu.n * sizeof(float4), cudaMemcpyDeviceToHost);
+  cudaMemcpy(cpu.v, gpu.v, cpu.n * sizeof(float4), cudaMemcpyDeviceToHost);
+  cudaMemcpy(cpu.f, gpu.f, cpu.n * sizeof(float4), cudaMemcpyDeviceToHost);
+  // CHECK: hipHostFree(cpu.r);
+  // CHECK: hipHostFree(cpu.v);
+  // CHECK: hipHostFree(cpu.f);
+  cudaFreeHost(cpu.r);
+  cudaFreeHost(cpu.v);
+  cudaFreeHost(cpu.f);
+  // CHECK: hipFree(gpu.r);
+  // CHECK: hipFree(gpu.v);
+  // CHECK: hipFree(gpu.f);
+  cudaFree(gpu.r);
+  cudaFree(gpu.v);
+  cudaFree(gpu.f);
+  // CHECK: hipDeviceReset();
+  cudaDeviceReset();
+
+  printf("Results: \n");
+  for(int i = 0; i < N; ++i) {
+    printf("%f, %f, %f \n", cpu.r[i].x, cpu.r[i].y, cpu.r[i].z);
+  }
+
+  printf("Ready...\n");
+  return 0;
+}
+
+__global__ void repulsion(data gpu) {
+  int idx = K_INDEX();
+  if(idx < N) {
+    gpu.r[idx].x = 1;
+    gpu.r[idx].y = 1;
+    gpu.r[idx].z = 1;
+  }
+}
+
+#define MULT4(v, s) v.x *= s; v.y *= s; v.z *= s; v.w *= s;
+#define ADD4(v1, v2) v1.x += v2.x; v1.y += v2.y; v1.z += v2.z; v1.w += v2.w;
+
+__global__ void integration(data gpu) {
+  int i = K_INDEX();
+  if(i < N) {
+    MULT4(gpu.f[i], 0.01);
+    MULT4(gpu.v[i], 0.01);
+    ADD4(gpu.v[i], gpu.f[i]);
+    ADD4(gpu.r[i], gpu.v[i]);
+    gpu.f[i] = make_float4(0,0,0,0);
+  }
+}
\ No newline at end of file

From d254875c429dd693e06feb736c27346bbeac385b Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Thu, 1 Feb 2018 17:36:45 +0300
Subject: [PATCH 20/24] [HIPIFY][tests] add new line at the end of file

---
 tests/hipify-clang/intro.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/hipify-clang/intro.cu b/tests/hipify-clang/intro.cu
index 4b9c5c0da7..da797eb2ec 100644
--- a/tests/hipify-clang/intro.cu
+++ b/tests/hipify-clang/intro.cu
@@ -171,4 +171,4 @@ __global__ void integration(data gpu) {
     ADD4(gpu.r[i], gpu.v[i]);
     gpu.f[i] = make_float4(0,0,0,0);
   }
-}
\ No newline at end of file
+}

From 06771d20ba4b4627997de830624031e8611345d4 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Thu, 1 Feb 2018 18:34:16 +0300
Subject: [PATCH 21/24] [HIPIFY][tests] Add vec_add.cu test

---
 tests/hipify-clang/vec_add.cu | 90 +++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 tests/hipify-clang/vec_add.cu

diff --git a/tests/hipify-clang/vec_add.cu b/tests/hipify-clang/vec_add.cu
new file mode 100644
index 0000000000..ec813e8bad
--- /dev/null
+++ b/tests/hipify-clang/vec_add.cu
@@ -0,0 +1,90 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// Kernel definition
+__global__ void  vecAdd(float* A, float* B, float* C)
+{
+  int i = threadIdx.x;
+  A[i] = 0;
+  B[i] = i;
+  C[i] = A[i] + B[i];
+}
+// CHECK: #include <hip/hip_runtime.h>
+#include <stdio.h>
+#define SIZE 10
+#define KERNELINVOKES  5000000
+int vecadd(int  gpudevice, int rank)
+{
+  int devcheck(int, int);
+  devcheck(gpudevice, rank);
+  float A[SIZE], B[SIZE], C[SIZE];
+  // Kernel invocation
+  float *devPtrA;
+  float *devPtrB;
+  float *devPtrC;
+  int memsize = SIZE * sizeof(float);
+  // CHECK: hipMalloc((void**)&devPtrA, memsize);
+  // CHECK: hipMalloc((void**)&devPtrB, memsize);
+  // CHECK: hipMalloc((void**)&devPtrC, memsize);
+  cudaMalloc((void**)&devPtrA, memsize);
+  cudaMalloc((void**)&devPtrB, memsize);
+  cudaMalloc((void**)&devPtrC, memsize);
+  // CHECK: hipMemcpy(devPtrA, A, memsize, hipMemcpyHostToDevice);
+  // CHECK: hipMemcpy(devPtrB, B, memsize, hipMemcpyHostToDevice);
+  cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice);
+  cudaMemcpy(devPtrB, B, memsize, cudaMemcpyHostToDevice);
+  for (int i = 0; i<KERNELINVOKES; i++)
+  {
+    // CHECK: hipLaunchKernelGGL(vecAdd, dim3(1), dim3(gpudevice), 0, 0, devPtrA, devPtrB, devPtrC);
+    vecAdd <<< 1, gpudevice >>>(devPtrA, devPtrB, devPtrC);
+  }
+  // CHECK: hipMemcpy(C, devPtrC, memsize, hipMemcpyDeviceToHost);
+  cudaMemcpy(C, devPtrC, memsize, cudaMemcpyDeviceToHost);
+  // calculate only up to gpudevice to show the unique output
+  // of each rank's kernel launch
+  for (int i = 0; i<gpudevice; i++)
+    printf("rank %d: C[%d]=%f\n", rank, i, C[i]);
+  // CHECK: hipFree(devPtrA);
+  // CHECK: hipFree(devPtrA);
+  // CHECK: hipFree(devPtrA);
+  cudaFree(devPtrA);
+  cudaFree(devPtrA);
+  cudaFree(devPtrA);
+}
+int devcheck(int  gpudevice, int rank)
+{
+  int device_count = 0;
+  int device;   // used with cudaGetDevice() to verify cudaSetDevice()
+  // CHECK: hipGetDeviceCount(&device_count);
+  cudaGetDeviceCount(&device_count);
+  if (gpudevice >= device_count)
+  {
+    printf("gpudevice >=  device_count ... exiting\n");
+    exit(1);
+  }
+  // CHECK: hipError_t cudareturn;
+  // CHECK: hipDeviceProp_t deviceProp;
+  // CHECK: hipGetDeviceProperties(&deviceProp, gpudevice);
+  cudaError_t cudareturn;
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, gpudevice);
+  // CHECK: if (deviceProp.hipWarpSize <= 1)
+  if (deviceProp.warpSize <= 1)
+  {
+    printf("rank %d: warning, CUDA Device Emulation (CPU) detected, exiting\n", rank);
+    exit(1);
+  }
+  // CHECK: cudareturn = hipSetDevice(gpudevice);
+  cudareturn = cudaSetDevice(gpudevice);
+  // CHECK: if (cudareturn == hipErrorInvalidDevice)
+  if (cudareturn == cudaErrorInvalidDevice)
+  {
+    // CHECK: perror("hipSetDevice returned hipErrorInvalidDevice");
+    perror("cudaSetDevice returned cudaErrorInvalidDevice");
+  }
+  else
+  {
+    // CHECK: hipGetDevice(&device);
+    cudaGetDevice(&device);
+    printf("rank %d: cudaGetDevice()=%d\n", rank, device);
+  }
+}

From b0efa41d31c6e12284fe1f16b7516a8f1bc171b0 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Thu, 1 Feb 2018 19:41:36 +0300
Subject: [PATCH 22/24] [HIPIFY][tests] Add coalescing.cu test

---
 tests/hipify-clang/coalescing.cu | 117 +++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 tests/hipify-clang/coalescing.cu

diff --git a/tests/hipify-clang/coalescing.cu b/tests/hipify-clang/coalescing.cu
new file mode 100644
index 0000000000..4c04289044
--- /dev/null
+++ b/tests/hipify-clang/coalescing.cu
@@ -0,0 +1,117 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// To measure effects of memory coalescing. Coalescing.cu
+// B. Wilkinson Jan 30, 2011
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+// CHECK: #include <hip/hip_runtime.h>
+#include <cuda.h>
+
+#define BlockSize 16  // Size of blocks, 32 x 32 threads, fixed, used globally
+
+__global__ void gpu_Comput (int *h, int N, int T) {
+
+// Array loaded with global thread ID that acesses that location
+
+	int col = threadIdx.x + blockDim.x * blockIdx.x; 
+	int row = threadIdx.y + blockDim.y * blockIdx.y;
+
+	int threadID = col + row * N;
+	int index = row + col * N;  // sequentially down each row
+
+	for (int t = 0; t < T; t++)  // loop to repeat to reduce other time effects
+	  h[index] = threadID;  // load array with flattened global thread ID
+}
+
+void printArray(int *h, int N) {
+
+	printf("Results of computation, every N/8 numbers, eight numbers\n");
+
+	for (int row = 0; row < N; row += N/8) {
+	  for (int col = 0; col < N; col += N/8)
+	  printf("%6d  ", h[col + row * N]);
+	  printf("\n");
+	}
+}
+
+int main(int argc, char *argv[])  {
+
+	int T = 100;  // number of iterations, entered at keyboard
+	int B = 1;  // number of blocks, entered at keyboard
+	char key;
+
+	int *h, *dev_h;  // ptr to array holding numbers on host and device
+  // CHECK: hipEvent_t start, stop;
+	cudaEvent_t start, stop;  // cuda events to measure time
+	float elapsed_time_ms1;
+  // CHECK: hipEventCreate( &start );
+  // CHECK: hipEventCreate( &stop );
+	cudaEventCreate( &start );
+	cudaEventCreate( &stop );
+
+/* ------------------------- Keyboard input -----------------------------------*/
+
+do {  // loop to repeat complete program
+
+	printf("Grid Structure 2-D grid, 2-D blocks\n");
+	printf("Blocks fixed at 16 x 16 threads, 512 threads, max for compute cap. 1.x\n");
+	printf("Enter number of blocks in grid, each dimension, currently %d\n",B);
+	scanf("%d",&B);
+	printf("Enter number of iterations, currently %d\n",T);
+	scanf("%d",&T);
+
+	int N = B * BlockSize;  // size of data array, given input data
+
+	printf("Array size (and total grid-block size) %d x %d\n", N, N);
+
+	dim3 Block(BlockSize, BlockSize);  //Block structure, 32 x 32 max
+	dim3 Grid(B, B);  //Grid structure, B x B
+
+/* ------------------------- Allocate Memory-----------------------------------*/
+
+	int size = N * N * sizeof(int);  // number of bytes in total in array
+	h = (int*) malloc(size);  // Array on host
+  // CHECK: hipMalloc((void**)&dev_h, size);
+	cudaMalloc((void**)&dev_h, size);  // allocate device memory
+
+/* ------------------------- GPU Computation -----------------------------------*/
+
+  // CHECK: hipEventRecord( start, 0 );
+	cudaEventRecord( start, 0 );
+  // CHECK: hipLaunchKernelGGL(gpu_Comput, dim3(Grid), dim3(Block), 0, 0, dev_h, N, T);
+	gpu_Comput<<< Grid, Block >>>(dev_h, N, T);
+  // CHECK: hipEventRecord( stop, 0 );
+  // CHECK: hipEventSynchronize( stop );
+  // CHECK: hipEventElapsedTime( &elapsed_time_ms1, start, stop );
+	cudaEventRecord( stop, 0 );  // instrument code to measue end time
+	cudaEventSynchronize( stop );  // wait for all work done by threads
+	cudaEventElapsedTime( &elapsed_time_ms1, start, stop );
+  // CHECK: hipMemcpy(h,dev_h, size ,hipMemcpyDeviceToHost);
+	cudaMemcpy(h,dev_h, size ,cudaMemcpyDeviceToHost);  //Get results to check
+
+	printArray(h,N);
+	printf("\nTime to calculate results on GPU: %f ms.\n", elapsed_time_ms1);
+
+/* -------------------------REPEAT PROGRAM INPUT-----------------------------------*/
+
+	printf("\nEnter c to repeat, return to terminate\n");
+
+	scanf("%c",&key);
+	scanf("%c",&key);
+
+} while (key == 'c');  // loop of complete program
+
+/* --------------  clean up  ---------------------------------------*/
+
+free(h);
+  // CHECK: hipFree(dev_h);
+	cudaFree(dev_h);
+  // CHECK: hipEventDestroy(start);
+  // CHECK: hipEventDestroy(stop);
+	cudaEventDestroy(start);
+	cudaEventDestroy(stop);
+
+	return 0;
+}

From b1ce616a024eb4ff8689cfc27e13bab5062484d2 Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Tue, 6 Feb 2018 14:29:04 +0530
Subject: [PATCH 23/24] Update the programming guide with environemnt variables
 names and default threshold values used.

---
 docs/markdown/hip_programming_guide.md | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/docs/markdown/hip_programming_guide.md b/docs/markdown/hip_programming_guide.md
index 5d0c1f2497..9313eb22e1 100644
--- a/docs/markdown/hip_programming_guide.md
+++ b/docs/markdown/hip_programming_guide.md
@@ -54,7 +54,7 @@ A stronger system-level fence can be specified when the event is created with hi
 - HIP/ROCm also supports the ability to cache host memory in the GPU using the "Non-Coherent" host memory allocations. This can provide performance benefit, but care must be taken to use the correct synchronization.
 
 
-## Unpinned Memory Transfer Optimizations
+## Unpinned Memory Transfer Optimization
 Please note that this document lists possible ways for experimenting with HIP stack to gain performance. Performance may vary from platform to platform.
  
 ### On Small BAR Setup
@@ -79,11 +79,20 @@ stage the copy through an optimized pinned staging buffer, to implement H2D and
 
 PinInPlace is another algorithm which pins the host memory "in-place", and copies it with the DMA engine.  
 
-By default staging buffers are used for unpinned memory transfers. Environment variables allow control over the unpinned copy algorithm and parameters:
+Unpinned memory transfer mode can be controlled using environment variable HCC_UNPINNED_COPY_MODE. 
 
-- HIP_PININPLACE - This environment variable forces the use of PinInPlace logic for all unpinned memory copies
+By default HCC_UNPINNED_COPY_MODE is set to 0, which uses default threshold values to decide which transfer way to use based on data size.
 
-- HIP_OPTIMAL_MEM_TRANSFER- This environment variable enables a hybrid memory copy logic based on thresholds. These thresholds can be managed with following environment variables:
-  -   HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE - Threshold in bytes for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic.
-  -   HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING - Threshold in bytes for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic.
-  -   HIP_D2H_MEM_TRANSFER_THRESHOLD - Threshold in bytes for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic.
+Setting HCC_UNPINNED_COPY_MODE = 1, forces all unpinned transfer to use PinInPlace logic.
+
+Setting HCC_UNPINNED_COPY_MODE = 2, forces all unpinned transfer to use Staging buffers.
+
+Setting HCC_UNPINNED_COPY_MODE = 3, forces all unpinned transfer to use direct memcpy on large BAR systems.
+ 
+Following environment variables can be used to control the transfer thresholds:
+
+-   HCC_H2D_STAGING_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. By default it is set to 64.
+   
+-   HCC_H2D_PININPLACE_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic. By default it is set to 4096.
+
+-   HCC_D2H_PININPLACE_THRESHOLD  - Threshold in KB for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic. By default it is set to 1024.

From 7f3e89bbb062ea3ee6a83f9b56332ea4fdcea0df Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Thu, 8 Feb 2018 00:08:47 +0300
Subject: [PATCH 24/24] [HIPIFY][fix] Build against llvm 3.8.0 fixed - missing
 include

/srv/HIP/hipify-clang/src/main.cpp:134:19: error: no member named 'dbgs' in namespace 'llvm'
      DEBUG(llvm::dbgs() << "Skipped some replacements.\n");
            ~~~~~~^
---
 hipify-clang/src/main.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hipify-clang/src/main.cpp b/hipify-clang/src/main.cpp
index 0cc3594466..ccf627b147 100644
--- a/hipify-clang/src/main.cpp
+++ b/hipify-clang/src/main.cpp
@@ -36,6 +36,7 @@ THE SOFTWARE.
 #include "LLVMCompat.h"
 #include "HipifyAction.h"
 #include "ArgParse.h"
+#include "llvm/Support/Debug.h"
 
 #define DEBUG_TYPE "cuda2hip"