Re-sync with upstream.

2018-02-12 20:20:24 +00:00
@@ -151,7 +151,7 @@ def docker_build_inside_image( def build_image, String inside_args, String platf
      // The rm command needs to run as sudo because the test steps below create files owned by root
      sh  """#!/usr/bin/env bash
          set -x
-          sudo rm -rf ${build_dir_rel}
+          rm -rf ${build_dir_rel}
          mkdir -p ${build_dir_rel}
          cd ${build_dir_rel}
          cmake -DCMAKE_BUILD_TYPE=${build_config} -DCMAKE_INSTALL_PREFIX=staging ${optional_configure} ${source_hip_abs}
@@ -160,6 +160,7 @@ def docker_build_inside_image( def build_image, String inside_args, String platf
    }

    // Cap the maximum amount of testing, in case of hangs
+    // Excluding hipPrintfKernel test from automation; variable fails on CI test machines
    timeout(time: 1, unit: 'HOURS')
    {
      stage("${platform} unit testing")
@@ -169,7 +170,7 @@ def docker_build_inside_image( def build_image, String inside_args, String platf
            cd ${build_dir_rel}
            make install -j\$(nproc)
            make build_tests -i -j\$(nproc)
-            make test
+            ctest -E hipPrintfKernel
          """
        // If unit tests output a junit or xunit file in the future, jenkins can parse that file
        // to display test results on the dashboard
@@ -193,7 +194,7 @@ def docker_build_inside_image( def build_image, String inside_args, String platf
        if( platform.toLowerCase( ).startsWith( 'hcc-ctu' ) )
        {
          archiveArtifacts artifacts: "${build_dir_rel}/*.deb", fingerprint: true
-          archiveArtifacts artifacts: "${build_dir_rel}/*.rpm", fingerprint: true
+          // archiveArtifacts artifacts: "${build_dir_rel}/*.rpm", fingerprint: true
        }
      }
    }
@@ -101,6 +101,8 @@
 | struct       | `curandStateMRG32k3a_t`                                       | `hiprandStateMRG32k3a_t`                                   |
 | struct       | `curandStatePhilox4_32_10_t`                                  | `hiprandStatePhilox4_32_10_t`                              |
 | struct       | `curandStateXORWOW_t`                                         | `hiprandStateXORWOW_t`                                     |
+| struct       | `curandState_t`                                               | `hiprandState_t`                                           |
+| struct       | `curandState`                                                 | `hiprandState_t`                                           |

 ## **2. Host API Functions**

@@ -154,6 +156,8 @@
 | `curand_normal2_double`                                   | `hiprand_normal2_double`                   |
 | `curand_normal4`                                          | `hiprand_normal4`                          |
 | `curand_normal4_double`                                   | `hiprand_normal4_double`                   |
+| `curand_uniform`                                          | `hiprand_uniform`                          |
+| `curand_uniform_double`                                   | `hiprand_uniform_double`                   |
 | `curand_uniform2_double`                                  | `hiprand_uniform2_double`                  |
 | `curand_uniform4`                                         | `hiprand_uniform4`                         |
 | `curand_uniform4_double`                                  | `hiprand_uniform4_double`                  |
@@ -74,7 +74,7 @@ HIP provides 3 environment variables in the HIP_*_BLOCKING family.  These introd
 - HIP_API_BLOCKING : Forces hipMemcpyAsync and hipMemsetAsync to be host-synchronous, meaning they will wait for the requested operation to complete before returning to the caller.

 These options cause HCC to serialize.  Useful if you have libraries or code which is calling HCC kernels directly rather than using HIP.  
- HCC_SERIALZIE_KERNELS : 0x1=pre-serialize before each kernel launch, 0x2=post-serialize after each kernel launch., 0x3= pre- and post- serialize.
+- HCC_SERIALIZE_KERNEL : 0x1=pre-serialize before each kernel launch, 0x2=post-serialize after each kernel launch., 0x3= pre- and post- serialize.
 - HCC_SERIALIZE_COPY    : 0x1=pre-serialize before each async copy, 0x2=post-serialize after each async copy., 0x3= pre- and post- serialize.

 - HSA_ENABLE_SDMA=0     : Causes host-to-device and device-to-host copies to use compute shader blit kernels rather than the dedicated DMA copy engines.  Compute shader copies have low latency (typically < 5us) and can achieve approximately 80% of the bandwidth of the DMA copy engine.  This flag is useful to isolate issues with the hardware copy engines.
@@ -54,7 +54,7 @@ A stronger system-level fence can be specified when the event is created with hi
 - HIP/ROCm also supports the ability to cache host memory in the GPU using the "Non-Coherent" host memory allocations. This can provide performance benefit, but care must be taken to use the correct synchronization.


-## Unpinned Memory Transfer Optimizations
+## Unpinned Memory Transfer Optimization
 Please note that this document lists possible ways for experimenting with HIP stack to gain performance. Performance may vary from platform to platform.
 
 ### On Small BAR Setup
@@ -79,11 +79,20 @@ stage the copy through an optimized pinned staging buffer, to implement H2D and

 PinInPlace is another algorithm which pins the host memory "in-place", and copies it with the DMA engine.  

-By default staging buffers are used for unpinned memory transfers. Environment variables allow control over the unpinned copy algorithm and parameters:
+Unpinned memory transfer mode can be controlled using environment variable HCC_UNPINNED_COPY_MODE. 

- HIP_PININPLACE - This environment variable forces the use of PinInPlace logic for all unpinned memory copies
+By default HCC_UNPINNED_COPY_MODE is set to 0, which uses default threshold values to decide which transfer way to use based on data size.

- HIP_OPTIMAL_MEM_TRANSFER- This environment variable enables a hybrid memory copy logic based on thresholds. These thresholds can be managed with following environment variables:
-  -   HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE - Threshold in bytes for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic.
-  -   HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING - Threshold in bytes for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic.
-  -   HIP_D2H_MEM_TRANSFER_THRESHOLD - Threshold in bytes for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic.
+Setting HCC_UNPINNED_COPY_MODE = 1, forces all unpinned transfer to use PinInPlace logic.
+
+Setting HCC_UNPINNED_COPY_MODE = 2, forces all unpinned transfer to use Staging buffers.
+
+Setting HCC_UNPINNED_COPY_MODE = 3, forces all unpinned transfer to use direct memcpy on large BAR systems.
+ 
+Following environment variables can be used to control the transfer thresholds:
+
+-   HCC_H2D_STAGING_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. By default it is set to 64.
+   
+-   HCC_H2D_PININPLACE_THRESHOLD - Threshold in KB for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic. By default it is set to 4096.
+
+-   HCC_D2H_PININPLACE_THRESHOLD  - Threshold in KB for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic. By default it is set to 1024.
@@ -362,29 +362,46 @@ const std::map<llvm::StringRef, hipCounter> CUDA_TYPE_NAME_MAP{
    {"curandStateMRG32k3a_t",         {"hiprandStateMRG32k3a_t",         CONV_TYPE, API_RAND}},
    {"curandStatePhilox4_32_10_t",    {"hiprandStatePhilox4_32_10_t",    CONV_TYPE, API_RAND}},
    {"curandStateXORWOW_t",           {"hiprandStateXORWOW_t",           CONV_TYPE, API_RAND}},
+    {"curandState_t",                 {"hiprandState_t",                 CONV_TYPE, API_RAND}},
+    {"curandState",                   {"hiprandState_t",                 CONV_TYPE, API_RAND}},
 };

 /// Maps cuda header names to hip header names.
 const std::map <llvm::StringRef, hipCounter> CUDA_INCLUDE_MAP{
    // CUDA includes
-    {"cuda.h",               {"hip/hip_runtime.h",        CONV_INCLUDE_CUDA_MAIN_H, API_DRIVER}},
-    {"cuda_runtime.h",       {"hip/hip_runtime.h",        CONV_INCLUDE_CUDA_MAIN_H, API_RUNTIME}},
-    {"cuda_runtime_api.h",   {"hip/hip_runtime_api.h",    CONV_INCLUDE,             API_RUNTIME}},
-    {"channel_descriptor.h", {"hip/channel_descriptor.h", CONV_INCLUDE,             API_RUNTIME}},
-    {"device_functions.h",   {"hip/device_functions.h",   CONV_INCLUDE,             API_RUNTIME}},
-    {"driver_types.h",       {"hip/driver_types.h",       CONV_INCLUDE,             API_RUNTIME}},
-    {"cuComplex.h",          {"hip/hip_complex.h",        CONV_INCLUDE,             API_RUNTIME}},
-    {"cuda_fp16.h",          {"hip/hip_fp16.h",           CONV_INCLUDE,             API_RUNTIME}},
-    {"cuda_texture_types.h", {"hip/hip_texture_types.h",  CONV_INCLUDE,             API_RUNTIME}},
-    {"vector_types.h",       {"hip/hip_vector_types.h",   CONV_INCLUDE,             API_RUNTIME}},
+    {"cuda.h",                    {"hip/hip_runtime.h",        CONV_INCLUDE_CUDA_MAIN_H, API_DRIVER}},
+    {"cuda_runtime.h",            {"hip/hip_runtime.h",        CONV_INCLUDE_CUDA_MAIN_H, API_RUNTIME}},
+    {"cuda_runtime_api.h",        {"hip/hip_runtime_api.h",    CONV_INCLUDE,             API_RUNTIME}},
+    {"channel_descriptor.h",      {"hip/channel_descriptor.h", CONV_INCLUDE,             API_RUNTIME}},
+    {"device_functions.h",        {"hip/device_functions.h",   CONV_INCLUDE,             API_RUNTIME}},
+    {"driver_types.h",            {"hip/driver_types.h",       CONV_INCLUDE,             API_RUNTIME}},
+    {"cuComplex.h",               {"hip/hip_complex.h",        CONV_INCLUDE,             API_RUNTIME}},
+    {"cuda_fp16.h",               {"hip/hip_fp16.h",           CONV_INCLUDE,             API_RUNTIME}},
+    {"cuda_texture_types.h",      {"hip/hip_texture_types.h",  CONV_INCLUDE,             API_RUNTIME}},
+    {"vector_types.h",            {"hip/hip_vector_types.h",   CONV_INCLUDE,             API_RUNTIME}},

    // CUBLAS includes
-    {"cublas.h",    {"hipblas.h", CONV_INCLUDE, API_BLAS}},
-    {"cublas_v2.h", {"hipblas.h", CONV_INCLUDE, API_BLAS}},
+    {"cublas.h",                  {"hipblas.h",                CONV_INCLUDE_CUDA_MAIN_H, API_BLAS}},
+    {"cublas_v2.h",               {"hipblas.h",                CONV_INCLUDE_CUDA_MAIN_H, API_BLAS}},

    // CURAND includes
-    {"curand.h",             {"hiprand.h",        CONV_INCLUDE, API_RAND}},
-    {"curand_kernel.h",      {"hiprand_kernel.h", CONV_INCLUDE, API_RAND}},
+    {"curand.h",                  {"hiprand.h",                CONV_INCLUDE_CUDA_MAIN_H, API_RAND}},
+    {"curand_kernel.h",           {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_discrete.h",         {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_discrete2.h",        {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_globals.h",          {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_lognormal.h",        {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_mrg32k3a.h",         {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_mtgp32.h",           {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_mtgp32_host.h",      {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_mtgp32_kernel.h",    {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_mtgp32dc_p_11213.h", {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_normal.h",           {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_normal_static.h",    {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_philox4x32_x.h",     {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_poisson.h",          {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_precalc.h",          {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},
+    {"curand_uniform.h",          {"hiprand_kernel.h",         CONV_INCLUDE,             API_RAND}},

    // HIP includes
    // TODO: uncomment this when hip/cudacommon.h will be renamed to hip/hipcommon.h
@@ -2852,10 +2869,12 @@ const std::map<llvm::StringRef, hipCounter> CUDA_IDENTIFIER_MAP{
    {"curand_normal2_double",                         {"hiprand_normal2_double",                         CONV_DEVICE_FUNC, API_RAND}},
    {"curand_normal4",                                {"hiprand_normal4",                                CONV_DEVICE_FUNC, API_RAND}},
    {"curand_normal4_double",                         {"hiprand_normal4_double",                         CONV_DEVICE_FUNC, API_RAND}},
+    {"curand_uniform",                                {"hiprand_uniform",                                CONV_DEVICE_FUNC, API_RAND}},
+    {"curand_uniform_double",                         {"hiprand_uniform_double",                         CONV_DEVICE_FUNC, API_RAND}},
    {"curand_uniform2_double",                        {"hiprand_uniform2_double",                        CONV_DEVICE_FUNC, API_RAND}},
    {"curand_uniform4",                               {"hiprand_uniform4",                               CONV_DEVICE_FUNC, API_RAND}},
    {"curand_uniform4_double",                        {"hiprand_uniform4_double",                        CONV_DEVICE_FUNC, API_RAND}},
-    {"curand_discrete",                               {"hiprand_discrete4",                              CONV_DEVICE_FUNC, API_RAND}},
+    {"curand_discrete",                               {"hiprand_discrete",                               CONV_DEVICE_FUNC, API_RAND}},
    {"curand_discrete4",                              {"hiprand_discrete4",                              CONV_DEVICE_FUNC, API_RAND}},
    {"curand_poisson",                                {"hiprand_poisson",                                CONV_DEVICE_FUNC, API_RAND}},
    {"curand_poisson4",                               {"hiprand_poisson4",                               CONV_DEVICE_FUNC, API_RAND}},
@@ -137,6 +137,48 @@ std::string stringifyZeroDefaultedArg(clang::SourceManager& SM, const clang::Exp

 } // anonymous namespace

+bool HipifyAction::Exclude(const hipCounter & hipToken) {
+    switch (hipToken.type) {
+        case CONV_INCLUDE_CUDA_MAIN_H:
+            switch (hipToken.apiType) {
+                case API_DRIVER:
+                case API_RUNTIME:
+                    if (insertedRuntimeHeader) { return true; }
+                    insertedRuntimeHeader = true;
+                    return false;
+                case API_BLAS:
+                  if (insertedBLASHeader) { return true; }
+                    insertedBLASHeader = true;
+                    return false;
+                case API_RAND:
+                    if (hipToken.hipName == "hiprand_kernel.h") {
+                        if (insertedRAND_kernelHeader) { return true; }
+                        insertedRAND_kernelHeader = true;
+                        return false;
+                    } else if (hipToken.hipName == "hiprand.h") {
+                        if (insertedRANDHeader) { return true; }
+                        insertedRANDHeader = true;
+                        return false;
+                    }
+                default:
+                    return false;
+            }
+            return false;
+        case CONV_INCLUDE:
+            switch (hipToken.apiType) {
+                case API_RAND:
+                    if (insertedRAND_kernelHeader) { return true; }
+                    insertedRAND_kernelHeader = true;
+                    return false;
+                default:
+                    return false;
+            }
+            return false;
+        default:
+            return false;
+    }
+    return false;
+}

 void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
                                      const clang::Token&,
@@ -149,24 +191,17 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
    if (!SM.isWrittenInMainFile(hash_loc)) {
        return;
    }
+    if (!firstHeader) {
+        firstHeader = true;
+        firstHeaderLoc = hash_loc;
+    }

    const auto found = CUDA_INCLUDE_MAP.find(file_name);
    if (found == CUDA_INCLUDE_MAP.end()) {
-        if (!firstNotMainHeader) {
-            firstNotMainHeader = true;
-            firstNotMainHeaderLoc = hash_loc;
-        }
        return;
    }

-    // Special-casing to avoid duplication of the hip_runtime include.
-    bool secondMainInclude = false;
-    if (found->second.hipName == "hip/hip_runtime.h") {
-        if (insertedRuntimeHeader) {
-            secondMainInclude = true;
-        }
-        insertedRuntimeHeader = true;
-    }
+    bool exclude = Exclude(found->second);

    Statistics::current().incrementCounter(found->second, file_name.str());

@@ -180,7 +215,7 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
    clang::StringRef newInclude;

    // Keep the same include type that the user gave.
-    if (!secondMainInclude) {
+    if (!exclude) {
        clang::SmallString<128> includeBuffer;
        if (is_angled) {
            newInclude = llvm::Twine("<" + found->second.hipName + ">").toStringRef(includeBuffer);
@@ -198,8 +233,13 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
 }

 void HipifyAction::PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) {
-    if (pragmaOnce) { return; }
+    if (pragmaOnce) {
+        return;
+    }
    clang::SourceManager& SM = getCompilerInstance().getSourceManager();
+    if (!SM.isWrittenInMainFile(Loc)) {
+        return;
+    }
    clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
    const clang::Token tok = PP.LookAhead(0);
    StringRef Text(SM.getCharacterData(tok.getLocation()), tok.getLength());
@@ -356,8 +396,8 @@ void HipifyAction::EndSourceFileAction() {
        clang::SourceLocation sl;
        if (pragmaOnce) {
            sl = pragmaOnceLoc;
-        } else if (firstNotMainHeader) {
-            sl = firstNotMainHeaderLoc;
+        } else if (firstHeader) {
+            sl = firstHeaderLoc;
        } else {
            sl = SM.getLocForStartOfFile(SM.getMainFileID());
        }
@@ -6,6 +6,7 @@
 #include "clang/Tooling/Core/Replacement.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "ReplacementsFrontendActionFactory.h"
+#include "Statistics.h"

 namespace ct = clang::tooling;

@@ -23,9 +24,12 @@ private:
    // not, we insert it at the top of the file when we finish processing it.
    // This approach means we do the best it's possible to do w.r.t preserving the user's include order.
    bool insertedRuntimeHeader = false;
-    bool firstNotMainHeader = false;
+    bool insertedBLASHeader = false;
+    bool insertedRANDHeader = false;
+    bool insertedRAND_kernelHeader = false;
+    bool firstHeader = false;
    bool pragmaOnce = false;
-    clang::SourceLocation firstNotMainHeaderLoc;
+    clang::SourceLocation firstHeaderLoc;
    clang::SourceLocation pragmaOnceLoc;

    /**
@@ -89,4 +93,6 @@ protected:
    void run(const clang::ast_matchers::MatchFinder::MatchResult& Result) override;

    std::unique_ptr<clang::ASTConsumer> CreateASTConsumer(clang::CompilerInstance &CI, llvm::StringRef InFile) override;
+
+    bool Exclude(const hipCounter & hipToken);
 };
@@ -53,8 +53,8 @@ void printStat(std::ostream *csv, llvm::raw_ostream* printOut, const std::string

 void StatCounter::incrementCounter(const hipCounter& counter, std::string name) {
    counters[name]++;
-    apiCounters[(int) counter.countApiType]++;
-    convTypeCounters[(int) counter.countType]++;
+    apiCounters[(int) counter.apiType]++;
+    convTypeCounters[(int) counter.type]++;
 }

 void StatCounter::add(const StatCounter& other) {
@@ -67,8 +67,8 @@ extern const char *apiNames[NUM_API_TYPES];

 struct hipCounter {
    llvm::StringRef hipName;
-    ConvTypes countType;
-    ApiTypes countApiType;
+    ConvTypes type;
+    ApiTypes apiType;
    bool unsupported;
 };

@@ -36,6 +36,7 @@ THE SOFTWARE.
 #include "LLVMCompat.h"
 #include "HipifyAction.h"
 #include "ArgParse.h"
+#include "llvm/Support/Debug.h"

 #define DEBUG_TYPE "cuda2hip"

@@ -86,7 +86,7 @@ namespace hip_impl
                    std::copy_n(it, sizeof(y.header.cbuf), y.header.cbuf);
                    it += sizeof(y.header.cbuf);

-                    y.triple.assign(it, it + y.header.triple_sz);
+                    y.triple.assign(it, it + y.triple_sz);

                    std::copy_n(
                        f + y.header.offset,
@@ -152,9 +152,9 @@ namespace hip_impl
    };

    // CREATORS
-    template<typename I>
-    Bundled_code_header::Bundled_code_header(I f, I l) : Bundled_code_header{}
+    template<typename RandomAccessIterator>
+    Bundled_code_header::Bundled_code_header(RandomAccessIterator f, RandomAccessIterator l) : Bundled_code_header{}
    {
        read(f, l, *this);
    }
-} // Namespace hip_impl.
+} // Namespace hip_impl.
@@ -1420,7 +1420,56 @@ hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, con
 hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset,
                            const void* src, size_t count, hipMemcpyKind kind);

+/**
+ *  @brief Copies data between host and device.
+ *
+ *  @param[in]   dst       Destination memory address
+ *  @param[in]   srcArray  Source memory address
+ *  @param[in]   woffset   Source starting X offset
+ *  @param[in]   hOffset   Source starting Y offset
+ *  @param[in]   count     Size in bytes to copy
+ *  @param[in]   kind      Type of transfer
+ *  @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync
+ */
+hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, size_t hOffset,
+         size_t count, hipMemcpyKind kind);

+/**
+ *  @brief Copies data between host and device.
+ *
+ *  @param[in]   dst       Destination memory address
+ *  @param[in]   srcArray  Source array
+ *  @param[in]   srcoffset Offset in bytes of source array
+ *  @param[in]   count     Size of memory copy in bytes
+ *  @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync
+ */
+hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset, size_t count);
+
+/**
+ *  @brief Copies data between host and device.
+ *
+ *  @param[in]   dstArray   Destination memory address
+ *  @param[in]   dstOffset  Offset in bytes of destination array
+ *  @param[in]   srcHost    Source host pointer
+ *  @param[in]   count      Size of memory copy in bytes
+ *  @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync
+ */
+hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost, size_t count);
+
+/**
+ *  @brief Copies data between host and device.
+ *
+ *  @param[in]   p   3D memory copy parameters
+ *  @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync
+ */
 hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p);

 // doxygen end Memory
@@ -69,18 +69,16 @@ namespace hip_impl
        }
    };

-    using RAII_global = std::unique_ptr<void, decltype(hsa_amd_memory_unlock)*>;
-
    const std::unordered_map<
        hsa_agent_t, std::vector<hsa_executable_t>>& executables();
    const std::unordered_map<
        std::uintptr_t,
        std::vector<std::pair<hsa_agent_t, Kernel_descriptor>>>& functions();
    const std::unordered_map<std::uintptr_t, std::string>& function_names();
-    std::unordered_map<std::string, RAII_global>& globals();
+    std::unordered_map<std::string, void*>& globals();

    hsa_executable_t load_executable(
        const std::string& file,
        hsa_executable_t executable,
        hsa_agent_t agent);
-} // Namespace hip_impl.
+} // Namespace hip_impl.
@@ -483,6 +483,18 @@ inline static hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t
  return hipCUDAErrorTohipError(cudaMemcpyToArray(dst, wOffset, hOffset, src, count, hipMemcpyKindToCudaMemcpyKind(kind)));
 }

+inline static hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, size_t hOffset, size_t count, hipMemcpyKind kind) {
+  return hipCUDAErrorTohipError(cudaMemcpyFromArray(dst, srcArray, wOffset, hOffset, count, hipMemcpyKindToCudaMemcpyKind(kind)));
+}
+
+inline static hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset, size_t count) {
+  return hipCUResultTohipError(cuMemcpyAtoH(dst, (CUarray)srcArray, srcOffset, count));
+}
+
+inline static hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost, size_t count) {
+  return hipCUResultTohipError(cuMemcpyHtoA((CUarray)dstArray, dstOffset, srcHost, count));
+}
+
 inline static hipError_t hipDeviceSynchronize() {
    return hipCUDAErrorTohipError(cudaDeviceSynchronize());
 }
@@ -8,12 +8,12 @@ if(DOXYGEN_EXE)
    install(DIRECTORY RuntimeAPI/html DESTINATION docs/docs/RuntimeAPI)
 endif()

-find_program(GRIP_EXE grip)
-if(GRIP_EXE)
-    add_custom_target(convert_md_to_html ALL
-                  COMMAND @hip_SOURCE_DIR@/packaging/convert_md_to_html.sh @hip_SOURCE_DIR@ ${PROJECT_BINARY_DIR}/md2html)
-    install(DIRECTORY md2html/ DESTINATION docs)
-endif()
+#find_program(GRIP_EXE grip)
+#if(GRIP_EXE)
+#    add_custom_target(convert_md_to_html ALL
+#                  COMMAND @hip_SOURCE_DIR@/packaging/convert_md_to_html.sh @hip_SOURCE_DIR@ ${PROJECT_BINARY_DIR}/md2html)
+#    install(DIRECTORY md2html/ DESTINATION docs)
+#endif()

 #############################
 # Packaging steps
@@ -37,6 +37,7 @@ THE SOFTWARE.
 #include <vector>
 #include <algorithm>
 #include <atomic>
+#include <mutex>

 #include <hc.hpp>
 #include <hc_am.hpp>
@@ -1409,9 +1410,38 @@ void ihipInit()
    tprintf(DB_SYNC, "pid=%u %-30s g_numLogicalThreads=%u\n", getpid(), "<ihipInit>", g_numLogicalThreads);
 }

+hipError_t ihipStreamSynchronize(hipStream_t stream)
+{
+    hipError_t e = hipSuccess;

+    if (stream == hipStreamNull) {
+        ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
+        ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true/*syncToHost*/);
+    } else {
+        // note this does not synchornize with the NULL stream:
+        stream->locked_wait();
+        e = hipSuccess;
+    }

+    return e;
+}

+void ihipStreamCallbackHandler(ihipStreamCallback_t *cb)
+{
+    hipError_t e = hipSuccess;
+
+    // Notify hipStreamAddCallback that callback handler thread is active
+    std::lock_guard<std::mutex> guard(cb->_mtx);
+    cb->_ready = true;
+
+    // Synchronize stream
+    tprintf(DB_SYNC, "ihipStreamCallbackHandler wait on stream %s\n", ToString(cb->_stream).c_str());
+    e = ihipStreamSynchronize(cb->_stream);
+
+    // Call registered callback function
+    cb->_callback(cb->_stream, e, cb->_userData);
+    delete cb;
+}

 //---
 // Get the stream to use for a command submission.
@@ -622,6 +622,24 @@ private: // Data
 };


+//----
+// Internal structure for stream callback handler
+class ihipStreamCallback_t {
+public:
+    ihipStreamCallback_t(hipStream_t stream, hipStreamCallback_t callback, void *userData) :
+        _stream(stream),
+        _callback(callback),
+        _userData(userData)
+        {
+            _ready = false;
+        };
+    hipStream_t                 _stream;
+    hipStreamCallback_t         _callback;
+    void*                       _userData;
+    std::mutex                  _mtx;
+    bool                        _ready;
+};
+

 //----
 // Internal event structure:
@@ -931,6 +949,8 @@ ihipCtx_t * ihipGetPrimaryCtx(unsigned deviceIndex);


 hipStream_t ihipSyncAndResolveStream(hipStream_t);
+hipError_t ihipStreamSynchronize(hipStream_t stream);
+void ihipStreamCallbackHandler(ihipStreamCallback_t *cb);

 // Stream printf functions:
 inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s)
@@ -453,6 +453,7 @@ hipError_t hipArrayCreate ( hipArray** array, const HIP_ARRAY_DESCRIPTOR* pAlloc
        array[0]->width = pAllocateArray->width;
        array[0]->height = pAllocateArray->height;
        array[0]->isDrv = true;
+        array[0]->textureType = hipTextureType2D;
        void ** ptr = &array[0]->data;
 	    if (ctx) {
            const unsigned am_flags = 0;
@@ -1411,6 +1412,65 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset,
    return ihipLogStatus(e);
 }

+hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, size_t hOffset,
+         size_t count, hipMemcpyKind kind) {
+
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, srcArray, wOffset, hOffset, count, kind);
+
+    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+
+    hc::completion_future marker;
+
+    hipError_t e = hipSuccess;
+
+    try {
+        stream->locked_copySync((char *)dst, (char*)srcArray->data + wOffset, count, kind);
+    }
+    catch (ihipException &ex) {
+        e = ex._code;
+    }
+
+    return ihipLogStatus(e);
+}
+
+hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost, size_t count)
+{
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dstArray, dstOffset, srcHost, count);
+
+    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+
+    hc::completion_future marker;
+
+    hipError_t e = hipSuccess;
+    try {
+        stream->locked_copySync((char *)dstArray->data + dstOffset, srcHost, count, hipMemcpyHostToDevice);
+    } catch (ihipException &ex) {
+        e = ex._code;
+    }
+
+    return ihipLogStatus(e);
+}
+
+hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset, size_t count)
+{
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, srcArray, srcOffset, count);
+
+    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+
+    hc::completion_future marker;
+
+    hipError_t e = hipSuccess;
+
+    try {
+        stream->locked_copySync((char *)dst, (char*)srcArray->data + srcOffset, count, hipMemcpyDeviceToHost);
+    }
+    catch (ihipException &ex) {
+        e = ex._code;
+    }
+
+    return ihipLogStatus(e);
+}
+
 hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p)
 {
    HIP_INIT_SPECIAL_API((TRACE_MCMD), p);
@@ -594,7 +594,6 @@ hipError_t hipModuleGetTexRef(
    const auto it = globals().find(name);
    if (it == globals().end()) return ihipLogStatus(hipErrorInvalidValue);

-    *texRef = static_cast<textureReference*>(it->second.get());
-
+    *texRef = reinterpret_cast<textureReference*>(it->second);
    return ihipLogStatus(hipSuccess);
 }
@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

+#include <thread>
+#include <mutex>
 #include "hip/hip_runtime.h"
 #include "hip_hcc_internal.h"
 #include "trace_helper.h"
@@ -147,20 +149,8 @@ hipError_t hipStreamSynchronize(hipStream_t stream)
 {
    HIP_INIT_SPECIAL_API(TRACE_SYNC, stream);

-    hipError_t e = hipSuccess;
-
-    if (stream == hipStreamNull) {
-        ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
-        ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true/*syncToHost*/);
-    } else {
-		// note this does not synchornize with the NULL stream:
-        stream->locked_wait();
-        e = hipSuccess;
-    }
-
-
-    return ihipLogStatus(e);
-};
+    return ihipLogStatus(ihipStreamSynchronize(stream));
+}


 //---
@@ -216,8 +206,20 @@ hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback
 {
    HIP_INIT_API(stream, callback, userData, flags);
    hipError_t e = hipSuccess;
-    //--- explicitly synchronize stream to add callback routines
-    hipStreamSynchronize(stream);
-    callback(stream, e, userData);
+
+    // Create a thread in detached mode to handle callback
+    ihipStreamCallback_t *cb = new ihipStreamCallback_t(stream, callback, userData);
+    std::thread (ihipStreamCallbackHandler, cb).detach();
+
+    // Wait for thread to be ready
+    cb->_mtx.lock();
+    while(cb->_ready != true)
+    {
+        cb->_mtx.unlock();
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+        cb->_mtx.lock();
+    }
+    cb->_mtx.unlock();
+
    return ihipLogStatus(e);
 }
@@ -151,7 +151,7 @@ namespace
            lock_guard<mutex> lck{mtx};

            if (globals().find(x) != globals().cend()) return;
-
+            globals().emplace(x, (void*)(it1->second.first));
            void* p = nullptr;
            hsa_amd_memory_lock(
                reinterpret_cast<void*>(it1->second.first),
@@ -163,7 +163,6 @@ namespace
            hsa_executable_agent_global_variable_define(
                executable, agent, x.c_str(), p);

-            globals().emplace(x, RAII_global{p, hsa_amd_memory_unlock});
        }
    }

@@ -444,9 +443,9 @@ namespace hip_impl
        return r;
    }

-    unordered_map<string, RAII_global>& globals()
+    unordered_map<string, void*>& globals()
    {
-        static unordered_map<string, RAII_global> r;
+        static unordered_map<string, void*> r;
        static once_flag f;
        call_once(f, []() { r.reserve(symbol_addresses().size()); });

@@ -473,4 +472,4 @@ namespace hip_impl

        return executable;
    }
-} // Namespace hip_impl.
+} // Namespace hip_impl.
@@ -0,0 +1,117 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// To measure effects of memory coalescing. Coalescing.cu
+// B. Wilkinson Jan 30, 2011
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+// CHECK: #include <hip/hip_runtime.h>
+#include <cuda.h>
+
+#define BlockSize 16  // Size of blocks, 32 x 32 threads, fixed, used globally
+
+__global__ void gpu_Comput (int *h, int N, int T) {
+
+// Array loaded with global thread ID that acesses that location
+
+	int col = threadIdx.x + blockDim.x * blockIdx.x; 
+	int row = threadIdx.y + blockDim.y * blockIdx.y;
+
+	int threadID = col + row * N;
+	int index = row + col * N;  // sequentially down each row
+
+	for (int t = 0; t < T; t++)  // loop to repeat to reduce other time effects
+	  h[index] = threadID;  // load array with flattened global thread ID
+}
+
+void printArray(int *h, int N) {
+
+	printf("Results of computation, every N/8 numbers, eight numbers\n");
+
+	for (int row = 0; row < N; row += N/8) {
+	  for (int col = 0; col < N; col += N/8)
+	  printf("%6d  ", h[col + row * N]);
+	  printf("\n");
+	}
+}
+
+int main(int argc, char *argv[])  {
+
+	int T = 100;  // number of iterations, entered at keyboard
+	int B = 1;  // number of blocks, entered at keyboard
+	char key;
+
+	int *h, *dev_h;  // ptr to array holding numbers on host and device
+  // CHECK: hipEvent_t start, stop;
+	cudaEvent_t start, stop;  // cuda events to measure time
+	float elapsed_time_ms1;
+  // CHECK: hipEventCreate( &start );
+  // CHECK: hipEventCreate( &stop );
+	cudaEventCreate( &start );
+	cudaEventCreate( &stop );
+
+/* ------------------------- Keyboard input -----------------------------------*/
+
+do {  // loop to repeat complete program
+
+	printf("Grid Structure 2-D grid, 2-D blocks\n");
+	printf("Blocks fixed at 16 x 16 threads, 512 threads, max for compute cap. 1.x\n");
+	printf("Enter number of blocks in grid, each dimension, currently %d\n",B);
+	scanf("%d",&B);
+	printf("Enter number of iterations, currently %d\n",T);
+	scanf("%d",&T);
+
+	int N = B * BlockSize;  // size of data array, given input data
+
+	printf("Array size (and total grid-block size) %d x %d\n", N, N);
+
+	dim3 Block(BlockSize, BlockSize);  //Block structure, 32 x 32 max
+	dim3 Grid(B, B);  //Grid structure, B x B
+
+/* ------------------------- Allocate Memory-----------------------------------*/
+
+	int size = N * N * sizeof(int);  // number of bytes in total in array
+	h = (int*) malloc(size);  // Array on host
+  // CHECK: hipMalloc((void**)&dev_h, size);
+	cudaMalloc((void**)&dev_h, size);  // allocate device memory
+
+/* ------------------------- GPU Computation -----------------------------------*/
+
+  // CHECK: hipEventRecord( start, 0 );
+	cudaEventRecord( start, 0 );
+  // CHECK: hipLaunchKernelGGL(gpu_Comput, dim3(Grid), dim3(Block), 0, 0, dev_h, N, T);
+	gpu_Comput<<< Grid, Block >>>(dev_h, N, T);
+  // CHECK: hipEventRecord( stop, 0 );
+  // CHECK: hipEventSynchronize( stop );
+  // CHECK: hipEventElapsedTime( &elapsed_time_ms1, start, stop );
+	cudaEventRecord( stop, 0 );  // instrument code to measue end time
+	cudaEventSynchronize( stop );  // wait for all work done by threads
+	cudaEventElapsedTime( &elapsed_time_ms1, start, stop );
+  // CHECK: hipMemcpy(h,dev_h, size ,hipMemcpyDeviceToHost);
+	cudaMemcpy(h,dev_h, size ,cudaMemcpyDeviceToHost);  //Get results to check
+
+	printArray(h,N);
+	printf("\nTime to calculate results on GPU: %f ms.\n", elapsed_time_ms1);
+
+/* -------------------------REPEAT PROGRAM INPUT-----------------------------------*/
+
+	printf("\nEnter c to repeat, return to terminate\n");
+
+	scanf("%c",&key);
+	scanf("%c",&key);
+
+} while (key == 'c');  // loop of complete program
+
+/* --------------  clean up  ---------------------------------------*/
+
+free(h);
+  // CHECK: hipFree(dev_h);
+	cudaFree(dev_h);
+  // CHECK: hipEventDestroy(start);
+  // CHECK: hipEventDestroy(stop);
+	cudaEventDestroy(start);
+	cudaEventDestroy(stop);
+
+	return 0;
+}
@@ -0,0 +1,393 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <string>
+#include <chrono>
+#include <numeric>
+#include <utility>
+#include <algorithm>
+
+#include "cmdparser.hpp"
+// CHECK: #include <hip/hip_runtime.h>
+#include <cuda_runtime.h>
+// CHECK: #include <hiprand.h>
+#include <curand.h>
+
+// CHECK: if((x)!=hipSuccess) {
+#define CUDA_CALL(x) do { if((x)!=cudaSuccess) { \
+    printf("Error at %s:%d\n",__FILE__,__LINE__);\
+    exit(EXIT_FAILURE);}} while(0)
+// CHECK: if((x)!=HIPRAND_STATUS_SUCCESS) {
+#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
+    printf("Error at %s:%d\n",__FILE__,__LINE__);\
+    exit(EXIT_FAILURE);}} while(0)
+
+#ifndef DEFAULT_RAND_N
+const size_t DEFAULT_RAND_N = 1024 * 1024 * 128;
+#endif
+
+// CHECK: typedef hiprandRngType_t rng_type_t;
+typedef curandRngType rng_type_t;
+
+// CHECK: using generate_func_type = std::function<hiprandStatus_t(hiprandGenerator_t, T *, size_t)>;
+template<typename T>
+using generate_func_type = std::function<curandStatus_t(curandGenerator_t, T *, size_t)>;
+
+template<typename T>
+void run_benchmark(const cli::Parser& parser,
+                   const rng_type_t rng_type,
+                   generate_func_type<T> generate_func)
+{
+    const size_t size = parser.get<size_t>("size");
+    const size_t trials = parser.get<size_t>("trials");
+
+    T * data;
+    // CHECK: CUDA_CALL(hipMalloc((void **)&data, size * sizeof(T)));
+    CUDA_CALL(cudaMalloc((void **)&data, size * sizeof(T)));
+
+    // CHECK: hiprandGenerator_t generator;
+    // CHECK: CURAND_CALL(hiprandCreateGenerator(&generator, rng_type));
+    curandGenerator_t generator;
+    CURAND_CALL(curandCreateGenerator(&generator, rng_type));
+
+    const size_t dimensions = parser.get<size_t>("dimensions");
+    // CHECK: hiprandStatus_t status = hiprandSetQuasiRandomGeneratorDimensions(generator, dimensions);
+    // CHECK: if (status != HIPRAND_STATUS_TYPE_ERROR)
+    curandStatus_t status = curandSetQuasiRandomGeneratorDimensions(generator, dimensions);
+    if (status != CURAND_STATUS_TYPE_ERROR) // If the RNG is not quasi-random
+    {
+        CURAND_CALL(status);
+    }
+
+    // Warm-up
+    for (size_t i = 0; i < 5; i++)
+    {
+        CURAND_CALL(generate_func(generator, data, size));
+    }
+    // CHECK: CUDA_CALL(hipDeviceSynchronize());
+    CUDA_CALL(cudaDeviceSynchronize());
+
+    // Measurement
+    auto start = std::chrono::high_resolution_clock::now();
+    for (size_t i = 0; i < trials; i++)
+    {
+        CURAND_CALL(generate_func(generator, data, size));
+    }
+    // CHECK: CUDA_CALL(hipDeviceSynchronize());
+    CUDA_CALL(cudaDeviceSynchronize());
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double, std::milli> elapsed = end - start;
+
+    std::cout << std::fixed << std::setprecision(3)
+              << "      "
+              << "Throughput = "
+              << std::setw(8) << (trials * size * sizeof(T)) /
+                    (elapsed.count() / 1e3 * (1 << 30))
+              << " GB/s, Samples = "
+              << std::setw(8) << (trials * size) /
+                    (elapsed.count() / 1e3 * (1 << 30))
+              << " GSample/s, AvgTime (1 trial) = "
+              << std::setw(8) << elapsed.count() / trials
+              << " ms, Time (all) = "
+              << std::setw(8) << elapsed.count()
+              << " ms, Size = " << size
+              << std::endl;
+    // CHECK: CURAND_CALL(hiprandDestroyGenerator(generator));
+    // CHECK: CUDA_CALL(hipFree(data));
+    CURAND_CALL(curandDestroyGenerator(generator));
+    CUDA_CALL(cudaFree(data));
+}
+
+void run_benchmarks(const cli::Parser& parser,
+                    const rng_type_t rng_type,
+                    const std::string& distribution)
+{
+    if (distribution == "uniform-uint")
+    {
+        // CHECK: if (rng_type != HIPRAND_RNG_QUASI_SOBOL64 &&
+        // CHECK: rng_type != HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL64)
+        if (rng_type != CURAND_RNG_QUASI_SOBOL64 &&
+            rng_type != CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
+        {
+            run_benchmark<unsigned int>(parser, rng_type,
+                // CHECK: [](hiprandGenerator_t gen, unsigned int * data, size_t size) {
+                // CHECK: return hiprandGenerate(gen, data, size);
+                [](curandGenerator_t gen, unsigned int * data, size_t size) {
+                    return curandGenerate(gen, data, size);
+                }
+            );
+        }
+    }
+    if (distribution == "uniform-long-long")
+    {
+        // CHECK: if (rng_type == HIPRAND_RNG_QUASI_SOBOL64 ||
+        // CHECK: rng_type == HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL64)
+        if (rng_type == CURAND_RNG_QUASI_SOBOL64 ||
+            rng_type == CURAND_RNG_QUASI_SCRAMBLED_SOBOL64)
+        {
+            run_benchmark<unsigned long long>(parser, rng_type,
+                // CHECK: [](hiprandGenerator_t gen, unsigned long long * data, size_t size) {
+                [](curandGenerator_t gen, unsigned long long * data, size_t size) {
+                    // curandGenerateLongLong is yet unsupported by HIP
+                    // CHECK-NOT: return hiprandGenerateLongLong(gen, data, size);
+                    return curandGenerateLongLong(gen, data, size);
+                }
+            );
+        }
+    }
+    if (distribution == "uniform-float")
+    {
+        run_benchmark<float>(parser, rng_type,
+            // CHECK: [](hiprandGenerator_t gen, float * data, size_t size) {
+            // CHECK: return hiprandGenerateUniform(gen, data, size);
+            [](curandGenerator_t gen, float * data, size_t size) {
+                return curandGenerateUniform(gen, data, size);
+            }
+        );
+    }
+    if (distribution == "uniform-double")
+    {
+        run_benchmark<double>(parser, rng_type,
+            // CHECK: [](hiprandGenerator_t gen, double * data, size_t size) {
+            // CHECK: return hiprandGenerateUniformDouble(gen, data, size);
+            [](curandGenerator_t gen, double * data, size_t size) {
+                return curandGenerateUniformDouble(gen, data, size);
+            }
+        );
+    }
+    if (distribution == "normal-float")
+    {
+        run_benchmark<float>(parser, rng_type,
+            // CHECK: [](hiprandGenerator_t gen, float * data, size_t size) {
+            // CHECK: return hiprandGenerateNormal(gen, data, size, 0.0f, 1.0f);
+            [](curandGenerator_t gen, float * data, size_t size) {
+                return curandGenerateNormal(gen, data, size, 0.0f, 1.0f);
+            }
+        );
+    }
+    if (distribution == "normal-double")
+    {
+        run_benchmark<double>(parser, rng_type,
+            // CHECK: [](hiprandGenerator_t gen, double * data, size_t size) {
+            // CHECK: return hiprandGenerateNormalDouble(gen, data, size, 0.0, 1.0);
+            [](curandGenerator_t gen, double * data, size_t size) {
+                return curandGenerateNormalDouble(gen, data, size, 0.0, 1.0);
+            }
+        );
+    }
+    if (distribution == "log-normal-float")
+    {
+        run_benchmark<float>(parser, rng_type,
+            // CHECK: [](hiprandGenerator_t gen, float * data, size_t size) {
+            // CHECK: return hiprandGenerateLogNormal(gen, data, size, 0.0f, 1.0f);
+            [](curandGenerator_t gen, float * data, size_t size) {
+                return curandGenerateLogNormal(gen, data, size, 0.0f, 1.0f);
+            }
+        );
+    }
+    if (distribution == "log-normal-double")
+    {
+        run_benchmark<double>(parser, rng_type,
+            // CHECK: [](hiprandGenerator_t gen, double * data, size_t size) {
+            // CHECK: return hiprandGenerateLogNormalDouble(gen, data, size, 0.0, 1.0);
+            [](curandGenerator_t gen, double * data, size_t size) {
+                return curandGenerateLogNormalDouble(gen, data, size, 0.0, 1.0);
+            }
+        );
+    }
+    if (distribution == "poisson")
+    {
+        const auto lambdas = parser.get<std::vector<double>>("lambda");
+        for (double lambda : lambdas)
+        {
+            std::cout << "    " << "lambda "
+                 << std::fixed << std::setprecision(1) << lambda << std::endl;
+            run_benchmark<unsigned int>(parser, rng_type,
+                // CHECK: [lambda](hiprandGenerator_t gen, unsigned int * data, size_t size) {
+                // CHECK: return hiprandGeneratePoisson(gen, data, size, lambda);
+                [lambda](curandGenerator_t gen, unsigned int * data, size_t size) {
+                    return curandGeneratePoisson(gen, data, size, lambda);
+                }
+            );
+        }
+    }
+}
+
+const std::vector<std::string> all_engines = {
+    "xorwow",
+    "mrg32k3a",
+    "mtgp32",
+    // "mt19937",
+    "philox",
+    "sobol32",
+    // "scrambled_sobol32",
+    // "sobol64",
+    // "scrambled_sobol64",
+};
+
+const std::vector<std::string> all_distributions = {
+    "uniform-uint",
+    "uniform-long-long",
+    "uniform-float",
+    "uniform-double",
+    "normal-float",
+    "normal-double",
+    "log-normal-float",
+    "log-normal-double",
+    "poisson"
+};
+
+int main(int argc, char *argv[])
+{
+    cli::Parser parser(argc, argv);
+
+    const std::string distribution_desc =
+        "space-separated list of distributions:" +
+        std::accumulate(all_distributions.begin(), all_distributions.end(), std::string(),
+            [](std::string a, std::string b) {
+                return a + "\n      " + b;
+            }
+        ) +
+        "\n      or all";
+    const std::string engine_desc =
+        "space-separated list of random number engines:" +
+        std::accumulate(all_engines.begin(), all_engines.end(), std::string(),
+            [](std::string a, std::string b) {
+                return a + "\n      " + b;
+            }
+        ) +
+        "\n      or all";
+
+    parser.set_optional<size_t>("size", "size", DEFAULT_RAND_N, "number of values");
+    parser.set_optional<size_t>("dimensions", "dimensions", 1, "number of dimensions of quasi-random values");
+    parser.set_optional<size_t>("trials", "trials", 20, "number of trials");
+    parser.set_optional<std::vector<std::string>>("dis", "dis", {"uniform-uint"}, distribution_desc.c_str());
+    parser.set_optional<std::vector<std::string>>("engine", "engine", {"philox"}, engine_desc.c_str());
+    parser.set_optional<std::vector<double>>("lambda", "lambda", {10.0}, "space-separated list of lambdas of Poisson distribution");
+    parser.run_and_exit_if_error();
+
+    std::vector<std::string> engines;
+    {
+        auto es = parser.get<std::vector<std::string>>("engine");
+        if (std::find(es.begin(), es.end(), "all") != es.end())
+        {
+            engines = all_engines;
+        }
+        else
+        {
+            for (auto e : all_engines)
+            {
+                if (std::find(es.begin(), es.end(), e) != es.end())
+                    engines.push_back(e);
+            }
+        }
+    }
+
+    std::vector<std::string> distributions;
+    {
+        auto ds = parser.get<std::vector<std::string>>("dis");
+        if (std::find(ds.begin(), ds.end(), "all") != ds.end())
+        {
+            distributions = all_distributions;
+        }
+        else
+        {
+            for (auto d : all_distributions)
+            {
+                if (std::find(ds.begin(), ds.end(), d) != ds.end())
+                    distributions.push_back(d);
+            }
+        }
+    }
+
+    int version;
+    // CHECK: CURAND_CALL(hiprandGetVersion(&version));
+    CURAND_CALL(curandGetVersion(&version));
+    int runtime_version;
+    // cudaRuntimeGetVersion is yet unsupported by HIP
+    // CHECK-NOT: CUDA_CALL(hipRuntimeGetVersion(&runtime_version));
+    CUDA_CALL(cudaRuntimeGetVersion(&runtime_version));
+    int device_id;
+    // CHECK: CUDA_CALL(hipGetDevice(&device_id));
+    // CHECK: hipDeviceProp_t props;
+    // CHECK: CUDA_CALL(hipGetDeviceProperties(&props, device_id));
+    CUDA_CALL(cudaGetDevice(&device_id));
+    cudaDeviceProp props;
+    CUDA_CALL(cudaGetDeviceProperties(&props, device_id));
+
+    std::cout << "cuRAND: " << version << " ";
+    std::cout << "Runtime: " << runtime_version << " ";
+    std::cout << "Device: " << props.name;
+    std::cout << std::endl << std::endl;
+
+    for (auto engine : engines)
+    {
+        // CHECK: rng_type_t rng_type = HIPRAND_RNG_PSEUDO_XORWOW;
+        // CHECK: rng_type = HIPRAND_RNG_PSEUDO_XORWOW;
+        // CHECK: rng_type = HIPRAND_RNG_PSEUDO_MRG32K3A;
+        // CHECK: rng_type = HIPRAND_RNG_PSEUDO_MTGP32;
+        // CHECK: rng_type = HIPRAND_RNG_PSEUDO_MT19937;
+        // CHECK: rng_type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10;
+        // CHECK: rng_type = HIPRAND_RNG_QUASI_SOBOL32;
+        // CHECK: rng_type = HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL32;
+        // CHECK: rng_type = HIPRAND_RNG_QUASI_SOBOL64;
+        // CHECK: rng_type = HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL64;
+        rng_type_t rng_type = CURAND_RNG_PSEUDO_XORWOW;
+        if (engine == "xorwow")
+            rng_type = CURAND_RNG_PSEUDO_XORWOW;
+        else if (engine == "mrg32k3a")
+            rng_type = CURAND_RNG_PSEUDO_MRG32K3A;
+        else if (engine == "mtgp32")
+            rng_type = CURAND_RNG_PSEUDO_MTGP32;
+        else if (engine == "mt19937")
+            rng_type = CURAND_RNG_PSEUDO_MT19937;
+        else if (engine == "philox")
+            rng_type = CURAND_RNG_PSEUDO_PHILOX4_32_10;
+        else if (engine == "sobol32")
+            rng_type = CURAND_RNG_QUASI_SOBOL32;
+        else if (engine == "scrambled_sobol32")
+            rng_type = CURAND_RNG_QUASI_SCRAMBLED_SOBOL32;
+        else if (engine == "sobol64")
+            rng_type = CURAND_RNG_QUASI_SOBOL64;
+        else if (engine == "scrambled_sobol64")
+            rng_type = CURAND_RNG_QUASI_SCRAMBLED_SOBOL64;
+        else
+        {
+            std::cout << "Wrong engine name" << std::endl;
+            exit(1);
+        }
+
+        std::cout << engine << ":" << std::endl;
+
+        for (auto distribution : distributions)
+        {
+            std::cout << "  " << distribution << ":" << std::endl;
+            run_benchmarks(parser, rng_type, distribution);
+        }
+        std::cout << std::endl;
+    }
+
+    return 0;
+}
@@ -0,0 +1,669 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <string>
+#include <chrono>
+#include <numeric>
+#include <utility>
+#include <type_traits>
+#include <algorithm>
+
+#include "cmdparser.hpp"
+// CHECK: #include <hip/hip_runtime.h>
+#include <cuda_runtime.h>
+// CHECK: #include <hiprand.h>
+#include <curand.h>
+// CHECK: #include <hiprand_kernel.h>
+#include <curand_kernel.h>
+// CHECK-NOT: #include <curand_mtgp32_host.h>
+// CHECK-NOT: #include <curand_mtgp32dc_p_11213.h>
+#include <curand_mtgp32_host.h>
+#include <curand_mtgp32dc_p_11213.h>
+
+// CHECK: hipError_t error = (x);
+// CHECK: if(error!=hipSuccess) {
+#define CUDA_CALL(x) do { \
+    cudaError_t error = (x);\
+    if(error!=cudaSuccess) { \
+    printf("Error %d at %s:%d\n",error,__FILE__,__LINE__);\
+    exit(EXIT_FAILURE);}} while(0)
+#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
+    printf("Error at %s:%d\n",__FILE__,__LINE__);\
+    exit(EXIT_FAILURE);}} while(0)
+
+#ifndef DEFAULT_RAND_N
+const size_t DEFAULT_RAND_N = 1024 * 1024 * 128;
+#endif
+
+size_t next_power2(size_t x)
+{
+    size_t power = 1;
+    while (power < x)
+    {
+        power *= 2;
+    }
+    return power;
+}
+
+template<typename GeneratorState>
+__global__
+void init_kernel(GeneratorState * states,
+                 const unsigned long long seed,
+                 const unsigned long long offset)
+{
+    const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
+    GeneratorState state;
+    // CHECK: hiprand_init(seed, state_id, offset, &state);
+    curand_init(seed, state_id, offset, &state);
+    states[state_id] = state;
+}
+
+template<typename GeneratorState, typename T, typename GenerateFunc, typename Extra>
+__global__
+void generate_kernel(GeneratorState * states,
+                     T * data,
+                     const size_t size,
+                     const GenerateFunc& generate_func,
+                     const Extra extra)
+{
+    const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int stride = gridDim.x * blockDim.x;
+
+    GeneratorState state = states[state_id];
+    unsigned int index = state_id;
+    while(index < size)
+    {
+        data[index] = generate_func(&state, extra);
+        index += stride;
+    }
+    states[state_id] = state;
+}
+
+template<typename GeneratorState>
+struct runner
+{
+    GeneratorState * states;
+
+    runner(const size_t dimensions,
+           const size_t blocks,
+           const size_t threads,
+           const unsigned long long seed,
+           const unsigned long long offset)
+    {
+        const size_t states_size = blocks * threads;
+        // CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(GeneratorState)));
+        CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(GeneratorState)));
+        // CHECK: hipLaunchKernelGGL(init_kernel, dim3(blocks), dim3(threads), 0, 0, states, seed, offset);
+        init_kernel<<<blocks, threads>>>(states, seed, offset);
+        // CHECK: CUDA_CALL(hipPeekAtLastError());
+        // CHECK: CUDA_CALL(hipDeviceSynchronize());
+        CUDA_CALL(cudaPeekAtLastError());
+        CUDA_CALL(cudaDeviceSynchronize());
+    }
+
+    ~runner()
+    {
+        CUDA_CALL(cudaFree(states));
+    }
+
+    template<typename T, typename GenerateFunc, typename Extra>
+    void generate(const size_t blocks,
+                  const size_t threads,
+                  T * data,
+                  const size_t size,
+                  const GenerateFunc& generate_func,
+                  const Extra extra)
+    {
+        // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(blocks), dim3(threads), 0, 0, states, data, size, generate_func, extra);
+        generate_kernel<<<blocks, threads>>>(states, data, size, generate_func, extra);
+    }
+};
+
+// CHECK: void generate_kernel(hiprandStateMtgp32_t * states,
+template<typename T, typename GenerateFunc, typename Extra>
+__global__
+void generate_kernel(curandStateMtgp32_t * states,
+                     T * data,
+                     const size_t size,
+                     const GenerateFunc& generate_func,
+                     const Extra extra)
+{
+    const unsigned int state_id = blockIdx.x;
+    const unsigned int thread_id = threadIdx.x;
+    unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int stride = gridDim.x * blockDim.x;
+    // CHECK: __shared__ hiprandStateMtgp32_t state;
+    __shared__ curandStateMtgp32_t state;
+
+    if (thread_id == 0)
+        state = states[state_id];
+    __syncthreads();
+
+    const size_t r = size%blockDim.x;
+    const size_t size_rounded_up = r == 0 ? size : size + (blockDim.x - r);
+    while(index < size_rounded_up)
+    {
+        auto value = generate_func(&state, extra);
+        if(index < size)
+            data[index] = value;
+        index += stride;
+    }
+    __syncthreads();
+
+    if (thread_id == 0)
+        states[state_id] = state;
+}
+
+// CHECK: struct runner<hiprandStateMtgp32_t>
+template<>
+struct runner<curandStateMtgp32_t>
+{
+    // CHECK: hiprandStateMtgp32_t * states;
+    curandStateMtgp32_t * states;
+    mtgp32_kernel_params_t * d_param;
+
+    runner(const size_t dimensions,
+           const size_t blocks,
+           const size_t threads,
+           const unsigned long long seed,
+           const unsigned long long offset)
+    {
+        const size_t states_size = std::min((size_t)200, blocks);
+        // CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(hiprandStateMtgp32_t)));
+        CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(curandStateMtgp32_t)));
+        // CHECK: CUDA_CALL(hipMalloc((void **)&d_param, sizeof(mtgp32_kernel_params)));
+        CUDA_CALL(cudaMalloc((void **)&d_param, sizeof(mtgp32_kernel_params)));
+        // curandMakeMTGP32Constants is yet unsupported by HIP
+        // CHECK-NOT: CURAND_CALL(hiprandMakeMTGP32Constants(mtgp32dc_params_fast_11213, d_param));
+        CURAND_CALL(curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, d_param));
+        // curandMakeMTGP32KernelState is yet unsupported by HIP
+        // CHECK-NOT: CURAND_CALL(hiprandMakeMTGP32KernelState(states, mtgp32dc_params_fast_11213, d_param, states_size, seed));
+        CURAND_CALL(curandMakeMTGP32KernelState(states, mtgp32dc_params_fast_11213, d_param, states_size, seed));
+    }
+
+    ~runner()
+    {
+        // CHECK: CUDA_CALL(hipFree(states));
+        // CHECK: CUDA_CALL(hipFree(d_param));
+        CUDA_CALL(cudaFree(states));
+        CUDA_CALL(cudaFree(d_param));
+    }
+
+    template<typename T, typename GenerateFunc, typename Extra>
+    void generate(const size_t blocks,
+                  const size_t threads,
+                  T * data,
+                  const size_t size,
+                  const GenerateFunc& generate_func,
+                  const Extra extra)
+    {
+        // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(std::min((size_t)200, blocks)), dim3(256), 0, 0, states, data, size, generate_func, extra);
+        generate_kernel<<<std::min((size_t)200, blocks), 256>>>(states, data, size, generate_func, extra);
+    }
+};
+
+// CHECK: void init_kernel(hiprandStateSobol32_t * states,
+template<typename Directions>
+__global__
+void init_kernel(curandStateSobol32_t * states,
+                 const Directions directions,
+                 const unsigned long long offset)
+{
+    const unsigned int dimension = blockIdx.y;
+    const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
+    // CHECK: hiprandStateSobol32_t state;
+    // CHECK: hiprand_init(directions[dimension], offset + state_id, &state);
+    curandStateSobol32_t state;
+    curand_init(directions[dimension], offset + state_id, &state);
+    states[gridDim.x * blockDim.x * dimension + state_id] = state;
+}
+
+// CHECK: void generate_kernel(hiprandStateSobol32_t * states,
+template<typename T, typename GenerateFunc, typename Extra>
+__global__
+void generate_kernel(curandStateSobol32_t * states,
+                     T * data,
+                     const size_t size,
+                     const GenerateFunc& generate_func,
+                     const Extra extra)
+{
+    const unsigned int dimension = blockIdx.y;
+    const unsigned int state_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int stride = gridDim.x * blockDim.x;
+    // CHECK: hiprandStateSobol32_t state = states[gridDim.x * blockDim.x * dimension + state_id];
+    curandStateSobol32_t state = states[gridDim.x * blockDim.x * dimension + state_id];
+    const unsigned int offset = dimension * size;
+    unsigned int index = state_id;
+    while(index < size)
+    {
+        data[offset + index] = generate_func(&state, extra);
+        skipahead(stride - 1, &state);
+        index += stride;
+    }
+    state = states[gridDim.x * blockDim.x * dimension + state_id];
+    skipahead(static_cast<unsigned int>(size), &state);
+    states[gridDim.x * blockDim.x * dimension + state_id] = state;
+}
+
+// CHECK: struct runner<hiprandStateSobol32_t>
+template<>
+struct runner<curandStateSobol32_t>
+{
+    // CHECK: hiprandStateSobol32_t * states;
+    curandStateSobol32_t * states;
+    size_t dimensions;
+
+    runner(const size_t dimensions,
+           const size_t blocks,
+           const size_t threads,
+           const unsigned long long seed,
+           const unsigned long long offset)
+    {
+        this->dimensions = dimensions;
+        // CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(hiprandStateSobol32_t)));
+        const size_t states_size = blocks * threads * dimensions;
+        CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(curandStateSobol32_t)));
+        // CHECK: hiprandDirectionVectors32_t * directions;
+        curandDirectionVectors32_t * directions;
+        // CHECK: const size_t size = dimensions * sizeof(hiprandDirectionVectors32_t);
+        const size_t size = dimensions * sizeof(curandDirectionVectors32_t);
+        // CHECK: CUDA_CALL(hipMalloc((void **)&directions, size));
+        CUDA_CALL(cudaMalloc((void **)&directions, size));
+        // CHECK: hiprandDirectionVectors32_t * h_directions;
+        curandDirectionVectors32_t * h_directions;
+        // hiprandGetDirectionVectors32 and HIPRAND_DIRECTION_VECTORS_32_JOEKUO6 (of hiprandDirectionVectorSet_t) are yet unsupported by HIP
+        // CHECK-NOT: CURAND_CALL(hiprandGetDirectionVectors32(&h_directions, HIPRAND_DIRECTION_VECTORS_32_JOEKUO6));
+        CURAND_CALL(curandGetDirectionVectors32(&h_directions, CURAND_DIRECTION_VECTORS_32_JOEKUO6));
+        // CHECK: CUDA_CALL(hipMemcpy(directions, h_directions, size, hipMemcpyHostToDevice));
+        CUDA_CALL(cudaMemcpy(directions, h_directions, size, cudaMemcpyHostToDevice));
+
+        const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
+        // CHECK: hipLaunchKernelGGL(init_kernel, dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, directions, offset);
+        init_kernel<<<dim3(blocks_x, dimensions), threads>>>(states, directions, offset);
+        // CHECK: CUDA_CALL(hipPeekAtLastError());
+        // CHECK: CUDA_CALL(hipDeviceSynchronize());
+        CUDA_CALL(cudaPeekAtLastError());
+        CUDA_CALL(cudaDeviceSynchronize());
+        // CHECK: CUDA_CALL(hipFree(directions));
+        CUDA_CALL(cudaFree(directions));
+    }
+
+    ~runner()
+    {
+        // CHECK: CUDA_CALL(hipFree(states));
+        CUDA_CALL(cudaFree(states));
+    }
+
+    template<typename T, typename GenerateFunc, typename Extra>
+    void generate(const size_t blocks,
+                  const size_t threads,
+                  T * data,
+                  const size_t size,
+                  const GenerateFunc& generate_func,
+                  const Extra extra)
+    {
+        const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
+        // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, data, size / dimensions, generate_func, extra);
+        generate_kernel<<<dim3(blocks_x, dimensions), threads>>>(states, data, size / dimensions, generate_func, extra);
+    }
+};
+
+template<typename T, typename GeneratorState, typename GenerateFunc, typename Extra>
+void run_benchmark(const cli::Parser& parser,
+                   const GenerateFunc& generate_func,
+                   const Extra extra)
+{
+    const size_t size = parser.get<size_t>("size");
+    const size_t dimensions = parser.get<size_t>("dimensions");
+    const size_t trials = parser.get<size_t>("trials");
+
+    const size_t blocks = parser.get<size_t>("blocks");
+    const size_t threads = parser.get<size_t>("threads");
+
+    T * data;
+    // CHECK: CUDA_CALL(hipMalloc((void **)&data, size * sizeof(T)));
+    CUDA_CALL(cudaMalloc((void **)&data, size * sizeof(T)));
+
+    runner<GeneratorState> r(dimensions, blocks, threads, 12345ULL, 6789ULL);
+
+    // Warm-up
+    for (size_t i = 0; i < 5; i++)
+    {
+        r.generate(blocks, threads, data, size, generate_func, extra);
+        // CHECK: CUDA_CALL(hipPeekAtLastError());
+        // CHECK: CUDA_CALL(hipDeviceSynchronize());
+        CUDA_CALL(cudaPeekAtLastError());
+        CUDA_CALL(cudaDeviceSynchronize());
+    }
+    // CHECK: CUDA_CALL(hipDeviceSynchronize());
+    CUDA_CALL(cudaDeviceSynchronize());
+
+    // Measurement
+    auto start = std::chrono::high_resolution_clock::now();
+    for (size_t i = 0; i < trials; i++)
+    {
+        r.generate(blocks, threads, data, size, generate_func, extra);
+    }
+    // CHECK: CUDA_CALL(hipPeekAtLastError());
+    // CHECK: CUDA_CALL(hipDeviceSynchronize());
+    CUDA_CALL(cudaPeekAtLastError());
+    CUDA_CALL(cudaDeviceSynchronize());
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double, std::milli> elapsed = end - start;
+
+    std::cout << std::fixed << std::setprecision(3)
+              << "      "
+              << "Throughput = "
+              << std::setw(8) << (trials * size * sizeof(T)) /
+                    (elapsed.count() / 1e3 * (1 << 30))
+              << " GB/s, Samples = "
+              << std::setw(8) << (trials * size) /
+                    (elapsed.count() / 1e3 * (1 << 30))
+              << " GSample/s, AvgTime (1 trial) = "
+              << std::setw(8) << elapsed.count() / trials
+              << " ms, Time (all) = "
+              << std::setw(8) << elapsed.count()
+              << " ms, Size = " << size
+              << std::endl;
+    // CHECK: CUDA_CALL(hipFree(data));
+    CUDA_CALL(cudaFree(data));
+}
+
+template<typename GeneratorState>
+void run_benchmarks(const cli::Parser& parser,
+                    const std::string& distribution)
+{
+    if (distribution == "uniform-uint")
+    {
+        // curandStateSobol64_t and curandStateScrambledSobol64_t are yet unsupported by HIP
+        // CHECK-NOT: if (!std::is_same<GeneratorState, hiprandStateSobol64_t>::value &&
+        // CHECK-NOT: !std::is_same<GeneratorState, hiprandStateScrambledSobol64_t>::value)
+        if (!std::is_same<GeneratorState, curandStateSobol64_t>::value &&
+            !std::is_same<GeneratorState, curandStateScrambledSobol64_t>::value)
+        {
+            run_benchmark<unsigned int, GeneratorState>(parser,
+                [] __device__ (GeneratorState * state, int) {
+                    // CHECK: return hiprand(state);
+                    return curand(state);
+                }, 0
+            );
+        }
+    }
+    if (distribution == "uniform-long-long")
+    {
+        // curandStateSobol64_t and curandStateScrambledSobol64_t are yet unsupported by HIP
+        // CHECK-NOT: if (!std::is_same<GeneratorState, hiprandStateSobol64_t>::value &&
+        // CHECK-NOT: !std::is_same<GeneratorState, hiprandStateScrambledSobol64_t>::value)
+        if (std::is_same<GeneratorState, curandStateSobol64_t>::value ||
+            std::is_same<GeneratorState, curandStateScrambledSobol64_t>::value)
+        {
+            run_benchmark<unsigned long long, GeneratorState>(parser,
+                [] __device__ (GeneratorState * state, int) {
+                    // CHECK: return hiprand(state);
+                    return curand(state);
+                }, 0
+            );
+        }
+    }
+    if (distribution == "uniform-float")
+    {
+        run_benchmark<float, GeneratorState>(parser,
+            [] __device__ (GeneratorState * state, int) {
+                // CHECK: return hiprand_uniform(state);
+                return curand_uniform(state);
+            }, 0
+        );
+    }
+    if (distribution == "uniform-double")
+    {
+        run_benchmark<double, GeneratorState>(parser,
+            [] __device__ (GeneratorState * state, int) {
+                // CHECK: return hiprand_uniform_double(state);
+                return curand_uniform_double(state);
+            }, 0
+        );
+    }
+    if (distribution == "normal-float")
+    {
+        run_benchmark<float, GeneratorState>(parser,
+            [] __device__ (GeneratorState * state, int) {
+                // CHECK: return hiprand_normal(state);
+                return curand_normal(state);
+            }, 0
+        );
+    }
+    if (distribution == "normal-double")
+    {
+        run_benchmark<double, GeneratorState>(parser,
+            [] __device__ (GeneratorState * state, int) {
+                // CHECK: return hiprand_normal_double(state);
+                return curand_normal_double(state);
+            }, 0
+        );
+    }
+    if (distribution == "log-normal-float")
+    {
+        run_benchmark<float, GeneratorState>(parser,
+            [] __device__ (GeneratorState * state, int) {
+                // CHECK: return hiprand_log_normal(state, 0.0f, 1.0f);
+                return curand_log_normal(state, 0.0f, 1.0f);
+            }, 0
+        );
+    }
+    if (distribution == "log-normal-double")
+    {
+        run_benchmark<double, GeneratorState>(parser,
+            [] __device__ (GeneratorState * state, int) {
+                // CHECK: return hiprand_log_normal_double(state, 0.0, 1.0);
+                return curand_log_normal_double(state, 0.0, 1.0);
+            }, 0
+        );
+    }
+    if (distribution == "poisson")
+    {
+        const auto lambdas = parser.get<std::vector<double>>("lambda");
+        for (double lambda : lambdas)
+        {
+            std::cout << "    " << "lambda "
+                 << std::fixed << std::setprecision(1) << lambda << std::endl;
+            run_benchmark<unsigned int, GeneratorState>(parser,
+                [] __device__ (GeneratorState * state, double lambda) {
+                    // CHECK: return hiprand_poisson(state, lambda);
+                    return curand_poisson(state, lambda);
+                }, lambda
+            );
+        }
+    }
+    if (distribution == "discrete-poisson")
+    {
+        const auto lambdas = parser.get<std::vector<double>>("lambda");
+        for (double lambda : lambdas)
+        {
+            std::cout << "    " << "lambda "
+                 << std::fixed << std::setprecision(1) << lambda << std::endl;
+            // CHECK: hiprandDiscreteDistribution_t discrete_distribution;
+            curandDiscreteDistribution_t discrete_distribution;
+            // CHECK: CURAND_CALL(hiprandCreatePoissonDistribution(lambda, &discrete_distribution));
+            CURAND_CALL(curandCreatePoissonDistribution(lambda, &discrete_distribution));
+            run_benchmark<unsigned int, GeneratorState>(parser,
+                // CHECK: [] __device__ (GeneratorState * state, hiprandDiscreteDistribution_t discrete_distribution) {
+                [] __device__ (GeneratorState * state, curandDiscreteDistribution_t discrete_distribution) {
+                    // CHECK: return hiprand_discrete(state, discrete_distribution);
+                    return curand_discrete(state, discrete_distribution);
+                }, discrete_distribution
+            );
+            // CHECK: CURAND_CALL(hiprandDestroyDistribution(discrete_distribution));
+            CURAND_CALL(curandDestroyDistribution(discrete_distribution));
+        }
+    }
+}
+
+const std::vector<std::string> all_engines = {
+    "xorwow",
+    "mrg32k3a",
+    "mtgp32",
+    // "mt19937",
+    "philox",
+    "sobol32",
+    // "scrambled_sobol32",
+    // "sobol64",
+    // "scrambled_sobol64",
+};
+
+const std::vector<std::string> all_distributions = {
+    "uniform-uint",
+    // "uniform-long-long",
+    "uniform-float",
+    "uniform-double",
+    "normal-float",
+    "normal-double",
+    "log-normal-float",
+    "log-normal-double",
+    "poisson",
+    "discrete-poisson",
+};
+
+int main(int argc, char *argv[])
+{
+    cli::Parser parser(argc, argv);
+
+    const std::string distribution_desc =
+        "space-separated list of distributions:" +
+        std::accumulate(all_distributions.begin(), all_distributions.end(), std::string(),
+            [](std::string a, std::string b) {
+                return a + "\n      " + b;
+            }
+        ) +
+        "\n      or all";
+    const std::string engine_desc =
+        "space-separated list of random number engines:" +
+        std::accumulate(all_engines.begin(), all_engines.end(), std::string(),
+            [](std::string a, std::string b) {
+                return a + "\n      " + b;
+            }
+        ) +
+        "\n      or all";
+
+    parser.set_optional<size_t>("size", "size", DEFAULT_RAND_N, "number of values");
+    parser.set_optional<size_t>("dimensions", "dimensions", 1, "number of dimensions of quasi-random values");
+    parser.set_optional<size_t>("trials", "trials", 20, "number of trials");
+    parser.set_optional<size_t>("blocks", "blocks", 256, "number of blocks");
+    parser.set_optional<size_t>("threads", "threads", 256, "number of threads in each block");
+    parser.set_optional<std::vector<std::string>>("dis", "dis", {"uniform-uint"}, distribution_desc.c_str());
+    parser.set_optional<std::vector<std::string>>("engine", "engine", {"philox"}, engine_desc.c_str());
+    parser.set_optional<std::vector<double>>("lambda", "lambda", {10.0}, "space-separated list of lambdas of Poisson distribution");
+    parser.run_and_exit_if_error();
+
+    std::vector<std::string> engines;
+    {
+        auto es = parser.get<std::vector<std::string>>("engine");
+        if (std::find(es.begin(), es.end(), "all") != es.end())
+        {
+            engines = all_engines;
+        }
+        else
+        {
+            for (auto e : all_engines)
+            {
+                if (std::find(es.begin(), es.end(), e) != es.end())
+                    engines.push_back(e);
+            }
+        }
+    }
+
+    std::vector<std::string> distributions;
+    {
+        auto ds = parser.get<std::vector<std::string>>("dis");
+        if (std::find(ds.begin(), ds.end(), "all") != ds.end())
+        {
+            distributions = all_distributions;
+        }
+        else
+        {
+            for (auto d : all_distributions)
+            {
+                if (std::find(ds.begin(), ds.end(), d) != ds.end())
+                    distributions.push_back(d);
+            }
+        }
+    }
+
+    int version;
+    // CHECK: CURAND_CALL(hiprandGetVersion(&version));
+    CURAND_CALL(curandGetVersion(&version));
+    int runtime_version;
+    // cudaRuntimeGetVersion is yet unsupported by HIP
+    // CHECK-NOT: CUDA_CALL(hipRuntimeGetVersion(&runtime_version));
+    CUDA_CALL(cudaRuntimeGetVersion(&runtime_version));
+    int device_id;
+    // CHECK: CUDA_CALL(hipGetDevice(&device_id));
+    // CHECK: hipDeviceProp_t props;
+    // CHECK: CUDA_CALL(hipGetDeviceProperties(&props, device_id));
+    CUDA_CALL(cudaGetDevice(&device_id));
+    cudaDeviceProp props;
+    CUDA_CALL(cudaGetDeviceProperties(&props, device_id));
+
+    std::cout << "cuRAND: " << version << " ";
+    std::cout << "Runtime: " << runtime_version << " ";
+    std::cout << "Device: " << props.name;
+    std::cout << std::endl << std::endl;
+
+    for (auto engine : engines)
+    {
+        std::cout << engine << ":" << std::endl;
+        for (auto distribution : distributions)
+        {
+            std::cout << "  " << distribution << ":" << std::endl;
+            const std::string plot_name = engine + "-" + distribution;
+            if (engine == "xorwow")
+            {
+                // CHECK: run_benchmarks<hiprandStateXORWOW_t>(parser, distribution);
+                run_benchmarks<curandStateXORWOW_t>(parser, distribution);
+            }
+            else if (engine == "mrg32k3a")
+            {
+                // CHECK: run_benchmarks<hiprandStateMRG32k3a_t>(parser, distribution);
+                run_benchmarks<curandStateMRG32k3a_t>(parser, distribution);
+            }
+            else if (engine == "philox")
+            {
+                // CHECK: run_benchmarks<hiprandStatePhilox4_32_10_t>(parser, distribution);
+                run_benchmarks<curandStatePhilox4_32_10_t>(parser, distribution);
+            }
+            else if (engine == "sobol32")
+            {
+                // CHECK: run_benchmarks<hiprandStateSobol32_t>(parser, distribution);
+                run_benchmarks<curandStateSobol32_t>(parser, distribution);
+            }
+            else if (engine == "mtgp32")
+            {
+                // CHECK: run_benchmarks<hiprandStateMtgp32_t>(parser, distribution);
+                run_benchmarks<curandStateMtgp32_t>(parser, distribution);
+            }
+        }
+    }
+
+    return 0;
+}
@@ -0,0 +1,513 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2015 - 2016 Florian Rappl
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+/*
+  This file is part of the C++ CmdParser utility.
+  Copyright (c) 2015 - 2016 Florian Rappl
+*/
+
+#pragma once
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include <sstream>
+#include <functional>
+
+namespace cli {
+    struct CallbackArgs {
+        const std::vector<std::string>& arguments;
+        std::ostream& output;
+        std::ostream& error;
+    };
+    class Parser {
+    private:
+        class CmdBase {
+        public:
+            explicit CmdBase(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant, bool variadic) :
+                name(name),
+                command(name.size() > 0 ? "-" + name : ""),
+                alternative(alternative.size() > 0 ? "--" + alternative : ""),
+                description(description),
+                required(required),
+                handled(false),
+                arguments({}),
+                dominant(dominant),
+                variadic(variadic) {
+            }
+
+            virtual ~CmdBase() {
+            }
+
+            std::string name;
+            std::string command;
+            std::string alternative;
+            std::string description;
+            bool required;
+            bool handled;
+            std::vector<std::string> arguments;
+            bool const dominant;
+            bool const variadic;
+
+            virtual std::string print_value() const = 0;
+            virtual bool parse(std::ostream& output, std::ostream& error) = 0;
+
+            bool is(const std::string& given) const {
+                return given == command || given == alternative;
+            }
+        };
+
+        template<typename T>
+        struct ArgumentCountChecker
+        {
+            static constexpr bool Variadic = false;
+        };
+
+        template<typename T>
+        struct ArgumentCountChecker<std::vector<T>>
+        {
+            static constexpr bool Variadic = true;
+        };
+
+        template<typename T>
+        class CmdFunction final : public CmdBase {
+        public:
+            explicit CmdFunction(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) :
+                CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker<T>::Variadic) {
+            }
+
+            virtual bool parse(std::ostream& output, std::ostream& error) {
+                try {
+                    CallbackArgs args { arguments, output, error };
+                    value = callback(args);
+                    return true;
+                } catch (...) {
+                    return false;
+                }
+            }
+
+            virtual std::string print_value() const {
+                return "";
+            }
+
+            std::function<T(CallbackArgs&)> callback;
+            T value;
+        };
+
+        template<typename T>
+        class CmdArgument final : public CmdBase {
+        public:
+            explicit CmdArgument(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) :
+                CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker<T>::Variadic) {
+            }
+
+            virtual bool parse(std::ostream&, std::ostream&) {
+                try {
+                    value = Parser::parse(arguments, value);
+                    return true;
+                } catch (...) {
+                    return false;
+                }
+            }
+
+            virtual std::string print_value() const {
+                return stringify(value);
+            }
+
+            T value;
+        };
+
+        static int parse(const std::vector<std::string>& elements, const int&) {
+            if (elements.size() != 1)
+                throw std::bad_cast();
+
+            return std::stoi(elements[0]);
+        }
+
+        static bool parse(const std::vector<std::string>& elements, const bool& defval) {
+            if (elements.size() != 0)
+                throw std::runtime_error("A boolean command line parameter cannot have any arguments.");
+
+            return !defval;
+        }
+
+        static double parse(const std::vector<std::string>& elements, const double&) {
+            if (elements.size() != 1)
+                throw std::bad_cast();
+
+            return std::stod(elements[0]);
+        }
+
+        static float parse(const std::vector<std::string>& elements, const float&) {
+            if (elements.size() != 1)
+                throw std::bad_cast();
+
+            return std::stof(elements[0]);
+        }
+
+        static long double parse(const std::vector<std::string>& elements, const long double&) {
+            if (elements.size() != 1)
+                throw std::bad_cast();
+
+            return std::stold(elements[0]);
+        }
+
+        static unsigned int parse(const std::vector<std::string>& elements, const unsigned int&) {
+            if (elements.size() != 1)
+                throw std::bad_cast();
+
+            return static_cast<unsigned int>(std::stoul(elements[0]));
+        }
+
+        static unsigned long parse(const std::vector<std::string>& elements, const unsigned long&) {
+            if (elements.size() != 1)
+                throw std::bad_cast();
+
+            return std::stoul(elements[0]);
+        }
+
+        static unsigned long long parse(const std::vector<std::string>& elements, const unsigned long long&) {
+          if (elements.size() != 1)
+            throw std::bad_cast();
+
+          return std::stoull(elements[0]);
+        }
+
+        static long parse(const std::vector<std::string>& elements, const long&) {
+            if (elements.size() != 1)
+                throw std::bad_cast();
+
+            return std::stol(elements[0]);
+        }
+
+        static std::string parse(const std::vector<std::string>& elements, const std::string&) {
+            if (elements.size() != 1)
+                throw std::bad_cast();
+
+            return elements[0];
+        }
+
+        template<class T>
+        static std::vector<T> parse(const std::vector<std::string>& elements, const std::vector<T>&) {
+            const T defval = T();
+            std::vector<T> values { };
+            std::vector<std::string> buffer(1);
+
+            for (const auto& element : elements) {
+                buffer[0] = element;
+                values.push_back(parse(buffer, defval));
+            }
+
+            return values;
+        }
+
+        template<class T>
+        static std::string stringify(const T& value) {
+            return std::to_string(value);
+        }
+
+        template<class T>
+        static std::string stringify(const std::vector<T>& values) {
+            std::stringstream ss { };
+            ss << "[ ";
+
+            for (const auto& value : values) {
+                ss << stringify(value) << " ";
+            }
+
+            ss << "]";
+            return ss.str();
+        }
+
+        static std::string stringify(const std::string& str) {
+            return str;
+        }
+
+    public:
+        explicit Parser(int argc, const char** argv) :
+            _appname(argv[0]) {
+            for (int i = 1; i < argc; ++i) {
+                _arguments.push_back(argv[i]);
+            }
+            enable_help();
+        }
+
+        explicit Parser(int argc, char** argv) :
+            _appname(argv[0]) {
+            for (int i = 1; i < argc; ++i) {
+                _arguments.push_back(argv[i]);
+            }
+            enable_help();
+        }
+
+        ~Parser() {
+            for (int i = 0, n = _commands.size(); i < n; ++i) {
+                delete _commands[i];
+            }
+        }
+
+        bool has_help() const {
+            for (const auto command : _commands) {
+                if (command->name == "h" && command->alternative == "--help") {
+                    return true;
+                }
+            }
+
+            return false;
+        }
+
+        void enable_help() {
+            set_callback("h", "help", std::function<bool(CallbackArgs&)>([this](CallbackArgs& args){
+                args.output << this->usage();
+                exit(0);
+                return false;
+            }), "", true);
+        }
+
+        void disable_help() {
+            for (auto command = _commands.begin(); command != _commands.end(); ++command) {
+                if ((*command)->name == "h" && (*command)->alternative == "--help") {
+                    _commands.erase(command);
+                    break;
+                }
+            }
+        }
+
+        template<typename T>
+        void set_default(bool is_required, const std::string& description = "") {
+            auto command = new CmdArgument<T> { "", "", description, is_required, false };
+            _commands.push_back(command);
+        }
+
+        template<typename T>
+        void set_required(const std::string& name, const std::string& alternative, const std::string& description = "", bool dominant = false) {
+            auto command = new CmdArgument<T> { name, alternative, description, true, dominant };
+            _commands.push_back(command);
+        }
+
+        template<typename T>
+        void set_optional(const std::string& name, const std::string& alternative, T defaultValue, const std::string& description = "", bool dominant = false) {
+            auto command = new CmdArgument<T> { name, alternative, description, false, dominant };
+            command->value = defaultValue;
+            _commands.push_back(command);
+        }
+
+        template<typename T>
+        void set_callback(const std::string& name, const std::string& alternative, std::function<T(CallbackArgs&)> callback, const std::string& description = "", bool dominant = false) {
+            auto command = new CmdFunction<T> { name, alternative, description, false, dominant };
+            command->callback = callback;
+            _commands.push_back(command);
+        }
+
+        inline void run_and_exit_if_error() {
+            if (run() == false) {
+                exit(1);
+            }
+        }
+
+        inline bool run() {
+            return run(std::cout, std::cerr);
+        }
+
+        inline bool run(std::ostream& output) {
+            return run(output, std::cerr);
+        }
+
+        bool run(std::ostream& output, std::ostream& error) {
+            if (_arguments.size() > 0) {
+                auto current = find_default();
+
+                for (int i = 0, n = _arguments.size(); i < n; ++i) {
+                    auto isarg = _arguments[i].size() > 0 && _arguments[i][0] == '-';
+                    auto associated = isarg ? find(_arguments[i]) : nullptr;
+
+                    if (associated != nullptr) {
+                        current = associated;
+                        associated->handled = true;
+                    } else if (current == nullptr) {
+                        error << no_default();
+                        return false;
+                    } else {
+                        current->arguments.push_back(_arguments[i]);
+                        current->handled = true;
+                        if (!current->variadic)
+                        {
+                            // If the current command is not variadic, then no more arguments
+                            // should be added to it. In this case, switch back to the default
+                            // command.
+                            current = find_default();
+                        }
+                    }
+                }
+            }
+
+            // First, parse dominant arguments since they succeed even if required
+            // arguments are missing.
+            for (auto command : _commands) {
+                if (command->handled && command->dominant && !command->parse(output, error)) {
+                    error << howto_use(command);
+                    return false;
+                }
+            }
+
+            // Next, check for any missing arguments.
+            for (auto command : _commands) {
+                if (command->required && !command->handled) {
+                    error << howto_required(command);
+                    return false;
+                }
+            }
+
+            // Finally, parse all remaining arguments.
+            for (auto command : _commands) {
+                if (command->handled && !command->dominant && !command->parse(output, error)) {
+                    error << howto_use(command);
+                    return false;
+                }
+            }
+
+            return true;
+        }
+
+        template<typename T>
+        T get(const std::string& name) const {
+            for (const auto& command : _commands) {
+                if (command->name == name) {
+                    auto cmd = dynamic_cast<CmdArgument<T>*>(command);
+
+                    if (cmd == nullptr) {
+                        throw std::runtime_error("Invalid usage of the parameter " + name + " detected.");
+                    }
+
+                    return cmd->value;
+                }
+            }
+
+            throw std::runtime_error("The parameter " + name + " could not be found.");
+        }
+
+        template<typename T>
+        T get_if(const std::string& name, std::function<T(T)> callback) const {
+            auto value = get<T>(name);
+            return callback(value);
+        }
+
+        int requirements() const {
+            int count = 0;
+
+            for (const auto& command : _commands) {
+                if (command->required) {
+                    ++count;
+                }
+            }
+
+            return count;
+        }
+
+        int commands() const {
+            return static_cast<int>(_commands.size());
+        }
+
+        inline const std::string& app_name() const {
+            return _appname;
+        }
+
+    protected:
+        CmdBase* find(const std::string& name) {
+            for (auto command : _commands) {
+                if (command->is(name)) {
+                    return command;
+                }
+            }
+
+            return nullptr;
+        }
+
+        CmdBase* find_default() {
+            for (auto command : _commands) {
+                if (command->name == "") {
+                    return command;
+                }
+            }
+
+            return nullptr;
+        }
+
+        std::string usage() const {
+            std::stringstream ss { };
+            ss << "Available parameters:\n\n";
+
+            for (const auto& command : _commands) {
+                ss << "  " << command->command << "\t" << command->alternative;
+
+                if (command->required == true) {
+                    ss << "\t(required)";
+                }
+
+                ss << "\n   " << command->description;
+
+                if (command->required == false) {
+                    ss << "\n   " << "This parameter is optional. The default value is '" + command->print_value() << "'.";
+                }
+
+                ss << "\n\n";
+            }
+
+            return ss.str();
+        }
+
+        void print_help(std::stringstream& ss) const {
+            if (has_help()) {
+                ss << "For more help use --help or -h.\n";
+            }
+        }
+
+        std::string howto_required(CmdBase* command) const {
+            std::stringstream ss { };
+            ss << "The parameter " << command->name << " is required.\n";
+            ss << command->description << '\n';
+            print_help(ss);
+            return ss.str();
+        }
+
+        std::string howto_use(CmdBase* command) const {
+            std::stringstream ss { };
+            ss << "The parameter " << command->name << " has invalid arguments.\n";
+            ss << command->description << '\n';
+            print_help(ss);
+            return ss.str();
+        }
+
+        std::string no_default() const {
+            std::stringstream ss { };
+            ss << "No default parameter has been specified.\n";
+            ss << "The given argument must be used with a parameter.\n";
+            print_help(ss);
+            return ss.str();
+        }
+
+    private:
+        const std::string _appname;
+        std::vector<std::string> _arguments;
+        std::vector<CmdBase*> _commands;
+    };
+}
@@ -0,0 +1,417 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// Taken from: http://docs.nvidia.com/cuda/curand/device-api-overview.html#poisson-api-example
+/*
+ * This program uses CURAND library for Poisson distribution
+ * to simulate queues in store for 16 hours. It shows the
+ * difference of using 3 different APIs:
+ * - HOST API -arrival of customers is described by Poisson(4)
+ * - SIMPLE DEVICE API -arrival of customers is described by
+ *     Poisson(4*(sin(x/100)+1)), where x is number of minutes
+ *     from store opening time.
+ * - ROBUST DEVICE API -arrival of customers is described by:
+ *     - Poisson(2) for first 3 hours.
+ *     - Poisson(1) for second 3 hours.
+ *     - Poisson(3) after 6 hours.  
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+// CHECK: #include <hip/hip_runtime.h>
+#include <cuda.h>
+// CHECK: #include <hiprand_kernel.h>
+#include <curand_kernel.h>
+// CHECK: #include <hiprand.h>
+#include <curand.h>
+
+// CHECK: #define CUDA_CALL(x) do { if((x) != hipSuccess) {
+#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
+    printf("Error at %s:%d\n",__FILE__,__LINE__); \
+    return EXIT_FAILURE;}} while(0)
+// CHECK: #define CURAND_CALL(x) do { if((x)!=HIPRAND_STATUS_SUCCESS) {
+#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
+    printf("Error at %s:%d\n",__FILE__,__LINE__);\
+    return EXIT_FAILURE;}} while(0)
+
+
+#define HOURS 16
+#define OPENING_HOUR 7
+#define CLOSING_HOUR (OPENING_HOUR + HOURS)
+
+#define access_2D(type, ptr, row, column, pitch)\
+    *((type*)((char*)ptr + (row) * pitch) + column)
+
+enum API_TYPE {
+    HOST_API = 0,
+    SIMPLE_DEVICE_API = 1,
+    ROBUST_DEVICE_API = 2,
+};
+
+/* global variables */
+API_TYPE api;
+int report_break;
+int cashiers_load_h[HOURS];
+__constant__ int cashiers_load[HOURS];
+// CHECK: __global__ void setup_kernel(hiprandState_t *state)
+__global__ void setup_kernel(curandState *state)
+{
+    int id = threadIdx.x + blockIdx.x * blockDim.x;
+    /* Each thread gets same seed, a different sequence 
+       number, no offset */
+    // CHECK: hiprand_init(1234, id, 0, &state[id]);
+    curand_init(1234, id, 0, &state[id]);
+}
+
+__inline__ __device__
+void update_queue(int id, int min, unsigned int new_customers,
+                  unsigned int &queue_length,
+                  unsigned int *queue_lengths, size_t pitch)
+{
+    int balance;
+    balance = new_customers - 2 * cashiers_load[(min-1)/60];
+    if (balance + (int)queue_length <= 0){
+        queue_length = 0;
+    }else{
+        queue_length += balance;
+    }
+    /* Store results */
+    access_2D(unsigned int, queue_lengths, min-1, id, pitch)
+        = queue_length;
+}
+
+// CHECK: __global__ void simple_device_API_kernel(hiprandState_t *state,
+__global__ void simple_device_API_kernel(curandState *state, 
+                    unsigned int *queue_lengths, size_t pitch)
+{
+    int id = threadIdx.x + blockIdx.x * blockDim.x;
+    unsigned int new_customers;
+    unsigned int queue_length = 0;
+    /* Copy state to local memory for efficiency */
+    // CHECK: hiprandState_t localState = state[id];
+    curandState localState = state[id];
+    /* Simulate queue in time */
+    for(int min = 1; min <= 60 * HOURS; min++) {
+        /* Draw number of new customers depending on API */
+        // CHECK: new_customers = hiprand_poisson(&localState,
+        new_customers = curand_poisson(&localState,
+                                4*(sin((float)min/100.0)+1));
+        /* Update queue */
+        update_queue(id, min, new_customers, queue_length,
+                     queue_lengths, pitch);       
+    }
+    /* Copy state back to global memory */
+    state[id] = localState;
+}
+
+
+__global__ void host_API_kernel(unsigned int *poisson_numbers,
+                    unsigned int *queue_lengths, size_t pitch)
+{
+    int id = threadIdx.x + blockIdx.x * blockDim.x;
+    unsigned int new_customers;
+    unsigned int queue_length = 0;
+    /* Simulate queue in time */
+    for(int min = 1; min <= 60 * HOURS; min++) {
+        /* Get random number from global memory */
+        new_customers = poisson_numbers
+                    [blockDim.x * gridDim.x * (min -1) + id];
+        /* Update queue */
+        update_queue(id, min, new_customers, queue_length,
+                     queue_lengths, pitch);
+    }
+}
+// CHECK: __global__ void robust_device_API_kernel(hiprandState_t *state,
+// CHECK: hiprandDiscreteDistribution_t poisson_1,
+// CHECK: hiprandDiscreteDistribution_t poisson_2,
+// CHECK: hiprandDiscreteDistribution_t poisson_3,
+__global__ void robust_device_API_kernel(curandState *state,
+                   curandDiscreteDistribution_t poisson_1,
+                   curandDiscreteDistribution_t poisson_2,
+                   curandDiscreteDistribution_t poisson_3,
+                   unsigned int *queue_lengths, size_t pitch)
+{
+    int id = threadIdx.x + blockIdx.x * 64;
+    unsigned int new_customers;
+    unsigned int queue_length = 0;
+    /* Copy state to local memory for efficiency */
+    // CHECK: hiprandState_t localState = state[id];
+    curandState localState = state[id];
+    /* Simulate queue in time */
+    /* first 3 hours */
+    for(int min = 1; min <= 60 * 3; min++) {
+        /* draw number of new customers depending on API */
+        new_customers =
+                    // CHECK: hiprand_discrete(&localState, poisson_2);
+                    curand_discrete(&localState, poisson_2);
+        /* Update queue */
+        update_queue(id, min, new_customers, queue_length,
+                                    queue_lengths, pitch);
+    }
+    /* second 3 hours */
+    for(int min = 60 * 3 + 1; min <= 60 * 6; min++) {
+        /* draw number of new customers depending on API */
+        new_customers =
+                    // CHECK: hiprand_discrete(&localState, poisson_1);
+                    curand_discrete(&localState, poisson_1);
+        /* Update queue */
+        update_queue(id, min, new_customers, queue_length,
+                                    queue_lengths, pitch);       
+    }
+    /* after 6 hours */
+    for(int min = 60 * 6 + 1; min <= 60 * HOURS; min++) {
+        /* draw number of new customers depending on API */
+        new_customers =
+                    // CHECK: hiprand_discrete(&localState, poisson_3);
+                    curand_discrete(&localState, poisson_3);
+        /* Update queue */
+        update_queue(id, min, new_customers, queue_length,
+                                    queue_lengths, pitch);       
+    }
+    /* Copy state back to global memory */
+    state[id] = localState;
+}
+
+/* Set time intervals between reports */
+void report_settings()
+{
+    do{
+        printf("Set time intervals between queue reports");
+        printf("(in minutes > 0)\n");
+        if (scanf("%d", &report_break) == 0) continue;
+    }while(report_break <= 0);
+}
+
+
+/* Set number of cashiers each hour */
+void add_cachiers(int *cashiers_load)
+{
+    int i, min, max, begin, end;
+    printf("Cashier serves 2 customers per minute...\n");
+    for (i = 0; i < HOURS; i++){
+        cashiers_load_h[i] = 0;
+    }
+    while (true){
+        printf("Adding cashier...\n");
+        min = OPENING_HOUR;
+        max = CLOSING_HOUR-1;
+        do{
+            printf("Set hour that cahier comes (%d-%d)",
+                                                min, max);
+            printf(" [type 0 to finish adding cashiers]\n");
+            if (scanf("%d", &begin) == 0) continue;
+        }while (begin > max || (begin < min && begin != 0));
+        if (begin == 0) break;
+        min = begin+1;
+        max = CLOSING_HOUR;
+        do{
+            printf("Set hour that cahier leaves (%d-%d)",
+                                                min, max);
+            printf(" [type 0 to finish adding cashiers]\n");
+            if (scanf("%d", &end) == 0) continue;
+        }while (end > max || (end < min && end != 0));
+        if (end == 0) break;
+        for (i = begin - OPENING_HOUR;
+             i < end   - OPENING_HOUR; i++){
+            cashiers_load_h[i]++;
+        }
+    }
+    for (i = OPENING_HOUR; i < CLOSING_HOUR; i++){
+        printf("\n%2d:00 - %2d:00     %d cashier",
+                i, i+1, cashiers_load_h[i-OPENING_HOUR]);
+        if (cashiers_load[i-OPENING_HOUR] != 1) printf("s");
+    }
+    printf("\n");
+}
+
+/* Set API type */
+API_TYPE set_API_type()
+{
+    printf("Choose API type:\n");
+    int choose;
+    do{
+        printf("type 1 for HOST API\n");
+        printf("type 2 for SIMPLE DEVICE API\n");
+        printf("type 3 for ROBUST DEVICE API\n");
+        if (scanf("%d", &choose) == 0) continue;
+    }while( choose < 1 || choose > 3);
+    switch(choose){
+        case 1: return HOST_API;
+        case 2: return SIMPLE_DEVICE_API;
+        case 3: return ROBUST_DEVICE_API;
+        default:
+            fprintf(stderr, "wrong API\n");
+            return HOST_API;
+    }
+}
+
+void settings()
+{
+    add_cachiers(cashiers_load);
+    // CHECK: hipMemcpyToSymbol("cashiers_load", cashiers_load_h,
+    // CHECK: HOURS * sizeof(int), 0, hipMemcpyHostToDevice);
+    cudaMemcpyToSymbol("cashiers_load", cashiers_load_h,
+            HOURS * sizeof(int), 0, cudaMemcpyHostToDevice);
+    report_settings();
+    api = set_API_type();
+}
+
+void print_statistics(unsigned int *hostResults, size_t pitch)
+{
+    int min, i, hour, minute;
+    unsigned int sum;
+    for(min = report_break; min <= 60 * HOURS;
+                            min += report_break) {
+        sum = 0;
+        for(i = 0; i < 64 * 64; i++) {
+            sum += access_2D(unsigned int, hostResults,
+                                        min-1, i, pitch);
+        }
+        hour = OPENING_HOUR + min/60;
+        minute = min%60;
+        printf("%2d:%02d   # of waiting customers = %10.4g |",
+                    hour, minute, (float)sum/(64.0 * 64.0));
+        printf("  # of cashiers = %d  |  ",
+                    cashiers_load_h[(min-1)/60]);
+        printf("# of new customers/min ~= ");
+        switch (api){
+            case HOST_API:
+                printf("%2.2f\n", 4.0);
+                break;
+            case SIMPLE_DEVICE_API:
+                printf("%2.2f\n",
+                            4*(sin((float)min/100.0)+1));
+                break;
+            case ROBUST_DEVICE_API:
+                if (min <= 3 * 60){
+                    printf("%2.2f\n", 2.0);
+                }else{
+                    if (min <= 6 * 60){
+                        printf("%2.2f\n", 1.0);
+                    }else{
+                        printf("%2.2f\n", 3.0);
+                    }
+                }
+                break;
+            default:
+                fprintf(stderr, "Wrong API\n");
+        }
+    }
+}
+
+
+int main(int argc, char *argv[])
+{
+    int n;
+    size_t pitch;
+    // CHECK: hiprandState_t *devStates;
+    curandState *devStates;
+    unsigned int *devResults, *hostResults;
+    unsigned int *poisson_numbers_d;
+    // CHECK: hiprandDiscreteDistribution_t poisson_1, poisson_2;
+    // CHECK: hiprandDiscreteDistribution_t poisson_3;
+    // CHECK: hiprandGenerator_t gen;
+    curandDiscreteDistribution_t poisson_1, poisson_2;
+    curandDiscreteDistribution_t poisson_3;
+    curandGenerator_t gen;
+
+    /* Setting cashiers, report and API */
+    settings();
+
+    /* Allocate space for results on device */
+    // CHECK: CUDA_CALL(hipMallocPitch((void **)&devResults, &pitch,
+    CUDA_CALL(cudaMallocPitch((void **)&devResults, &pitch,
+                64 * 64 * sizeof(unsigned int), 60 * HOURS));
+
+    /* Allocate space for results on host */
+    hostResults = (unsigned int *)calloc(pitch * 60 * HOURS,
+                sizeof(unsigned int));
+
+    /* Allocate space for prng states on device */
+    // CHECK: CUDA_CALL(hipMalloc((void **)&devStates, 64 * 64 *
+    // CHECK: sizeof(hiprandState_t)));
+    CUDA_CALL(cudaMalloc((void **)&devStates, 64 * 64 * 
+              sizeof(curandState)));
+
+    /* Setup prng states */
+    if (api != HOST_API){
+        // CHECK: hipLaunchKernelGGL(setup_kernel, dim3(64), dim3(64), 0, 0, devStates);
+        setup_kernel<<<64, 64>>>(devStates);
+    }
+    /* Simulate queue  */
+    switch (api){
+        case HOST_API:
+            /* Create pseudo-random number generator */
+            // CHECK: CURAND_CALL(hiprandCreateGenerator(&gen,
+                                // CHECK: HIPRAND_RNG_PSEUDO_DEFAULT));
+            CURAND_CALL(curandCreateGenerator(&gen,
+                                CURAND_RNG_PSEUDO_DEFAULT));
+            /* Set seed */
+            // CHECK: CURAND_CALL(hiprandSetPseudoRandomGeneratorSeed(
+            CURAND_CALL(curandSetPseudoRandomGeneratorSeed(
+                                            gen, 1234ULL));
+            /* compute n */
+            n = 64 * 64 * HOURS * 60;
+            /* Allocate n unsigned ints on device */
+            // CHECK: CUDA_CALL(hipMalloc((void **)&poisson_numbers_d,
+            CUDA_CALL(cudaMalloc((void **)&poisson_numbers_d,
+                                n * sizeof(unsigned int)));
+            /* Generate n unsigned ints on device */
+            // CHECK: CURAND_CALL(hiprandGeneratePoisson(gen,
+            CURAND_CALL(curandGeneratePoisson(gen,
+                                poisson_numbers_d, n, 4.0));
+            // CHECK: hipLaunchKernelGGL(host_API_kernel, dim3(64), dim3(64), 0, 0, poisson_numbers_d,
+            host_API_kernel<<<64, 64>>>(poisson_numbers_d,
+                                        devResults, pitch);
+            /* Cleanup */
+            // CHECK: CURAND_CALL(hiprandDestroyGenerator(gen));
+            CURAND_CALL(curandDestroyGenerator(gen));
+            break;
+        case SIMPLE_DEVICE_API:
+            // CHECK: hipLaunchKernelGGL(simple_device_API_kernel, dim3(64), dim3(64), 0, 0, devStates,
+            simple_device_API_kernel<<<64, 64>>>(devStates,
+                                        devResults, pitch);
+            break;
+        case ROBUST_DEVICE_API:
+            /* Create histograms for Poisson(1) */
+            // CHECK: CURAND_CALL(hiprandCreatePoissonDistribution(1.0,
+            CURAND_CALL(curandCreatePoissonDistribution(1.0,
+                                                &poisson_1));
+            /* Create histograms for Poisson(2) */
+            // CHECK: CURAND_CALL(hiprandCreatePoissonDistribution(2.0,
+            CURAND_CALL(curandCreatePoissonDistribution(2.0,
+                                                &poisson_2));
+            /* Create histograms for Poisson(3) */
+            // CHECK: CURAND_CALL(hiprandCreatePoissonDistribution(3.0,
+            CURAND_CALL(curandCreatePoissonDistribution(3.0,
+                                                &poisson_3));
+            // CHECK: hipLaunchKernelGGL(robust_device_API_kernel, dim3(64), dim3(64), 0, 0, devStates,
+            robust_device_API_kernel<<<64, 64>>>(devStates,
+                            poisson_1, poisson_2, poisson_3,
+                            devResults, pitch);
+            /* Cleanup */
+            // CHECK: CURAND_CALL(hiprandDestroyDistribution(poisson_1));
+            // CHECK: CURAND_CALL(hiprandDestroyDistribution(poisson_2));
+            // CHECK: CURAND_CALL(hiprandDestroyDistribution(poisson_3));
+            CURAND_CALL(curandDestroyDistribution(poisson_1));
+            CURAND_CALL(curandDestroyDistribution(poisson_2));
+            CURAND_CALL(curandDestroyDistribution(poisson_3));
+            break;
+        default:
+            fprintf(stderr, "Wrong API\n");
+    }
+    /* Copy device memory to host */
+    // CHECK: CUDA_CALL(hipMemcpy2D(hostResults, pitch, devResults,
+    // CHECK: 60 * HOURS, hipMemcpyDeviceToHost));
+    CUDA_CALL(cudaMemcpy2D(hostResults, pitch, devResults,
+                pitch, 64 * 64 * sizeof(unsigned int),
+                60 * HOURS, cudaMemcpyDeviceToHost));
+    /* Show result */
+    print_statistics(hostResults, pitch);
+    /* Cleanup */
+    // CHECK: CUDA_CALL(hipFree(devStates));
+    // CHECK: CUDA_CALL(hipFree(devResults));
+    CUDA_CALL(cudaFree(devStates));
+    CUDA_CALL(cudaFree(devResults));
+    free(hostResults);
+    return EXIT_SUCCESS;
+}
@@ -22,7 +22,12 @@ THE SOFTWARE.
 #include<cuda.h>
 #include<cuda_runtime.h>
 #include<iostream>
-#include<unistd.h>
+#ifdef _WIN32
+#include <windows.h>
+#define sleep(x) Sleep(x)
+#else
+#include <unistd.h>
+#endif
 #include<stdio.h>
 #include<malloc.h>

@@ -33,7 +38,7 @@ THE SOFTWARE.
 // CHECK: if(status != hipSuccess) {
 #define check(msg, status){ \
 if(status != cudaSuccess) { \
-	printf("%s failed. \n", #msg); \
+  printf("%s failed. \n", #msg); \
 } \
 }

@@ -1,6 +1,8 @@
 // RUN: %run_test hipify "%s" "%t" %cuda_args

 // CHECK: #include <hip/hip_runtime.h>
+// CHECK-NOT: #include <cuda_runtime.h>
+// CHECK: #include <stdio.h>
 #include <cuda.h>
-// CHECK-NOT: #include<cuda_runtime.h>
 #include <cuda_runtime.h>
+#include <stdio.h>
@@ -1,6 +1,8 @@
 // RUN: %run_test hipify "%s" "%t" %cuda_args

-// CHECK: #include <hip/hip_runtime.h>
-#include <cuda_runtime.h>
-// CHECK-NOT: #include<cuda.h>
-#include <cuda.h>
+// CHECK: #include "hip/hip_runtime.h"
+// CHECK-NOT: #include "cuda_runtime.h"
+// CHECK: #include <stdio.h>
+#include "cuda.h"
+#include "cuda_runtime.h"
+#include <stdio.h>
@@ -0,0 +1,8 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// CHECK: #include <hipblas.h>
+// CHECK-NOT: #include <cublas_v2.h>
+// CHECK: #include <stdio.h>
+#include <cublas.h>
+#include <cublas_v2.h>
+#include <stdio.h>
@@ -0,0 +1,8 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// CHECK: #include "hipblas.h"
+// CHECK-NOT: #include "cublas.h"
+// CHECK: #include <stdio.h>
+#include "cublas_v2.h"
+#include "cublas.h"
+#include <stdio.h>
@@ -0,0 +1,14 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// CHECK: #include <hip/hip_runtime.h>
+// CHECK-NOT: #include <cuda_runtime.h>
+// CHECK: #include <iostream>
+// CHECK: #include "hipblas.h"
+// CHECK-NOT: #include "cublas.h"
+// CHECK: #include <stdio.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <iostream>
+#include "cublas_v2.h"
+#include "cublas.h"
+#include <stdio.h>
@@ -0,0 +1,93 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// CHECK: #include <hip/hip_runtime.h>
+// CHECK: #include <memory>
+
+// CHECK-NOT: #include <cuda_runtime.h>
+// CHECK-NOT: #include <hip/hip_runtime.h>
+
+// CHECK: #include "hip/hip_runtime_api.h"
+// CHECK: #include "hip/channel_descriptor.h"
+// CHECK: #include "hip/device_functions.h"
+// CHECK: #include "hip/driver_types.h"
+// CHECK: #include "hip/hip_complex.h"
+// CHECK: #include "hip/hip_fp16.h"
+// CHECK: #include "hip/hip_texture_types.h"
+// CHECK: #include "hip/hip_vector_types.h"
+
+// CHECK: #include <iostream>
+
+// CHECK: #include "hipblas.h"
+// CHECK-NOT: #include "cublas.h"
+
+// CHECK: #include <stdio.h>
+
+// CHECK: #include "hiprand.h"
+// CHECK: #include "hiprand_kernel.h"
+
+// CHECK: #include <algorithm>
+
+// CHECK-NOT: #include "hiprand.h"
+// CHECK-NOT: #include "hiprand_kernel.h"
+// CHECK-NOT: #include "curand_discrete.h"
+// CHECK-NOT: #include "curand_discrete2.h"
+// CHECK-NOT: #include "curand_globals.h"
+// CHECK-NOT: #include "curand_lognormal.h"
+// CHECK-NOT: #include "curand_mrg32k3a.h"
+// CHECK-NOT: #include "curand_mtgp32.h"
+// CHECK-NOT: #include "curand_mtgp32_host.h"
+// CHECK-NOT: #include "curand_mtgp32_kernel.h"
+// CHECK-NOT: #include "curand_mtgp32dc_p_11213.h"
+// CHECK-NOT: #include "curand_normal.h"
+// CHECK-NOT: #include "curand_normal_static.h"
+// CHECK-NOT: #include "curand_philox4x32_x.h"
+// CHECK-NOT: #include "curand_poisson.h"
+// CHECK-NOT: #include "curand_precalc.h"
+// CHECK-NOT: #include "curand_uniform.h"
+
+// CHECK: #include <string>
+
+#include <cuda.h>
+
+#include <memory>
+
+#include <cuda_runtime.h>
+
+#include "cuda_runtime_api.h"
+#include "channel_descriptor.h"
+#include "device_functions.h"
+#include "driver_types.h"
+#include "cuComplex.h"
+#include "cuda_fp16.h"
+#include "cuda_texture_types.h"
+#include "vector_types.h"
+
+#include <iostream>
+
+#include "cublas_v2.h"
+#include "cublas.h"
+
+#include <stdio.h>
+
+#include "curand.h"
+#include "curand_kernel.h"
+
+#include <algorithm>
+
+#include "curand_discrete.h"
+#include "curand_discrete2.h"
+#include "curand_globals.h"
+#include "curand_lognormal.h"
+#include "curand_mrg32k3a.h"
+#include "curand_mtgp32.h"
+#include "curand_mtgp32_host.h"
+#include "curand_mtgp32_kernel.h"
+#include "curand_mtgp32dc_p_11213.h"
+#include "curand_normal.h"
+#include "curand_normal_static.h"
+#include "curand_philox4x32_x.h"
+#include "curand_poisson.h"
+#include "curand_precalc.h"
+#include "curand_uniform.h"
+
+#include <string>
@@ -0,0 +1,174 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+// CHECK: #include <hip/hip_runtime.h>
+#include <cuda.h>
+
+#define K_THREADS 64
+#define K_INDEX() ((gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x + threadIdx.x)
+#define RND() ((rand() & 0x7FFF) / float(0x8000))
+#define ERRORCHECK() cErrorCheck(__FILE__, __LINE__)
+
+// CHECK: hipEvent_t t##_start, t##_end;            \
+// CHECK: hipEventCreate(&t##_start);               \
+// CHECK: hipEventCreate(&t##_end);
+#define TIMER_CREATE(t)                      \
+  cudaEvent_t t##_start, t##_end;            \
+  cudaEventCreate(&t##_start);               \
+  cudaEventCreate(&t##_end);
+
+// CHECK: hipEventRecord(t##_start);                   \
+// CHECK: hipEventSynchronize(t##_start);
+#define TIMER_START(t)                          \
+  cudaEventRecord(t##_start);                   \
+  cudaEventSynchronize(t##_start);              \
+
+// CHECK: hipEventRecord(t##_start);                                 \
+// CHECK: hipEventSynchronize(t##_start);                            \
+// CHECK: hipEventRecord(t##_end);                                   \
+// CHECK: hipEventSynchronize(t##_end);                              \
+// CHECK: hipEventElapsedTime(&t, t##_start, t##_end);
+#define TIMER_END(t)                                          \
+  cudaEventRecord(t##_start);                                 \
+  cudaEventSynchronize(t##_start);                            \
+  cudaEventRecord(t##_end);                                   \
+  cudaEventSynchronize(t##_end);                              \
+  cudaEventElapsedTime(&t, t##_start, t##_end);               
+
+
+inline void cErrorCheck(const char *file, int line) {
+// CHECK: hipDeviceSynchronize();
+// CHECK: hipError_t err = hipGetLastError();
+// CHECK: if (err != hipSuccess) {
+// CHECK: printf("Error: %s\n", hipGetErrorString(err));
+  cudaThreadSynchronize();
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("Error: %s\n", cudaGetErrorString(err));
+    printf(" @ %s: %d\n", file, line);
+    exit(-1);
+  }
+}
+
+inline dim3 K_GRID(int n, int threads = K_THREADS) {
+  int blocks = (int)ceilf(sqrtf((float)n/threads));
+  dim3 grid(blocks, blocks);
+  return grid;
+}
+
+typedef struct data  {
+  int n;
+  float4 *r, *v, *f;
+} data;
+
+data cpu, gpu;
+
+#define N 20
+
+__global__ void repulsion(data gpu);
+__global__ void integration(data gpu);
+
+
+int main() {
+  printf("Cuda Test 1\n");
+
+  int count = 0;
+  // CHECK: hipGetDeviceCount(&count);
+  cudaGetDeviceCount(&count);
+  printf(" %d CUDA devices found\n", count);
+  if(!count) {
+    ::exit(EXIT_FAILURE);
+  }
+  // CHECK: hipFree(0);
+  cudaFree(0);
+
+  cpu.n = N;
+
+  cpu.r = (float4*)malloc(N * sizeof(float4));
+  cpu.v = (float4*)malloc(N * sizeof(float4));
+  cpu.f = (float4*)malloc(N * sizeof(float4));
+
+  for(int i = 0; i < N; ++i) {
+    cpu.v[i] = make_float4(0,0,0,0);
+    cpu.r[i] = make_float4(RND(), RND(), RND(), 0);
+    cpu.f[i] = make_float4(0,0.01,0,0);
+  }
+
+  gpu = cpu;
+  // CHECK: hipMalloc(&gpu.r, N * sizeof(float4));
+  // CHECK: hipMalloc(&gpu.v, N * sizeof(float4));
+  // CHECK: hipMalloc(&gpu.f, N * sizeof(float4));
+  cudaMalloc(&gpu.r, N * sizeof(float4));
+  cudaMalloc(&gpu.v, N * sizeof(float4));
+  cudaMalloc(&gpu.f, N * sizeof(float4));
+  // CHECK: hipMemcpy(gpu.r, cpu.r, cpu.n * sizeof(float4), hipMemcpyHostToDevice);
+  // CHECK: hipMemcpy(gpu.v, cpu.v, cpu.n * sizeof(float4), hipMemcpyHostToDevice);
+  // CHECK: hipMemcpy(gpu.f, cpu.f, cpu.n * sizeof(float4), hipMemcpyHostToDevice);
+  cudaMemcpy(gpu.r, cpu.r, cpu.n * sizeof(float4), cudaMemcpyHostToDevice);
+  cudaMemcpy(gpu.v, cpu.v, cpu.n * sizeof(float4), cudaMemcpyHostToDevice);
+  cudaMemcpy(gpu.f, cpu.f, cpu.n * sizeof(float4), cudaMemcpyHostToDevice);
+
+  ERRORCHECK();
+  float rep;
+  TIMER_CREATE(rep);
+  TIMER_START(rep);
+  // CHECK: hipLaunchKernelGGL(integration, dim3(K_GRID(cpu.n)), dim3(K_THREADS), 0, 0, gpu);
+  integration <<< K_GRID(cpu.n), K_THREADS >>>(gpu);
+
+  TIMER_END(rep);
+  printf("Took: %f ms\n", rep);
+  ERRORCHECK();
+  // CHECK: hipMemcpy(cpu.r, gpu.r, cpu.n * sizeof(float4), hipMemcpyDeviceToHost);
+  // CHECK: hipMemcpy(cpu.v, gpu.v, cpu.n * sizeof(float4), hipMemcpyDeviceToHost);
+  // CHECK: hipMemcpy(cpu.f, gpu.f, cpu.n * sizeof(float4), hipMemcpyDeviceToHost);
+  cudaMemcpy(cpu.r, gpu.r, cpu.n * sizeof(float4), cudaMemcpyDeviceToHost);
+  cudaMemcpy(cpu.v, gpu.v, cpu.n * sizeof(float4), cudaMemcpyDeviceToHost);
+  cudaMemcpy(cpu.f, gpu.f, cpu.n * sizeof(float4), cudaMemcpyDeviceToHost);
+  // CHECK: hipHostFree(cpu.r);
+  // CHECK: hipHostFree(cpu.v);
+  // CHECK: hipHostFree(cpu.f);
+  cudaFreeHost(cpu.r);
+  cudaFreeHost(cpu.v);
+  cudaFreeHost(cpu.f);
+  // CHECK: hipFree(gpu.r);
+  // CHECK: hipFree(gpu.v);
+  // CHECK: hipFree(gpu.f);
+  cudaFree(gpu.r);
+  cudaFree(gpu.v);
+  cudaFree(gpu.f);
+  // CHECK: hipDeviceReset();
+  cudaDeviceReset();
+
+  printf("Results: \n");
+  for(int i = 0; i < N; ++i) {
+    printf("%f, %f, %f \n", cpu.r[i].x, cpu.r[i].y, cpu.r[i].z);
+  }
+
+  printf("Ready...\n");
+  return 0;
+}
+
+__global__ void repulsion(data gpu) {
+  int idx = K_INDEX();
+  if(idx < N) {
+    gpu.r[idx].x = 1;
+    gpu.r[idx].y = 1;
+    gpu.r[idx].z = 1;
+  }
+}
+
+#define MULT4(v, s) v.x *= s; v.y *= s; v.z *= s; v.w *= s;
+#define ADD4(v1, v2) v1.x += v2.x; v1.y += v2.y; v1.z += v2.z; v1.w += v2.w;
+
+__global__ void integration(data gpu) {
+  int i = K_INDEX();
+  if(i < N) {
+    MULT4(gpu.f[i], 0.01);
+    MULT4(gpu.v[i], 0.01);
+    ADD4(gpu.v[i], gpu.f[i]);
+    ADD4(gpu.r[i], gpu.v[i]);
+    gpu.f[i] = make_float4(0,0,0,0);
+  }
+}
@@ -21,6 +21,8 @@ config.test_format = lit.formats.ShTest()
 # test_source_root: The root path where tests are located.
 config.test_source_root = os.path.dirname(__file__)

+config.excludes = ['cmdparser.hpp']
+
 # test_exec_root: The path where tests are located (default is the test suite root).
 #config.test_exec_root = config.test_source_root

@@ -14,5 +14,6 @@ set clang_args=%4%clang_args%

 %HIPIFY% -o=%TMP_FILE% %IN_FILE% -- %clang_args%
 if errorlevel 1 (echo      Error: hipify-clang.exe failed with exit code: %errorlevel% && exit /b %errorlevel%)
-%FILE_CHECK% %IN_FILE% -input-file=%TMP_FILE%
+
+findstr /v /r /c:"[ ]*//[ ]*[CHECK*|RUN]" %TMP_FILE% | %FILE_CHECK% %IN_FILE%
 if errorlevel 1 (echo      Error: FileCheck.exe failed with exit code: %errorlevel% && exit /b %errorlevel%)
@@ -0,0 +1,90 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// Kernel definition
+__global__ void  vecAdd(float* A, float* B, float* C)
+{
+  int i = threadIdx.x;
+  A[i] = 0;
+  B[i] = i;
+  C[i] = A[i] + B[i];
+}
+// CHECK: #include <hip/hip_runtime.h>
+#include <stdio.h>
+#define SIZE 10
+#define KERNELINVOKES  5000000
+int vecadd(int  gpudevice, int rank)
+{
+  int devcheck(int, int);
+  devcheck(gpudevice, rank);
+  float A[SIZE], B[SIZE], C[SIZE];
+  // Kernel invocation
+  float *devPtrA;
+  float *devPtrB;
+  float *devPtrC;
+  int memsize = SIZE * sizeof(float);
+  // CHECK: hipMalloc((void**)&devPtrA, memsize);
+  // CHECK: hipMalloc((void**)&devPtrB, memsize);
+  // CHECK: hipMalloc((void**)&devPtrC, memsize);
+  cudaMalloc((void**)&devPtrA, memsize);
+  cudaMalloc((void**)&devPtrB, memsize);
+  cudaMalloc((void**)&devPtrC, memsize);
+  // CHECK: hipMemcpy(devPtrA, A, memsize, hipMemcpyHostToDevice);
+  // CHECK: hipMemcpy(devPtrB, B, memsize, hipMemcpyHostToDevice);
+  cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice);
+  cudaMemcpy(devPtrB, B, memsize, cudaMemcpyHostToDevice);
+  for (int i = 0; i<KERNELINVOKES; i++)
+  {
+    // CHECK: hipLaunchKernelGGL(vecAdd, dim3(1), dim3(gpudevice), 0, 0, devPtrA, devPtrB, devPtrC);
+    vecAdd <<< 1, gpudevice >>>(devPtrA, devPtrB, devPtrC);
+  }
+  // CHECK: hipMemcpy(C, devPtrC, memsize, hipMemcpyDeviceToHost);
+  cudaMemcpy(C, devPtrC, memsize, cudaMemcpyDeviceToHost);
+  // calculate only up to gpudevice to show the unique output
+  // of each rank's kernel launch
+  for (int i = 0; i<gpudevice; i++)
+    printf("rank %d: C[%d]=%f\n", rank, i, C[i]);
+  // CHECK: hipFree(devPtrA);
+  // CHECK: hipFree(devPtrA);
+  // CHECK: hipFree(devPtrA);
+  cudaFree(devPtrA);
+  cudaFree(devPtrA);
+  cudaFree(devPtrA);
+}
+int devcheck(int  gpudevice, int rank)
+{
+  int device_count = 0;
+  int device;   // used with cudaGetDevice() to verify cudaSetDevice()
+  // CHECK: hipGetDeviceCount(&device_count);
+  cudaGetDeviceCount(&device_count);
+  if (gpudevice >= device_count)
+  {
+    printf("gpudevice >=  device_count ... exiting\n");
+    exit(1);
+  }
+  // CHECK: hipError_t cudareturn;
+  // CHECK: hipDeviceProp_t deviceProp;
+  // CHECK: hipGetDeviceProperties(&deviceProp, gpudevice);
+  cudaError_t cudareturn;
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, gpudevice);
+  // CHECK: if (deviceProp.hipWarpSize <= 1)
+  if (deviceProp.warpSize <= 1)
+  {
+    printf("rank %d: warning, CUDA Device Emulation (CPU) detected, exiting\n", rank);
+    exit(1);
+  }
+  // CHECK: cudareturn = hipSetDevice(gpudevice);
+  cudareturn = cudaSetDevice(gpudevice);
+  // CHECK: if (cudareturn == hipErrorInvalidDevice)
+  if (cudareturn == cudaErrorInvalidDevice)
+  {
+    // CHECK: perror("hipSetDevice returned hipErrorInvalidDevice");
+    perror("cudaSetDevice returned cudaErrorInvalidDevice");
+  }
+  else
+  {
+    // CHECK: hipGetDevice(&device);
+    cudaGetDevice(&device);
+    printf("rank %d: cudaGetDevice()=%d\n", rank, device);
+  }
+}
@@ -18,13 +18,14 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
 * RUN: %t
 * HIT_END
 */

-// Test under-development. Call hipStreamAddCallback function and see if it works as expected.
-
+#include <stdio.h>
+#include <thread>
+#include <chrono>
 #include "hip/hip_runtime.h"
 #include "test_common.h"

@@ -32,32 +33,61 @@ THE SOFTWARE.
 #define HIPRT_CB
 #endif

-class CallbackClass
+__global__ void vector_square(float *C_d, float *A_d, size_t N)
 {
-public:
-    static void HIPRT_CB Callback(hipStream_t stream, hipError_t status, void *userData);
+    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x ;

-private:
-    void callbackFunc(hipError_t status);
-};
-
-void HIPRT_CB CallbackClass::Callback(hipStream_t stream, hipError_t status, void *userData)
-{
-    CallbackClass* obj = (CallbackClass*) userData;
-    obj->callbackFunc(status);
+    for (size_t i=offset; i<N; i+=stride) {
+        C_d[i] = A_d[i] * A_d[i];
+    }
 }

-void CallbackClass::callbackFunc(hipError_t status)
+float *A_h, *C_h;
+bool cbDone = false;
+
+static void HIPRT_CB Callback(hipStream_t stream, hipError_t status, void *userData)
 {
-     HIPASSERT(status==hipSuccess);
+    for (size_t i=0; i<N; i++) {
+        if (C_h[i] != A_h[i] * A_h[i]) {
+            warn("Data mismatch %zu", i);
+        }
+    }
+    printf ("PASSED!\n");
+    cbDone = true;
 }

-int main(){
+int main(int argc, char *argv[])
+{
+    float *A_d, *C_d;
+    size_t Nbytes = N * sizeof(float);
+
+    A_h = (float*)malloc(Nbytes);
+    HIPCHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+    C_h = (float*)malloc(Nbytes);
+    HIPCHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
+
+    // Fill with Phi + i
+    for (size_t i=0; i<N; i++) 
+    {
+        A_h[i] = 1.618f + i; 
+    }
+
+    HIPCHECK(hipMalloc(&A_d, Nbytes));
+    HIPCHECK(hipMalloc(&C_d, Nbytes));
+
    hipStream_t mystream;
-    HIPCHECK(hipStreamCreate(&mystream));
-    CallbackClass* obj = new CallbackClass;
-    HIPCHECK(hipStreamAddCallback(mystream, CallbackClass::Callback, obj, 0));
-    HIPCHECK(hipStreamAddCallback(NULL, CallbackClass::Callback, obj, 0));
+    HIPCHECK(hipStreamCreateWithFlags(&mystream, hipStreamNonBlocking));

-	passed();
+    HIPCHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, mystream));
+
+    const unsigned blocks = 512;
+    const unsigned threadsPerBlock = 256;
+    hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0, mystream, C_d, A_d, N);
+
+    HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
+    HIPCHECK(hipStreamAddCallback(mystream, Callback, NULL, 0));
+
+    while(!cbDone)
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
 }