Merge branch 'master' of https://github.com/ROCm-Developer-Tools/HIP into feature_purge_genco

2018-01-17 14:02:19 +00:00
commit 09c704a2d0
@@ -368,6 +368,12 @@ if(POLICY CMP0037)
    cmake_policy(POP)
 endif()

+#############################
+# Code analysis
+#############################
+# Target: static_check
+add_custom_target(static_check COMMAND cppcheck --force --quiet --enable=warning,performance,portability,information,missingInclude src include -I /opt/rocm/include/hcc -I /opt/rocm/include --suppress=*:/opt/rocm/include/hcc/hc.hpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+
 #############################
 # Testing steps
 #############################
@@ -395,5 +401,4 @@ else()
    message(STATUS "Testing targets will not be available. To enable them please ensure that the HIP installation directory is writeable. Use -DCMAKE_INSTALL_PREFIX to specify a suitable location")
 endif()

-
 # vim: ts=4:sw=4:expandtab:smartindent
@@ -367,11 +367,11 @@ if( params.hcc_integration_test )
 // The following launches 3 builds in parallel: hcc-ctu, hcc-1.6 and cuda
 parallel hcc_ctu:
 {
-  node('docker && rocm')
+  node('docker && rocm && dkms')
  {
    String hcc_ver = 'hcc-ctu'
    String from_image = 'compute-artifactory:5001/radeonopencompute/hcc/clang_tot_upgrade/hcc-lc-ubuntu-16.04:latest'
-    String inside_args = '--device=/dev/kfd'
+    String inside_args = '--device=/dev/kfd --device=/dev/dri --group-add=video'

    // Checkout source code, dependencies and version files
    String source_hip_rel = checkout_and_version( hcc_ver )
@@ -408,11 +408,11 @@ parallel hcc_ctu:
 },
 hcc_1_6:
 {
-  node('docker && rocm')
+  node('docker && rocm && !dkms')
  {
    String hcc_ver = 'hcc-1.6'
-    String from_image = 'compute-artifactory:5001/radeonopencompute/hcc/roc-1.6.x/hcc-lc-ubuntu-16.04:latest'
-    String inside_args = '--device=/dev/kfd'
+    String from_image = 'rocm/dev-ubuntu-16.04:latest'
+    String inside_args = '--device=/dev/kfd --device=/dev/dri'

    // Checkout source code, dependencies and version files
    String source_hip_rel = checkout_and_version( hcc_ver )
@@ -449,11 +449,7 @@ nvcc:
    // Block of string constants customizing behavior for cuda
    String nvcc_ver = 'nvcc-9.0'
    String from_image = 'nvidia/cuda:9.0-devel'
-
-    // This unfortunately hardcodes the driver version nvidia_driver_384.90 in the volume mount.  Research if a way
-    // exists to get volume driver to customize the volume names to leave out driver version
-    String inside_args = '''--device=/dev/nvidiactl --device=/dev/nvidia0 --device=/dev/nvidia-uvm --device=/dev/nvidia-uvm-tools
-        --volume-driver=nvidia-docker --volume=nvidia_driver_384.90:/usr/local/nvidia:ro''';
+    String inside_args = '--runtime=nvidia';

    // Checkout source code, dependencies and version files
    String source_hip_rel = checkout_and_version( nvcc_ver )
@@ -7,6 +7,7 @@ MAINTAINER Kent Knox <kent.knox@amd>
 ARG user_uid

 # Install Packages
+# python and libnuma1 are dependencies of rocm_agent_enumerator
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    sudo \
    build-essential \
@@ -14,6 +15,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
    git \
    libelf-dev \
    rpm \
+    python \
+    libnuma1 \
    && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
@@ -5,8 +5,8 @@
 | **type**     |   **CUDA**                                                    |   **HIP**                                                  |
 |-------------:|---------------------------------------------------------------|------------------------------------------------------------|
 | struct       | `CUDA_ARRAY3D_DESCRIPTOR`                                     |                                                            |
-| struct       | `CUDA_ARRAY_DESCRIPTOR`                                       |                                                            |
-| struct       | `CUDA_MEMCPY2D`                                               |                                                            |
+| struct       | `CUDA_ARRAY_DESCRIPTOR`                                       | `HIP_ARRAY_DESCRIPTOR`                                     |
+| struct       | `CUDA_MEMCPY2D`                                               | `hip_Memcpy2D`                                             |
 | struct       | `CUDA_MEMCPY3D`                                               |                                                            |
 | struct       | `CUDA_MEMCPY3D_PEER`                                          |                                                            |
 | struct       | `CUDA_POINTER_ATTRIBUTE_P2P_TOKENS`                           |                                                            |
@@ -27,15 +27,15 @@
 |         0x03 |*`CU_CUBEMAP_FACE_NEGATIVE_Y`*                                 |                                                            |
 |         0x04 |*`CU_CUBEMAP_FACE_POSITIVE_Z`*                                 |                                                            |
 |         0x05 |*`CU_CUBEMAP_FACE_NEGATIVE_Z`*                                 |                                                            |
-| enum         |***`CUarray_format`***                                         |                                                            |
-|         0x01 |*`CU_AD_FORMAT_UNSIGNED_INT8`*                                 |                                                            |
-|         0x02 |*`CU_AD_FORMAT_UNSIGNED_INT16`*                                |                                                            |
-|         0x03 |*`CU_AD_FORMAT_UNSIGNED_INT32`*                                |                                                            |
-|         0x08 |*`CU_AD_FORMAT_SIGNED_INT8`*                                   |                                                            |
-|         0x09 |*`CU_AD_FORMAT_SIGNED_INT16`*                                  |                                                            |
-|         0x0a |*`CU_AD_FORMAT_SIGNED_INT32`*                                  |                                                            |
-|         0x10 |*`CU_AD_FORMAT_HALF`*                                          |                                                            |
-|         0x20 |*`CU_AD_FORMAT_FLOAT`*                                         |                                                            |
+| enum         |***`CUarray_format`***                                         |***`hipArray_format`***                                     |
+|         0x01 |*`CU_AD_FORMAT_UNSIGNED_INT8`*                                 |*`HIP_AD_FORMAT_UNSIGNED_INT8`*                             |
+|         0x02 |*`CU_AD_FORMAT_UNSIGNED_INT16`*                                |*`HIP_AD_FORMAT_UNSIGNED_INT16`*                            |
+|         0x03 |*`CU_AD_FORMAT_UNSIGNED_INT32`*                                |*`HIP_AD_FORMAT_UNSIGNED_INT32`*                            |
+|         0x08 |*`CU_AD_FORMAT_SIGNED_INT8`*                                   |*`HIP_AD_FORMAT_SIGNED_INT8`*                               |
+|         0x09 |*`CU_AD_FORMAT_SIGNED_INT16`*                                  |*`HIP_AD_FORMAT_SIGNED_INT16`*                              |
+|         0x0a |*`CU_AD_FORMAT_SIGNED_INT32`*                                  |*`HIP_AD_FORMAT_SIGNED_INT32`*                              |
+|         0x10 |*`CU_AD_FORMAT_HALF`*                                          |*`HIP_AD_FORMAT_HALF`*                                      |
+|         0x20 |*`CU_AD_FORMAT_FLOAT`*                                         |*`HIP_AD_FORMAT_FLOAT`*                                     |
 | enum         |***`CUctx_flags`***                                            |                                                            |
 |         0x00 |*`CU_CTX_SCHED_AUTO`*                                          |                                                            |
 |         0x01 |*`CU_CTX_SCHED_SPIN`*                                          |                                                            |
@@ -518,7 +518,7 @@

 |   **CUDA**                                                |   **HIP**                     |
 |-----------------------------------------------------------|-------------------------------|
-| `cuArray3DCreate`                                         |                               |
+| `cuArray3DCreate`                                         | `hipArray3DCreate`            |
 | `cuArray3DGetDescriptor`                                  |                               |
 | `cuArrayCreate`                                           |                               |
 | `cuArrayDestroy`                                          |                               |
@@ -65,22 +65,40 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHIPIFY_CLANG_RES=\\\"${LLVM_LIBRARY_DI
 install(TARGETS hipify-clang DESTINATION bin)

 if (HIPIFY_CLANG_TESTS)
-    find_package(PythonInterp 2.7 REQUIRED EXACT)
+    find_package(PythonInterp 2.7 REQUIRED)

    function (require_program PROGRAM_NAME)
-        find_program(FOUND_PROGRAM ${PROGRAM_NAME})
-        if (NOT FOUND_PROGRAM)
-            message(FATAL_ERROR "Can't find ${PROGRAM_NAME}. Either set HIPIFY_CLANG_TESTS to OFF to disable hipify tests, or install the missing program.")
+        find_program(FOUND_${PROGRAM_NAME} ${PROGRAM_NAME})
+        if (FOUND_${PROGRAM_NAME})
+            message(STATUS "Found ${PROGRAM_NAME}: ${FOUND_${PROGRAM_NAME}}")
+        else()
+            message(SEND_ERROR "Can't find ${PROGRAM_NAME}. Either set HIPIFY_CLANG_TESTS to OFF to disable hipify tests, or install the missing program.")
        endif()
    endfunction()

    require_program(lit)
    require_program(FileCheck)
-    require_program(socat)

    # Populates CUDA_TOOLKIT_ROOT_DIR, which is then applied to the lit config to give the
    # value of --cuda-path for the test runs.
    find_package(CUDA REQUIRED)
+    if ((CUDA_VERSION VERSION_LESS "7.0") OR (LLVM_PACKAGE_VERSION VERSION_LESS "3.8") OR
+        (CUDA_VERSION VERSION_GREATER "7.5" AND LLVM_PACKAGE_VERSION VERSION_LESS "4.0") OR
+        (CUDA_VERSION VERSION_GREATER "8.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "6.0") OR
+        (CUDA_VERSION VERSION_GREATER "9.0" AND LLVM_PACKAGE_VERSION VERSION_LESS "7.0"))
+        message(SEND_ERROR "CUDA ${CUDA_VERSION} is not supported by clang ${LLVM_PACKAGE_VERSION}.")
+        if (CUDA_VERSION VERSION_LESS "7.0")
+            message(STATUS "Please install CUDA 7.0 or higher.")
+        elseif ((CUDA_VERSION VERSION_EQUAL "7.0") OR (CUDA_VERSION VERSION_EQUAL "7.5"))
+            message(STATUS "Please install clang 3.8 or higher.")
+        elseif (CUDA_VERSION VERSION_EQUAL "8.0")
+            message(STATUS "Please install clang 4.0 or higher.")
+        elseif (CUDA_VERSION VERSION_EQUAL "9.0")
+            message(STATUS "Please install clang 6.0 or higher.")
+        elseif (CUDA_VERSION VERSION_EQUAL "9.1")
+            message(STATUS "Please install clang 7.0 or higher.")
+        endif()
+    endif()

    configure_file(
        ${CMAKE_CURRENT_LIST_DIR}/../tests/hipify-clang/lit.site.cfg.in
@@ -10,8 +10,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_TYPE_NAME_MAP{

    ///////////////////////////// CUDA DRIVER API /////////////////////////////
    {"CUDA_ARRAY3D_DESCRIPTOR",           {"HIP_ARRAY3D_DESCRIPTOR",           CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
-    {"CUDA_ARRAY_DESCRIPTOR",             {"HIP_ARRAY_DESCRIPTOR",             CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
-    {"CUDA_MEMCPY2D",                     {"HIP_MEMCPY2D",                     CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
+    {"CUDA_ARRAY_DESCRIPTOR",             {"HIP_ARRAY_DESCRIPTOR",             CONV_TYPE, API_DRIVER}},
+    {"CUDA_MEMCPY2D",                     {"hip_Memcpy2D",                     CONV_TYPE, API_DRIVER}},
    {"CUDA_MEMCPY3D",                     {"HIP_MEMCPY3D",                     CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
    {"CUDA_MEMCPY3D_PEER",                {"HIP_MEMCPY3D_PEER",                CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
    {"CUDA_POINTER_ATTRIBUTE_P2P_TOKENS", {"HIP_POINTER_ATTRIBUTE_P2P_TOKENS", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
@@ -23,7 +23,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_TYPE_NAME_MAP{

    {"CUaddress_mode",        {"hipAddress_mode",       CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
    {"CUarray_cubemap_face",  {"hipArray_cubemap_face", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
-    {"CUarray_format",        {"hipArray_format",       CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},
+    {"CUarray_format",        {"hipArray_format",       CONV_TYPE, API_DRIVER}},
    {"CUcomputemode",         {"hipComputemode",        CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},    // API_RUNTIME ANALOGUE (cudaComputeMode)
    {"CUmem_advise",          {"hipMemAdvise",          CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},    // API_RUNTIME ANALOGUE (cudaComputeMode)
    {"CUmem_range_attribute", {"hipMemRangeAttribute",  CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},    // API_RUNTIME ANALOGUE (cudaMemRangeAttribute)
@@ -213,6 +213,13 @@ const std::map<llvm::StringRef, hipCounter> CUDA_TYPE_NAME_MAP{
    {"cudaMipmappedArray_t",       {"hipMipmappedArray_t",       CONV_MEM, API_RUNTIME}},
    {"cudaMipmappedArray_const_t", {"hipMipmappedArray_const_t", CONV_MEM, API_RUNTIME}},

+    // defines
+    {"cudaArrayDefault",           {"hipArrayDefault",          CONV_MEM, API_RUNTIME}},
+    {"cudaArrayLayered",           {"hipArrayLayered",          CONV_MEM, API_RUNTIME}},
+    {"cudaArraySurfaceLoadStore",  {"hipArraySurfaceLoadStore", CONV_MEM, API_RUNTIME}},
+    {"cudaArrayCubemap",           {"hipArrayCubemap",          CONV_MEM, API_RUNTIME}},
+    {"cudaArrayTextureGather",     {"hipArrayTextureGather",    CONV_MEM, API_RUNTIME}},
+
    {"cudaMemoryAdvise", {"hipMemAdvise", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},    // API_Driver ANALOGUE (CUmem_advise)
    {"cudaMemRangeAttribute", {"hipMemRangeAttribute", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}},    // API_Driver ANALOGUE (CUmem_range_attribute)
    {"cudaMemcpyKind", {"hipMemcpyKind", CONV_MEM, API_RUNTIME}},
@@ -263,6 +270,15 @@ const std::map<llvm::StringRef, hipCounter> CUDA_TYPE_NAME_MAP{

    {"cudaSurfaceFormatMode", {"hipSurfaceFormatMode", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED}},

+    // defines
+    {"cudaTextureType1D",             {"hipTextureType1D",             CONV_TEX, API_RUNTIME}},
+    {"cudaTextureType2D",             {"hipTextureType2D",             CONV_TEX, API_RUNTIME}},
+    {"cudaTextureType3D",             {"hipTextureType3D",             CONV_TEX, API_RUNTIME}},
+    {"cudaTextureTypeCubemap",        {"hipTextureTypeCubemap",        CONV_TEX, API_RUNTIME}},
+    {"cudaTextureType1DLayered",      {"hipTextureType1DLayered",      CONV_TEX, API_RUNTIME}},
+    {"cudaTextureType2DLayered",      {"hipTextureType2DLayered",      CONV_TEX, API_RUNTIME}},
+    {"cudaTextureTypeCubemapLayered", {"hipTextureTypeCubemapLayered", CONV_TEX, API_RUNTIME}},
+
    // Inter-Process Communication (IPC)
    {"cudaIpcEventHandle_t",  {"hipIpcEventHandle_t", CONV_TYPE, API_RUNTIME}},
    {"cudaIpcEventHandle_st", {"hipIpcEventHandle_t", CONV_TYPE, API_RUNTIME}},
@@ -548,14 +564,14 @@ const std::map<llvm::StringRef, hipCounter> CUDA_IDENTIFIER_MAP{
    {"CU_CUBEMAP_FACE_NEGATIVE_Z",        {"HIP_CUBEMAP_FACE_NEGATIVE_Z",      CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},    // 0x05

    // CUarray_format enum
-    {"CU_AD_FORMAT_UNSIGNED_INT8",        {"HIP_AD_FORMAT_UNSIGNED_INT8",      CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},    // 0x01
-    {"CU_AD_FORMAT_UNSIGNED_INT16",       {"HIP_AD_FORMAT_UNSIGNED_INT16",     CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},    // 0x02
-    {"CU_AD_FORMAT_UNSIGNED_INT32",       {"HIP_AD_FORMAT_UNSIGNED_INT32",     CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},    // 0x03
-    {"CU_AD_FORMAT_SIGNED_INT8",          {"HIP_AD_FORMAT_SIGNED_INT8",        CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},    // 0x08
-    {"CU_AD_FORMAT_SIGNED_INT16",         {"HIP_AD_FORMAT_SIGNED_INT16",       CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},    // 0x09
-    {"CU_AD_FORMAT_SIGNED_INT32",         {"HIP_AD_FORMAT_SIGNED_INT32",       CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},    // 0x0a
-    {"CU_AD_FORMAT_HALF",                 {"HIP_AD_FORMAT_HALF",               CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},    // 0x10
-    {"CU_AD_FORMAT_FLOAT",                {"HIP_AD_FORMAT_FLOAT",              CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},    // 0x20
+    {"CU_AD_FORMAT_UNSIGNED_INT8",        {"HIP_AD_FORMAT_UNSIGNED_INT8",      CONV_TYPE, API_DRIVER}},    // 0x01
+    {"CU_AD_FORMAT_UNSIGNED_INT16",       {"HIP_AD_FORMAT_UNSIGNED_INT16",     CONV_TYPE, API_DRIVER}},    // 0x02
+    {"CU_AD_FORMAT_UNSIGNED_INT32",       {"HIP_AD_FORMAT_UNSIGNED_INT32",     CONV_TYPE, API_DRIVER}},    // 0x03
+    {"CU_AD_FORMAT_SIGNED_INT8",          {"HIP_AD_FORMAT_SIGNED_INT8",        CONV_TYPE, API_DRIVER}},    // 0x08
+    {"CU_AD_FORMAT_SIGNED_INT16",         {"HIP_AD_FORMAT_SIGNED_INT16",       CONV_TYPE, API_DRIVER}},    // 0x09
+    {"CU_AD_FORMAT_SIGNED_INT32",         {"HIP_AD_FORMAT_SIGNED_INT32",       CONV_TYPE, API_DRIVER}},    // 0x0a
+    {"CU_AD_FORMAT_HALF",                 {"HIP_AD_FORMAT_HALF",               CONV_TYPE, API_DRIVER}},    // 0x10
+    {"CU_AD_FORMAT_FLOAT",                {"HIP_AD_FORMAT_FLOAT",              CONV_TYPE, API_DRIVER}},    // 0x20

    // CUcomputemode enum
    {"CU_COMPUTEMODE_DEFAULT",            {"hipComputeModeDefault",            CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}},    // 0 // API_RUNTIME ANALOGUE (cudaComputeModeDefault = 0)
@@ -1047,7 +1063,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_IDENTIFIER_MAP{
    {"cuStreamBatchMemOp",         {"hipStreamBatchMemOp",         CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}},    // // no API_Runtime ANALOGUE

    // Memory management
-    {"cuArray3DCreate",           {"hipArray3DCreate",           CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}},
+    {"cuArray3DCreate",           {"hipArray3DCreate",           CONV_MEM, API_DRIVER}},
    {"cuArray3DGetDescriptor",    {"hipArray3DGetDescriptor",    CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}},
    {"cuArrayCreate",             {"hipArrayCreate",             CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}},
    {"cuArrayDestroy",            {"hipArrayDestroy",            CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}},
@@ -152,16 +152,19 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,

    const auto found = CUDA_INCLUDE_MAP.find(file_name);
    if (found == CUDA_INCLUDE_MAP.end()) {
-        // Not a CUDA include - don't touch it.
+        if (!firstNotMainHeader) {
+            firstNotMainHeader = true;
+            firstNotMainHeaderLoc = hash_loc;
+        }
        return;
    }

    // Special-casing to avoid duplication of the hip_runtime include.
+    bool secondMainInclude = false;
    if (found->second.hipName == "hip/hip_runtime.h") {
        if (insertedRuntimeHeader) {
-            return;
+            secondMainInclude = true;
        }
-
        insertedRuntimeHeader = true;
    }

@@ -169,28 +172,42 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,

    clang::SourceLocation sl = filename_range.getBegin();
    if (found->second.unsupported) {
-        // An unsupported CUDA header? Oh dear. Print a warning.
        clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
        DE.Report(sl, DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Unsupported CUDA header"));
        return;
    }

-    const char *B = SM.getCharacterData(sl);
-    const char *E = SM.getCharacterData(filename_range.getEnd());
-    clang::SmallString<128> includeBuffer;
    clang::StringRef newInclude;

    // Keep the same include type that the user gave.
-    if (is_angled) {
-        newInclude = llvm::Twine("<" + found->second.hipName + ">").toStringRef(includeBuffer);
+    if (!secondMainInclude) {
+        clang::SmallString<128> includeBuffer;
+        if (is_angled) {
+            newInclude = llvm::Twine("<" + found->second.hipName + ">").toStringRef(includeBuffer);
+        } else {
+            newInclude = llvm::Twine("\"" + found->second.hipName + "\"").toStringRef(includeBuffer);
+        }
    } else {
-        newInclude = llvm::Twine("\"" + found->second.hipName + "\"").toStringRef(includeBuffer);
+        // hashLoc is location of the '#', thus replacing the whole include directive by empty newInclude starting with '#'.
+        sl = hash_loc;
    }
-
+    const char *B = SM.getCharacterData(sl);
+    const char *E = SM.getCharacterData(filename_range.getEnd());
    ct::Replacement Rep(SM, sl, E - B, newInclude);
    insertReplacement(Rep, clang::FullSourceLoc{sl, SM});
 }

+void HipifyAction::PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) {
+    if (pragmaOnce) { return; }
+    clang::SourceManager& SM = getCompilerInstance().getSourceManager();
+    clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
+    const clang::Token tok = PP.LookAhead(0);
+    StringRef Text(SM.getCharacterData(tok.getLocation()), tok.getLength());
+    if (Text == "once") {
+        pragmaOnce = true;
+        pragmaOnceLoc = PP.LookAhead(1).getLocation();
+    }
+}

 bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
    StringRef refName = "cudaLaunchKernel";
@@ -336,10 +353,16 @@ void HipifyAction::EndSourceFileAction() {
        // implicitly included by the compiler. Instead, we _delete_ CUDA headers, and unconditionally insert
        // one copy of the hip include into every file.
        clang::SourceManager& SM = getCompilerInstance().getSourceManager();
-
-        clang::SourceLocation sl = SM.getLocForStartOfFile(SM.getMainFileID());
+        clang::SourceLocation sl;
+        if (pragmaOnce) {
+            sl = pragmaOnceLoc;
+        } else if (firstNotMainHeader) {
+            sl = firstNotMainHeaderLoc;
+        } else {
+            sl = SM.getLocForStartOfFile(SM.getMainFileID());
+        }
        clang::FullSourceLoc fullSL(sl, SM);
-        ct::Replacement Rep(SM, sl, 0, "#include <hip/hip_runtime.h>\n");
+        ct::Replacement Rep(SM, sl, 0, "\n#include <hip/hip_runtime.h>\n");
        insertReplacement(Rep, fullSL);
    }

@@ -364,6 +387,10 @@ public:
                            const clang::Module* imported) override {
        hipifyAction.InclusionDirective(hash_loc, include_token, file_name, is_angled, filename_range, file, search_path, relative_path, imported);
    }
+
+    void PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) override {
+        hipifyAction.PragmaDirective(Loc, Introducer);
+    }
 };

 }
@@ -23,6 +23,10 @@ private:
    // not, we insert it at the top of the file when we finish processing it.
    // This approach means we do the best it's possible to do w.r.t preserving the user's include order.
    bool insertedRuntimeHeader = false;
+    bool firstNotMainHeader = false;
+    bool pragmaOnce = false;
+    clang::SourceLocation firstNotMainHeaderLoc;
+    clang::SourceLocation pragmaOnceLoc;

    /**
     * Rewrite a string literal to refer to hip, not CUDA.
@@ -57,6 +61,11 @@ public:
                            StringRef relative_path,
                            const clang::Module *imported);

+    /**
+    * Called by the preprocessor for each pragma directive during the non-raw lexing pass.
+    */
+    void PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer);
+
 protected:
    /**
     * Add a Replacement for the current file. These will all be applied after executing the FrontendAction.
@@ -25,7 +25,7 @@ ct::Replacements& getReplacements(ct::RefactoringTool& Tool, clang::StringRef fi
 void insertReplacement(ct::Replacements& replacements, const ct::Replacement& rep) {
 #if LLVM_VERSION_MAJOR > 3
    // New clang added error checking to Replacements, and *insists* that you explicitly check it.
-    llvm::Error e = replacements.add(rep);
+    llvm::consumeError(replacements.add(rep));
 #else
    // In older versions, it's literally an std::set<Replacement>
    replacements.insert(rep);
@@ -62,6 +62,8 @@ struct HIP_ARRAY_DESCRIPTOR {
 	unsigned int numChannels;
 	size_t width;
 	size_t height;
+    unsigned int flags;
+    size_t depth;
 };

 struct hipArray {
@@ -73,6 +75,7 @@ struct hipArray {
    unsigned int depth;
    struct HIP_ARRAY_DESCRIPTOR drvDesc;
    bool isDrv;
+    unsigned int textureType;
 };

 typedef struct hip_Memcpy2D {
@@ -251,6 +254,30 @@ struct hipMemcpy3DParms {

    struct hipExtent      extent;
    enum hipMemcpyKind    kind;
+    
+    size_t                Depth;
+    size_t                Height;
+    size_t                WidthInBytes;
+    hipDeviceptr_t        dstDevice;
+    size_t                dstHeight;
+    void *                dstHost;
+    size_t                dstLOD;
+    hipMemoryType         dstMemoryType;
+    size_t                dstPitch;
+    size_t                dstXInBytes;
+    size_t                dstY;
+    size_t                dstZ;
+    void *                reserved0;
+    void *                reserved1;
+    hipDeviceptr_t        srcDevice;
+    size_t                srcHeight;
+    const void *          srcHost;
+    size_t                srcLOD;
+    hipMemoryType          srcMemoryType;
+    size_t                srcPitch;
+    size_t                srcXInBytes;
+    size_t                srcY;
+    size_t                srcZ;
 };

 static __inline__ struct hipPitchedPtr make_hipPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
@@ -1317,6 +1317,11 @@ hipError_t hipMallocArray(hipArray** array, const struct hipChannelFormatDesc* d
                          size_t width, size_t height, unsigned int flags);
 #endif
 hipError_t hipArrayCreate ( hipArray** pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray );
+
+hipError_t hipArray3DCreate(hipArray_t *array, const HIP_ARRAY_DESCRIPTOR* pAllocateArray );
+
+hipError_t hipMalloc3D (hipPitchedPtr* pitchedDevPtr, hipExtent extent );
+
 /**
 *  @brief Frees an array on the device.
 *
@@ -448,7 +448,7 @@ __device__  __half2 __lowhigh2highlow(const __half2 a) {

 __device__  __half2 __lows2half2(const __half2 a, const __half2 b) {
  __half2 c;
-  c.y = a.x;
+  c.x = a.x;
  c.y = b.x;
  return c;
 }
@@ -344,24 +344,16 @@ hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags)
    return hipHostMalloc(ptr, sizeBytes, flags);
 };

-
 // width in bytes
-hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height)
+hipError_t ihipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height, size_t depth)
 {
-    HIP_INIT_SPECIAL_API((TRACE_MEM), ptr, pitch, width, height);
-    HIP_SET_DEVICE();
    hipError_t  hip_status = hipSuccess;
-
-    if(width == 0 || height == 0)
-        return ihipLogStatus(hipErrorUnknown);
-
    // hardcoded 128 bytes
    *pitch = ((((int)width-1)/128) + 1)*128;
    const size_t sizeBytes = (*pitch)*height;

    auto ctx = ihipGetTlsDefaultCtx();

-    //err = hipMalloc(ptr, (*pitch)*height);
    if (ctx) {
        hc::accelerator acc = ctx->getDevice()->_acc;
        hsa_agent_t* agent =static_cast<hsa_agent_t*>(acc.get_hsa_agent());
@@ -373,9 +365,12 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height
        hsa_ext_image_descriptor_t imageDescriptor;
        imageDescriptor.width = *pitch;
        imageDescriptor.height = height;
-        imageDescriptor.depth = 0;
+        imageDescriptor.depth = 0;//depth;
        imageDescriptor.array_size = 0;
-        imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D;
+        if(depth == 0)
+            imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D;
+        else
+            imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_3D;
        imageDescriptor.format.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_R;
        imageDescriptor.format.channel_type = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32;

@@ -394,6 +389,42 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height
        hip_status = hipErrorMemoryAllocation;
    }

+    return hip_status;
+}
+
+// width in bytes
+hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height)
+{
+    HIP_INIT_SPECIAL_API((TRACE_MEM), ptr, pitch, width, height);
+    HIP_SET_DEVICE();
+    hipError_t  hip_status = hipSuccess;
+
+    if(width == 0 || height == 0)
+        return ihipLogStatus(hipErrorUnknown);
+
+    hip_status = ihipMallocPitch(ptr, pitch, width, height, 0);
+    return ihipLogStatus(hip_status);
+}
+
+hipError_t hipMalloc3D (hipPitchedPtr* pitchedDevPtr, hipExtent extent )
+{
+    HIP_INIT_API(pitchedDevPtr, &extent);
+    HIP_SET_DEVICE();
+    hipError_t  hip_status = hipSuccess;
+
+    if(extent.width == 0 || extent.height == 0)
+       return ihipLogStatus(hipErrorUnknown);
+    if(!pitchedDevPtr)
+       return ihipLogStatus(hipErrorInvalidValue);
+    void* ptr;
+    size_t pitch;
+
+    hip_status = ihipMallocPitch(&pitchedDevPtr->ptr, &pitch, extent.width, extent.height, extent.depth);
+    if(hip_status == hipSuccess) {
+        pitchedDevPtr->pitch = pitch;
+        pitchedDevPtr->xsize = extent.width;
+        pitchedDevPtr->ysize = extent.height;
+    }
    return ihipLogStatus(hip_status);
 }

@@ -531,7 +562,7 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
        array[0]->depth = 1;
        array[0]->desc = *desc;
        array[0]->isDrv = false;
-
+        array[0]->textureType = hipTextureType2D;
        void ** ptr = &array[0]->data;

        if (ctx) {
@@ -610,12 +641,132 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
    return ihipLogStatus(hip_status);
 }

+hipError_t hipArray3DCreate(hipArray_t *array, const HIP_ARRAY_DESCRIPTOR* pAllocateArray )
+{
+    HIP_INIT_SPECIAL_API((TRACE_MEM), array, pAllocateArray);
+    hipError_t  hip_status = hipSuccess;
+
+    auto ctx = ihipGetTlsDefaultCtx();
+
+    *array = (hipArray*)malloc(sizeof(hipArray));
+    array[0]->type  = pAllocateArray->flags;
+    array[0]->width = pAllocateArray->width;
+    array[0]->height = pAllocateArray->height;
+    array[0]->depth = pAllocateArray->depth;
+    array[0]->drvDesc = *pAllocateArray;
+    array[0]->isDrv = true;
+    array[0]->textureType = hipTextureType3D;
+    void ** ptr = &array[0]->data;
+
+    if (ctx) {
+        const unsigned am_flags = 0;
+        const size_t size = pAllocateArray->width*pAllocateArray->height*pAllocateArray->depth;
+
+        size_t allocSize = 0;
+        hsa_ext_image_channel_type_t channelType;
+        switch(pAllocateArray->format) {
+            case HIP_AD_FORMAT_UNSIGNED_INT8:
+                allocSize = size * sizeof(uint8_t);
+                channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
+                break;
+            case HIP_AD_FORMAT_UNSIGNED_INT16:
+                allocSize = size * sizeof(uint16_t);
+                channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16;
+                break;
+            case HIP_AD_FORMAT_UNSIGNED_INT32:
+                allocSize = size * sizeof(uint32_t);
+                channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32;
+                break;
+            case HIP_AD_FORMAT_SIGNED_INT8:
+                allocSize = size * sizeof(int8_t);
+                channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8;
+                break;
+            case HIP_AD_FORMAT_SIGNED_INT16:
+                allocSize = size * sizeof(int16_t);
+                channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16;
+                break;
+            case HIP_AD_FORMAT_SIGNED_INT32:
+                allocSize = size * sizeof(int32_t);
+                channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32;
+                break;
+            case HIP_AD_FORMAT_HALF:
+                allocSize = size * sizeof(int16_t);
+                channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT;
+                break;
+            case HIP_AD_FORMAT_FLOAT:
+                allocSize = size * sizeof(float);
+                channelType = HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT;
+                break;
+            default:
+                hip_status = hipErrorUnknown;
+                break;
+        }
+
+        hc::accelerator acc = ctx->getDevice()->_acc;
+        hsa_agent_t* agent =static_cast<hsa_agent_t*>(acc.get_hsa_agent());
+
+        size_t allocGranularity = 0;
+        hsa_amd_memory_pool_t *allocRegion = static_cast<hsa_amd_memory_pool_t*>(acc.get_hsa_am_region());
+        hsa_amd_memory_pool_get_info(*allocRegion, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &allocGranularity);
+
+        hsa_ext_image_descriptor_t imageDescriptor;
+        imageDescriptor.width = pAllocateArray->width;
+        imageDescriptor.height = pAllocateArray->height;
+        imageDescriptor.depth = 0;
+        imageDescriptor.array_size = 0;
+        switch (pAllocateArray->flags) {
+        case hipArrayLayered:
+            imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2DA;
+            imageDescriptor.array_size = pAllocateArray->depth;
+            break;
+        case hipArraySurfaceLoadStore:
+        case hipArrayTextureGather:
+        case hipArrayDefault:
+            assert(0);
+            break;
+        case hipArrayCubemap:
+        default:
+            imageDescriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_3D;
+            imageDescriptor.depth = pAllocateArray->depth;
+            break;
+        }
+        hsa_ext_image_channel_order_t channelOrder;
+
+        //getChannelOrderAndType(*desc, hipReadModeElementType, &channelOrder, &channelType);
+        if (pAllocateArray->numChannels == 4) {
+            channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
+        } else if (pAllocateArray->numChannels == 2) {
+            channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_RG;
+        } else if (pAllocateArray->numChannels == 1) {
+            channelOrder = HSA_EXT_IMAGE_CHANNEL_ORDER_R;
+        }
+        imageDescriptor.format.channel_order = channelOrder;
+        imageDescriptor.format.channel_type = channelType;
+
+        hsa_access_permission_t permission = HSA_ACCESS_PERMISSION_RW;
+        hsa_ext_image_data_info_t imageInfo;
+        hsa_status_t status = hsa_ext_image_data_get_info(*agent, &imageDescriptor, permission, &imageInfo);
+        size_t alignment = imageInfo.alignment <= allocGranularity ? 0 : imageInfo.alignment;
+
+        *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false, am_flags, 0, alignment);
+
+        if (size && (*ptr == NULL)) {
+            hip_status = hipErrorMemoryAllocation;
+        }
+
+    } else {
+        hip_status = hipErrorMemoryAllocation;
+    }
+
+    return ihipLogStatus(hip_status);
+}
+
 hipError_t hipMalloc3DArray(hipArray_t *array,
                            const struct hipChannelFormatDesc* desc,
                            struct hipExtent extent,
                            unsigned int flags)
 {
-    HIP_INIT();
+    HIP_INIT_API(array, desc, &extent, flags);
    HIP_SET_DEVICE();
    hipError_t  hip_status = hipSuccess;

@@ -627,7 +778,8 @@ hipError_t hipMalloc3DArray(hipArray_t *array,
    array[0]->height = extent.height;
    array[0]->depth = extent.depth;
    array[0]->desc = *desc;
-
+    array[0]->isDrv = false;
+    array[0]->textureType = hipTextureType3D; 
    void ** ptr = &array[0]->data;

    if (ctx) {
@@ -702,7 +854,7 @@ hipError_t hipMalloc3DArray(hipArray_t *array,
        hip_status = hipErrorMemoryAllocation;
    }

-    return hip_status;
+    return ihipLogStatus(hip_status);
 }

 hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr)
@@ -1262,53 +1414,85 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset,
 hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p)
 {
    HIP_INIT_SPECIAL_API((TRACE_MCMD), p);
-
-    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
-
-    hc::completion_future marker;
-
    hipError_t e = hipSuccess;
-
-    size_t byteSize;
    if(p) {
-        switch(p->dstArray->desc.f) {
-            case hipChannelFormatKindSigned:
-                byteSize = sizeof(int);
-                break;
-            case hipChannelFormatKindUnsigned:
-                byteSize = sizeof(unsigned int);
-                break;
-            case hipChannelFormatKindFloat:
-                byteSize = sizeof(float);
-                break;
-            case hipChannelFormatKindNone:
-                byteSize = sizeof(size_t);
-                break;
-            default:
-                byteSize = 0;
-                break;
+        size_t byteSize;
+        size_t depth;
+        size_t height;
+        size_t widthInBytes;
+        size_t dstWidthInbytes;
+        size_t srcPitch;
+        size_t dstPitch;
+        void *srcPtr;
+        void *dstPtr;
+        size_t ySize;
+        if(p->dstArray != nullptr) {
+            if(p->dstArray->isDrv == false) {
+                    switch(p->dstArray->desc.f) {
+                        case hipChannelFormatKindSigned:
+                            byteSize = sizeof(int);
+                            break;
+                        case hipChannelFormatKindUnsigned:
+                            byteSize = sizeof(unsigned int);
+                            break;
+                        case hipChannelFormatKindFloat:
+                            byteSize = sizeof(float);
+                            break;
+                        case hipChannelFormatKindNone:
+                            byteSize = sizeof(size_t);
+                            break;
+                        default:
+                            byteSize = 0;
+                            break;
+                    }
+                    depth = p->extent.depth;
+                    height = p->extent.height;
+                    widthInBytes = p->extent.width * byteSize;
+                    srcPitch = p->srcPtr.pitch;
+                    srcPtr = p->srcPtr.ptr;
+                    ySize = p->srcPtr.ysize;
+                    dstWidthInbytes = p->dstArray->width*byteSize;
+                    dstPtr = p->dstArray->data;
+                } else {
+                    depth = p->Depth;
+                    height = p->Height;
+                    widthInBytes = p->WidthInBytes;
+                    dstWidthInbytes = p->dstArray->width*4;
+                    srcPitch = p->srcPitch;
+                    srcPtr = (void*)p->srcHost;
+                    ySize = p->srcHeight;
+                    dstPtr = p->dstArray->data;
+                }
+        } else {
+            //Non array destination
+            depth = p->extent.depth;
+            height = p->extent.height;
+            widthInBytes = p->extent.width;
+            srcPitch = p->srcPtr.pitch;
+            srcPtr = p->srcPtr.ptr;
+            dstPtr = p->dstPtr.ptr;
+            ySize = p->srcPtr.ysize;
+            dstWidthInbytes = p->dstPtr.pitch;
        }
+        hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+        hc::completion_future marker;
+        try {
+            for (int i = 0; i < depth; i++) {
+                for(int j = 0; j < height; j++) {
+                    // TODO: p->srcPos or p->dstPos are not 0.
+                    unsigned char* src = (unsigned char*)srcPtr + i*ySize*srcPitch + j*srcPitch;
+                    unsigned char* dst = (unsigned char*)dstPtr + i*height*dstWidthInbytes + j*dstWidthInbytes;
+                    stream->locked_copySync(dst, src, widthInBytes, p->kind);
+                 }
+             }
+         } catch (ihipException ex) {
+                e = ex._code;
+         }
    } else {
-        return ihipLogStatus(hipErrorUnknown);
+        e = hipErrorInvalidValue;
    }
-
-    try {
-        for (int i = 0; i < p->extent.depth; i++) {
-            for(int j = 0; j < p->extent.height; j++) {
-                // TODO: p->srcPos or p->dstPos are not 0.
-                unsigned char* src = (unsigned char*)p->srcPtr.ptr + i*p->srcPtr.ysize*p->srcPtr.pitch + j*p->srcPtr.pitch;
-                unsigned char* dst = (unsigned char*)p->dstArray->data + i*p->dstArray->height*p->dstArray->width*byteSize + j*p->dstArray->width*byteSize;
-                stream->locked_copySync(dst, src, p->extent.width*byteSize, p->kind);
-			}
-        }
-    }
-    catch (ihipException &ex) {
-        e = ex._code;
-    }
-
    return ihipLogStatus(e);
 }
-
 namespace
 {
    template<
@@ -623,7 +623,7 @@ hipError_t hipBindTextureToArray(textureReference* tex,
 	HIP_INIT_API(tex, array, desc);
 	hipError_t  hip_status = hipSuccess;
    // TODO: hipReadModeElementType is default.
-    hip_status = ihipBindTextureToArrayImpl(hipTextureType2D, hipReadModeElementType,
+    hip_status = ihipBindTextureToArrayImpl(array->textureType, hipReadModeElementType,
                                      array, *desc, tex);
    return ihipLogStatus(hip_status);
 }
@@ -742,7 +742,7 @@ hipError_t hipTexRefSetArray ( textureReference* tex,  hipArray_const_t array, u
 	HIP_INIT_API(tex, array, flags);
 	hipError_t  hip_status = hipSuccess;

-    hip_status =  ihipBindTextureToArrayImpl(hipTextureType2D, hipReadModeElementType,
+    hip_status =  ihipBindTextureToArrayImpl(array->textureType, hipReadModeElementType,
 	                                      array, array->desc,tex );
    return ihipLogStatus(hip_status);
 }
@@ -89,9 +89,8 @@ namespace hip_impl
        stream->lockclose_postKernelCommand(kernel_name, acc_v);

        delete static_cast<L*>(locked_stream);
-        locked_stream = nullptr;
        if(HIP_PROFILE_API) {
            MARKER_END();
        }
    }
-}
+}
@@ -16,7 +16,6 @@

 template<typename T>
 __global__ void axpy(T a, T *x, T *y) {
-  // CHECK: y[hipThreadIdx_x] = a * x[hipThreadIdx_x];
  y[threadIdx.x] = a * x[threadIdx.x];
 }

@@ -1,242 +0,0 @@
-// RUN: %run_test hipify "%s" "%t" %cuda_args
-/*
- * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
-
-//
-// This sample demonstrates the use of streams for concurrent execution. It also illustrates how to
-// introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function introduced
-// in CUDA 3.2.
-//
-// Devices of compute capability 1.x will run the kernels one after another
-// Devices of compute capability 2.0 or higher can overlap the kernels
-//
-#include <stdio.h>
-#include <helper_functions.h>
-#include <helper_cuda.h>
-
-// This is a kernel that does no real work but runs at least for a specified number of clocks
-__global__ void clock_block(clock_t *d_o, clock_t clock_count)
-{
-    unsigned int start_clock = (unsigned int) clock();
-
-    clock_t clock_offset = 0;
-
-    while (clock_offset < clock_count)
-    {
-        unsigned int end_clock = (unsigned int) clock();
-
-        // The code below should work like
-        // this (thanks to modular arithmetics):
-        //
-        // clock_offset = (clock_t) (end_clock > start_clock ?
-        //                           end_clock - start_clock :
-        //                           end_clock + (0xffffffffu - start_clock));
-        //
-        // Indeed, let m = 2^32 then
-        // end - start = end + m - start (mod m).
-
-        clock_offset = (clock_t)(end_clock - start_clock);
-    }
-
-    d_o[0] = clock_offset;
-}
-
-
-// Single warp reduction kernel
-__global__ void sum(clock_t *d_clocks, int N)
-{
-    __shared__ clock_t s_clocks[32];
-
-    clock_t my_sum = 0;
-
-    for (int i = threadIdx.x; i < N; i+= blockDim.x)
-    {
-        my_sum += d_clocks[i];
-    }
-
-    s_clocks[threadIdx.x] = my_sum;
-    syncthreads();
-
-    for (int i=16; i>0; i/=2)
-    {
-        if (threadIdx.x < i)
-        {
-            s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
-        }
-
-        syncthreads();
-    }
-
-    d_clocks[0] = s_clocks[0];
-}
-
-int main(int argc, char **argv)
-{
-    int nkernels = 8;               // number of concurrent kernels
-    int nstreams = nkernels + 1;    // use one more stream than concurrent kernel
-    int nbytes = nkernels * sizeof(clock_t);   // number of data bytes
-    float kernel_time = 10; // time the kernel should run in ms
-    float elapsed_time;   // timing variables
-    int cuda_device = 0;
-
-    printf("[%s] - Starting...\n", argv[0]);
-
-    // get number of kernels if overridden on the command line
-    if (checkCmdLineFlag(argc, (const char **)argv, "nkernels"))
-    {
-        nkernels = getCmdLineArgumentInt(argc, (const char **)argv, "nkernels");
-        nstreams = nkernels + 1;
-    }
-
-    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
-    cuda_device = findCudaDevice(argc, (const char **)argv);
-
-    // CHECK: hipDeviceProp_t deviceProp;
-    cudaDeviceProp deviceProp;
-    // CHECK: checkCudaErrors(hipGetDevice(&cuda_device));
-    checkCudaErrors(cudaGetDevice(&cuda_device));
-
-    // CHECK: checkCudaErrors(hipGetDeviceProperties(&deviceProp, cuda_device));
-    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
-
-    if ((deviceProp.concurrentKernels == 0))
-    {
-        printf("> GPU does not support concurrent kernel execution\n");
-        printf("  CUDA kernel runs will be serialized\n");
-    }
-
-    printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
-           deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
-
-    // allocate host memory
-    clock_t *a = 0;                     // pointer to the array data in host memory
-    // CHECK: checkCudaErrors(hipHostMalloc((void **)&a, nbytes));
-    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
-
-    // allocate device memory
-    clock_t *d_a = 0;             // pointers to data and init value in the device memory
-    // CHECK: checkCudaErrors(hipMalloc((void **)&d_a, nbytes));
-    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
-
-    // CHECK: hipStream_t *streams = (hipStream_t *) malloc(nstreams * sizeof(hipStream_t));
-    // allocate and initialize an array of stream handles
-    cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));
-
-    for (int i = 0; i < nstreams; i++)
-    {
-        // CHECK: checkCudaErrors(hipStreamCreate(&(streams[i])));
-        checkCudaErrors(cudaStreamCreate(&(streams[i])));
-    }
-
-    // CHECK: hipEvent_t start_event, stop_event;
-    // create CUDA event handles
-    cudaEvent_t start_event, stop_event;
-
-    // CHECK: checkCudaErrors(hipEventCreate(&start_event));
-    // CHECK: checkCudaErrors(hipEventCreate(&stop_event));
-    checkCudaErrors(cudaEventCreate(&start_event));
-    checkCudaErrors(cudaEventCreate(&stop_event));
-
-    // the events are used for synchronization only and hence do not need to record timings
-    // this also makes events not introduce global sync points when recorded which is critical to get overlap
-
-    // CHECK: hipEvent_t *kernelEvent;
-    // CHECK: kernelEvent = (hipEvent_t *) malloc(nkernels * sizeof(hipEvent_t));
-    cudaEvent_t *kernelEvent;
-    kernelEvent = (cudaEvent_t *) malloc(nkernels * sizeof(cudaEvent_t));
-
-    for (int i = 0; i < nkernels; i++)
-    {
-        // CHECK: checkCudaErrors(hipEventCreateWithFlags(&(kernelEvent[i]), hipEventDisableTiming));
-        checkCudaErrors(cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
-    }
-
-    //////////////////////////////////////////////////////////////////////
-    // time execution with nkernels streams
-    clock_t total_clocks = 0;
-#if defined(__arm__) || defined(__aarch64__)
-    // the kernel takes more time than the channel reset time on arm archs, so to prevent hangs reduce time_clocks.
-    clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 1000));
-#else
-    clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
-#endif
-
-    // CHECK: hipEventRecord(start_event, 0);
-    cudaEventRecord(start_event, 0);
-
-    // queue nkernels in separate streams and record when they are done
-    for (int i=0; i<nkernels; ++i)
-    {
-        // CHECK: hipLaunchKernelGGL(clock_block, dim3(1), dim3(1), 0, streams[i], &d_a[i], time_clocks);
-        clock_block<<<1,1,0,streams[i]>>>(&d_a[i], time_clocks);
-        total_clocks += time_clocks;
-
-        // CHECK: checkCudaErrors(hipEventRecord(kernelEvent[i], streams[i]));
-        checkCudaErrors(cudaEventRecord(kernelEvent[i], streams[i]));
-
-        // make the last stream wait for the kernel event to be recorded
-        // CHECK: checkCudaErrors(hipStreamWaitEvent(streams[nstreams-1], kernelEvent[i],0));
-        checkCudaErrors(cudaStreamWaitEvent(streams[nstreams-1], kernelEvent[i],0));
-    }
-
-    // queue a sum kernel and a copy back to host in the last stream.
-    // the commands in this stream get dispatched as soon as all the kernel events have been recorded
-    // CHECK: hipLaunchKernelGGL(sum, dim3(1), dim3(32), 0, streams[nstreams-1], d_a, nkernels);
-    // CHECK: checkCudaErrors(hipMemcpyAsync(a, d_a, sizeof(clock_t), hipMemcpyDeviceToHost, streams[nstreams-1]));
-    sum<<<1,32,0,streams[nstreams-1]>>>(d_a, nkernels);
-    checkCudaErrors(cudaMemcpyAsync(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost, streams[nstreams-1]));
-
-    // at this point the CPU has dispatched all work for the GPU and can continue processing other tasks in parallel
-
-    // in this sample we just wait until the GPU is done
-    // CHECK: checkCudaErrors(hipEventRecord(stop_event, 0));
-    // CHECK: checkCudaErrors(hipEventSynchronize(stop_event));
-    // CHECK: checkCudaErrors(hipEventElapsedTime(&elapsed_time, start_event, stop_event));
-    checkCudaErrors(cudaEventRecord(stop_event, 0));
-    checkCudaErrors(cudaEventSynchronize(stop_event));
-    checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
-
-    printf("Expected time for serial execution of %d kernels = %.3fs\n", nkernels, nkernels * kernel_time/1000.0f);
-    printf("Expected time for concurrent execution of %d kernels = %.3fs\n", nkernels, kernel_time/1000.0f);
-    printf("Measured time for sample = %.3fs\n", elapsed_time/1000.0f);
-
-    bool bTestResult  = (a[0] > total_clocks);
-
-    // release resources
-    for (int i = 0; i < nkernels; i++)
-    {
-        // CHECK: hipStreamDestroy(streams[i]);
-        // CHECK: hipEventDestroy(kernelEvent[i]);
-        cudaStreamDestroy(streams[i]);
-        cudaEventDestroy(kernelEvent[i]);
-    }
-
-    free(streams);
-    free(kernelEvent);
-
-    // CHECK: hipEventDestroy(start_event);
-    // CHECK: hipEventDestroy(stop_event);
-    // CHECK: hipHostFree(a);
-    // CHECK: hipFree(d_a);
-    cudaEventDestroy(start_event);
-    cudaEventDestroy(stop_event);
-    cudaFreeHost(a);
-    cudaFree(d_a);
-
-    if (!bTestResult)
-    {
-        printf("Test failed!\n");
-        exit(EXIT_FAILURE);
-    }
-
-    printf("Test passed\n");
-    exit(EXIT_SUCCESS);
-}
@@ -38,7 +38,6 @@ if(status != cudaSuccess) { \
 }

 __global__ void Inc1(float *Ad, float *Bd){
-  // CHECK: int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
 	int tx = threadIdx.x + blockIdx.x * blockDim.x;
 	if(tx < 1 ){
 		for(int i=0;i<ITER;i++){
@@ -51,7 +50,6 @@ __global__ void Inc1(float *Ad, float *Bd){
 }

 __global__ void Inc2(float *Ad, float *Bd){
-  // CHECK: int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
 	int tx = threadIdx.x + blockIdx.x * blockDim.x;
 	if(tx < 1024){
 		for(int i=0;i<ITER;i++){
@@ -0,0 +1,6 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// CHECK: #include <hip/hip_runtime.h>
+#include <cuda.h>
+// CHECK-NOT: #include<cuda_runtime.h>
+#include <cuda_runtime.h>
@@ -0,0 +1,6 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// CHECK: #include <hip/hip_runtime.h>
+#include <cuda_runtime.h>
+// CHECK-NOT: #include<cuda.h>
+#include <cuda.h>
@@ -0,0 +1,10 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// CHECK: #pragma once
+// CHECK-NEXT: #include <hip/hip_runtime.h>
+#pragma once
+// CHECK-NOT: #include <hip/hip_runtime.h>
+int main(int argc, char* argv[]) {
+  return 0;
+}
+
@@ -0,0 +1,12 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// CHECK: #include <hip/hip_runtime.h>
+// CHECK-NEXT: #include <stdio.h>
+// CHECK-NEXT: #include <iostream>
+#include <stdio.h>
+#include <iostream>
+// CHECK-NOT: #include <hip/hip_runtime.h>
+int main(int argc, char* argv[]) {
+  return 0;
+}
+
@@ -0,0 +1,12 @@
+// RUN: %run_test hipify "%s" "%t" %cuda_args
+
+// CHECK: #pragma once
+// CHECK-NEXT: #include <hip/hip_runtime.h>
+#pragma once
+// CHECK-NOT: #include <hip/hip_runtime.h>
+#include <stdio.h>
+
+int main(int argc, char* argv[]) {
+  return 0;
+}
+
@@ -13,7 +13,7 @@ import lit.util
 config.name = 'hipify'

 # suffixes: CUDA source is only supported
-config.suffixes = ['.cu']
+config.suffixes = ['.cu','.cuh','.cpp','.c','.hpp','.h']

 # testFormat: The test format to use to interpret tests.
 config.test_format = lit.formats.ShTest()
@@ -44,8 +44,17 @@ if obj_root is not None:
    path = os.path.pathsep.join((llvm_tools_dir, config.environment['PATH']))
    config.environment['PATH'] = path

-config.substitutions.append(("hipify", obj_root+"/hipify-clang"))
+hipify_path = obj_root
+clang_args = "-x cuda -v --cuda-gpu-arch=sm_30 --cuda-path='%s'"

-# Clang args for CUDA...
-config.substitutions.append(("%cuda_args", "-x cuda --cuda-path=%s --cuda-gpu-arch=sm_30 -isystem%s/samples/common/inc" % (config.cuda_root, config.cuda_root)))
-config.substitutions.append(("%run_test", config.test_source_root + "/run_test.sh"))
+if sys.platform in ['win32']:
+    run_test_ext = ".bat"
+    hipify_path += "/" + config.build_type
+    clang_args += " -isystem'%s'/common/inc -std=c++14"
+else:
+    run_test_ext = ".sh"
+    clang_args += " -isystem'%s'/samples/common/inc"
+
+config.substitutions.append(("%cuda_args", clang_args % (config.cuda_root, config.cuda_sdk_root)))
+config.substitutions.append(("hipify", '"' + hipify_path + "/hipify-clang" + '"'))
+config.substitutions.append(("%run_test", '"' + config.test_source_root + "/run_test" + run_test_ext + '"'))
@@ -1,8 +1,23 @@
 import sys
+import os

 config.llvm_tools_dir = "@LLVM_TOOLS_BINARY_DIR@"
 config.obj_root = "@CMAKE_CURRENT_BINARY_DIR@"
 config.cuda_root = "@CUDA_TOOLKIT_ROOT_DIR@"
+if sys.platform in ['win32']:
+    config.cuda_sdk_root = "@CUDA_SDK_ROOT_DIR@"
+    if not config.cuda_sdk_root or config.cuda_sdk_root == "CUDA_SDK_ROOT_DIR-NOTFOUND":
+        cuda_version = "@CUDA_VERSION@"
+        cuda_version = cuda_version.replace('.','_')
+        config.cuda_samples_root = os.environ.get('NVCUDASAMPLES' + cuda_version + '_ROOT')
+        if not config.cuda_samples_root:
+            lit_config.fatal('No CUDA Samples dir set! Please set CUDA_SDK_ROOT_DIR.')
+        config.cuda_sdk_root = config.cuda_samples_root
+    config.build_type = "@CMAKE_BUILD_TYPE@"
+    if not config.build_type:
+        config.build_type = "Debug"
+else:
+    config.cuda_sdk_root = config.cuda_root

 # Support substitution of the tools and libs dirs with user parameters. This is
 # used when we can't determine the tool dir at configuration time.
@@ -0,0 +1,18 @@
+@echo off
+setlocal
+
+for %%i in (FileCheck.exe) do set FILE_CHECK=%%~$PATH:i
+if not defined FILE_CHECK (echo     Error: FileCheck.exe not found in PATH. && exit /b 1)
+
+set HIPIFY=%1
+set IN_FILE=%2
+set TMP_FILE=%3
+
+set all_args=%*
+call set clang_args=%%all_args:*%4=%%
+set clang_args=%4%clang_args%
+
+%HIPIFY% -o=%TMP_FILE% %IN_FILE% -- %clang_args%
+if errorlevel 1 (echo      Error: hipify-clang.exe failed with exit code: %errorlevel% && exit /b %errorlevel%)
+%FILE_CHECK% %IN_FILE% -input-file=%TMP_FILE%
+if errorlevel 1 (echo      Error: FileCheck.exe failed with exit code: %errorlevel% && exit /b %errorlevel%)
@@ -13,16 +13,5 @@ shift 3

 # Remaining args are the ones to forward to clang proper.

-# Time for the classic insane little trick for making colour output work.
-# A self-deleting shell-script that does the thing we want to do...
-TMP_SCRIPT=$(mktemp)
-cat << EOF > $TMP_SCRIPT
-set -o errexit
-set -o xtrace
-rm $TMP_SCRIPT
 $HIPIFY -o=$TMP_FILE $IN_FILE -- $@ && cat $TMP_FILE | sed -Ee 's|//.+|// |g' | FileCheck $IN_FILE
-EOF
-chmod a+x $TMP_SCRIPT

-# Run the script via socat, spawning a virtual terminal and propagating exit code, and hence failure.
-socat -du EXEC:$TMP_SCRIPT,pty,stderr STDOUT
@@ -41,8 +41,6 @@ template <typename T>
 __global__ void
 vector_square(T *C_d, const T *A_d, size_t N)
 {
-    // CHECK: size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    // CHECK: size_t stride = hipBlockDim_x * hipGridDim_x;
    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
    size_t stride = blockDim.x * gridDim.x;

@@ -21,7 +21,7 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s ../test_common.cpp 
+ * BUILD: %t %s ../test_common.cpp NVCC_OPTIONS --Wno-deprecated-declarations 
 * RUN: %t
 * HIT_END
 */
@@ -18,7 +18,7 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s ../test_common.cpp
+ * BUILD: %t %s ../test_common.cpp NVCC_OPTIONS --Wno-deprecated-declarations
 * RUN: %t
 * HIT_END
 */
@@ -16,7 +16,7 @@ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTI
 THE USE OR OTHER DEALINGS IN THE SOFTWARE. */

 /* HIT_START
- * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s
 * RUN: %t
 * HIT_END
 */
@@ -397,32 +397,30 @@ int main(int argc, char *argv[])
    if (gpuCount < 2)
    {
        printf("P2P application requires atleast 2 gpu devices\n");
-        return 0;
-    }
+    } else {
+        if (p_tests & 0x100) {
+            testPeerHostToDevice(false/*useAsyncCopy*/);
+        }
+        testPeerHostToDevice(true/*useAsyncCopy*/);

-    if (p_tests & 0x100) {
-        testPeerHostToDevice(false/*useAsyncCopy*/);
-    }
-    testPeerHostToDevice(true/*useAsyncCopy*/);
+        if (p_tests & 0x1) {
+            enablePeerFirst(false/*useAsyncCopy*/);
+        }

-    if (p_tests & 0x1) {
-        enablePeerFirst(false/*useAsyncCopy*/);
-    }
+        if (p_tests & 0x2) {
+            allocMemoryFirst(false/*useAsyncCopy*/);
+        }

-    if (p_tests & 0x2) {
-        allocMemoryFirst(false/*useAsyncCopy*/);
-    }
+        if (p_tests & 0x4) {
+            simpleNegative();
+        }

-    if (p_tests & 0x4) {
-        simpleNegative();
+        if (p_tests & 0x8) {
+            enablePeerFirst(true/*useAsyncCopy*/);
+        }
+        if (p_tests & 0x10) {
+            allocMemoryFirst(true/*useAsyncCopy*/);
+        }
    }
-
-    if (p_tests & 0x8) {
-        enablePeerFirst(true/*useAsyncCopy*/);
-    }
-    if (p_tests & 0x10) {
-        allocMemoryFirst(true/*useAsyncCopy*/);
-    }
-
    passed();
 }
@@ -23,7 +23,7 @@ THE SOFTWARE.

 /* HIT_START
 * BUILD: %t %s ../../test_common.cpp
- * RUN: %t EXCLUDE_HIP_PLATFORM nvcc
+ * RUN: %t
 * HIT_END
 */

@@ -23,7 +23,7 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../test_common.cpp
 * RUN: %t
 * HIT_END
 */
@@ -32,7 +32,7 @@ THE SOFTWARE.

 int main()
 {
-    hipDevice_t device;
+   
    size_t Nbytes = N*sizeof(int);
    int numDevices = 0;
    int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d;
@@ -69,8 +69,8 @@ int main()


        HIPCHECK(hipSetDevice(1));
-        HIPCHECK(hipMemcpyDtoD(X_d, A_d,  Nbytes));
-        HIPCHECK(hipMemcpyDtoD(Y_d, B_d,  Nbytes));
+        HIPCHECK(hipMemcpyDtoD((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d,  Nbytes));
+        HIPCHECK(hipMemcpyDtoD((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d,  Nbytes));

        hipLaunchKernel(
          HipTest::vectorADD,
@@ -82,7 +82,7 @@ int main()
          static_cast<const int*>(Y_d),
          Z_d,
          N);
-        HIPCHECK(hipMemcpyDtoH(C_h, Z_d, Nbytes));
+        HIPCHECK(hipMemcpyDtoH(C_h, (hipDeviceptr_t)Z_d, Nbytes));
        HIPCHECK(hipDeviceSynchronize());
        HipTest::checkVectorADD(A_h, B_h, C_h, N);

@@ -32,7 +32,6 @@ THE SOFTWARE.

 int main()
 {
-    hipDevice_t device;
    size_t Nbytes = N*sizeof(int);
    int numDevices = 0;
    int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d;
@@ -70,8 +69,8 @@ int main()

        HIPCHECK(hipStreamCreate(&s));
        HIPCHECK(hipSetDevice(1));
-        HIPCHECK(hipMemcpyDtoDAsync(X_d, A_d, Nbytes, s));
-        HIPCHECK(hipMemcpyDtoDAsync(Y_d, B_d, Nbytes, s));
+        HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes, s));
+        HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes, s));

        hipLaunchKernel(
            HipTest::vectorADD,
@@ -83,7 +82,7 @@ int main()
            static_cast<const int*>(Y_d),
            Z_d,
            N);
-        HIPCHECK(hipMemcpyDtoHAsync(C_h, Z_d, Nbytes, s));
+        HIPCHECK(hipMemcpyDtoHAsync(C_h, (hipDeviceptr_t)Z_d, Nbytes, s));
        HIPCHECK(hipStreamSynchronize(s));
        HIPCHECK(hipDeviceSynchronize());