Re-sync with upstream.

[ROCm/clr commit: 602280eb8b]
2018-10-18 12:27:03 +01:00
@@ -296,53 +296,8 @@ def docker_upload_dockerhub( String local_org, String image_name, String remote_
 String build_config = 'Release'
 String job_name = env.JOB_NAME.toLowerCase( )

-// The following launches 3 builds in parallel: hcc-ctu, hcc-1.6 and cuda
-parallel rocm_1_9:
-{
-  node('hip-rocm')
-  {
-    String hcc_ver = 'rocm-1.9.x'
-    String from_image = 'ci_test_nodes/rocm-1.9.x/ubuntu-16.04:latest'
-    String inside_args = '--device=/dev/kfd --device=/dev/dri --group-add=video'
-
-    // Checkout source code, dependencies and version files
-    String source_hip_rel = checkout_and_version( hcc_ver )
-
-    // Create/reuse a docker image that represents the hip build environment
-    def hip_build_image = docker_build_image( hcc_ver, 'hip', '', source_hip_rel, from_image )
-
-    // Print system information for the log
-    hip_build_image.inside( inside_args )
-    {
-      sh  """#!/usr/bin/env bash
-          set -x
-          /opt/rocm/bin/rocm_agent_enumerator -t ALL
-          /opt/rocm/bin/hcc --version
-        """
-    }
-
-    // Conctruct a binary directory path based on build config
-    String build_hip_rel = build_directory_rel( build_config );
-
-    // Build hip inside of the build environment
-    docker_build_inside_image( hip_build_image, inside_args, hcc_ver, '', build_config, source_hip_rel, build_hip_rel )
-
-    // Clean docker build image
-    docker_clean_images( 'hip', docker_build_image_name( ) )
-
-    // After a successful build, upload a docker image of the results
-    /*
-    String hip_image_name = docker_upload_artifactory( hcc_ver, job_name, from_image, source_hip_rel, build_hip_rel )
-    if( params.push_image_to_docker_hub )
-    {
-      docker_upload_dockerhub( job_name, hip_image_name, 'rocm' )
-      docker_clean_images( 'rocm', hip_image_name )
-    }
-    docker_clean_images( job_name, hip_image_name )
-    */
-  }
-},
-rocm_head:
+// The following launches 2 builds in parallel: rocm-head and cuda-9.x
+parallel rocm_head:
 {
  node('hip-rocm')
  {
@@ -53,6 +53,9 @@ sub parse_config_file {
 $verbose = $ENV{'HIPCC_VERBOSE'} // 0;
 # Verbose: 0x1=commands, 0x2=paths, 0x4=hipcc args

+$HIPCC_COMPILE_FLAGS_APPEND=$ENV{'HIPCC_COMPILE_FLAGS_APPEND'};
+$HIPCC_LINK_FLAGS_APPEND=$ENV{'HIPCC_LINK_FLAGS_APPEND'};
+
 $HIP_PATH=$ENV{'HIP_PATH'} // dirname (dirname $0); # use parent directory of hipcc
 $HIP_VDI_HOME=$ENV{'HIP_VDI_HOME'};
 $HIP_CLANG_PATH=$ENV{'HIP_CLANG_PATH'};
@@ -727,7 +730,16 @@ if ($needHipHcc) {
 # Reason is that NVCC uses the file extension to determine whether to compile in CUDA mode or
 # pass-through CPP mode.

+if ($HIP_PLATFORM eq "clang") {
+    $HIPLDFLAGS .= " -lgcc_s -lgcc -lpthread -lm";
+}

+if ($HIPCC_COMPILE_FLAGS_APPEND) {
+    $HIPCXXFLAGS .= " $HIPCC_COMPILE_FLAGS_APPEND";
+}
+if ($HIPCC_LINK_FLAGS_APPEND) {
+    $HIPLDFLAGS .= " $HIPCC_LINK_FLAGS_APPEND";
+}

 my $CMD="$HIPCC";
 if ($needCXXFLAGS) {
@@ -737,9 +749,6 @@ if ($needLDFLAGS and not $compileOnly) {
    $CMD .= " $HIPLDFLAGS";
 }
 $CMD .= " $toolArgs";
-if ($needLDFLAGS and not $compileOnly and $HIP_PLATFORM eq "clang") {
-    $CMD .= " -lgcc_s -lgcc -lpthread -lm";
-}

 if ($verbose & 0x1) {
    print "hipcc-cmd: ", $CMD, "\n";
@@ -45,6 +45,8 @@ and provides practical suggestions on how to port CUDA code and work through com
    + [/usr/include/c++/v1/memory:5172:15: error: call to implicitly deleted default constructor of 'std::__1::bad_weak_ptr' throw bad_weak_ptr();](#usrincludecv1memory517215-error-call-to-implicitly-deleted-default-constructor-of-std__1bad_weak_ptr-throw-bad_weak_ptr)
  * [HIP Environment Variables](#hip-environment-variables)
  * [Editor Highlighting](#editor-highlighting)
+  * [CUDA to HIP Math Library Equivalents](#library-equivalents)
+  

 <!-- tocstop -->

@@ -53,8 +55,8 @@ and provides practical suggestions on how to port CUDA code and work through com
 ### General Tips
 - Starting the port on a Cuda machine is often the easiest approach, since you can incrementally port pieces of the code to HIP while leaving the rest in Cuda. (Recall that on Cuda machines HIP is just a thin layer over Cuda, so the two code types can interoperate on nvcc platforms.) Also, the HIP port can be compared with the original Cuda code for function and performance.
 - Once the Cuda code is ported to HIP and is running on the Cuda machine, compile the HIP code using hcc on an AMD machine.
- HIP ports can replace Cuda versions---HIP can deliver the same performance as a native Cuda implementation, with the benefit of portability to both Nvidia and AMD architectures as well as a path to future C++ standard support. You can handle platform-specific features through conditional compilation or by adding them to the open-source HIP infrastructure.
- Use **bin/hipconvertinplace.sh** to hipify all code files in the Cuda source directory.
+- HIP ports can replace Cuda versions: HIP can deliver the same performance as a native Cuda implementation, with the benefit of portability to both Nvidia and AMD architectures as well as a path to future C++ standard support. You can handle platform-specific features through conditional compilation or by adding them to the open-source HIP infrastructure.
+- Use **[bin/hipconvertinplace.sh](https://github.com/ROCm-Developer-Tools/HIP/blob/master/bin/hipconvertinplace.sh)** to hipify all code files in the Cuda source directory.

 ### Scanning existing CUDA code to scope the porting effort
 The hipexamine.sh tool will scan a source directory to determine which files contain CUDA code and how much of that code can be automatically hipified, 
@@ -77,7 +79,7 @@ info: TOTAL-converted 89 CUDA->HIP refs( dev:3 mem:32 kern:2 builtin:37 math:0 s
  kernels (1 total) :   kmeansPoint(1)
 ```

-hipexamine scans each code file (cpp, c, h, hpp, etc) found in the specified directory:
+hipexamine scans each code file (cpp, c, h, hpp, etc.) found in the specified directory:

   * Files with no CUDA code (ie kmeans.h) print one line summary just listing the source file name.
   * Files with CUDA code print a summary of what was found - for example the kmeans_cuda_kernel.cu file:
@@ -85,11 +87,11 @@ hipexamine scans each code file (cpp, c, h, hpp, etc) found in the specified dir
 info: hipify ./kmeans_cuda_kernel.cu =====>
  info: converted 40 CUDA->HIP refs( dev:0 mem:0 kern:0 builtin:37 math:0 stream:0 event:0 
 ```
-* Some of the most interesting information in kmeans_cuda_kernel.cu :
-       * How many CUDA calls were converted to HIP (40)
-       * Breakdown of the different CUDA functionality used (dev:0 mem:0 etc).  This file uses many CUDA builtins (37) and texture functions (3).
-       * Warning for code that looks like CUDA API but was not converted (0 in this file).
-       * Count Lines-of-Code (LOC) - 185 for this file. 
+* Interesting information in kmeans_cuda_kernel.cu :
+  * How many CUDA calls were converted to HIP (40)
+  * Breakdown of the CUDA functionality used (dev:0 mem:0 etc). This file uses many CUDA builtins (37) and texture functions (3).
+   * Warning for code that looks like CUDA API but was not converted (0 in this file).
+   * Count Lines-of-Code (LOC) - 185 for this file. 

 * hipexamine also presents a summary at the end of the process for the statistics collected across all files. This has similar format to the per-file reporting, and also includes a list of all kernels which have been called.  An example from above:

@@ -111,9 +113,9 @@ For each input file FILE, this script will:
 This is useful for testing improvements to the hipify toolset.


-The "hipconvertinplace.sh" script will perform inplace conversion for all code files in the specified directory.
+The [hipconvertinplace.sh](https://github.com/ROCm-Developer-Tools/HIP/blob/master/bin/hipconvertinplace.sh) script will perform inplace conversion for all code files in the specified directory.
 This can be quite handy when dealing with an existing CUDA code base since the script preserves the existing directory structure
-and filenames - so includes work.  After converting in-place, you can review the code to add additional parameters to
+and filenames - and includes work.  After converting in-place, you can review the code to add additional parameters to
 directory names.


@@ -138,7 +140,7 @@ Many projects use a mixture of an accelerator compiler (hcc or nvcc) and a stand
 
 
 ### Identifying the Compiler: hcc, hip-clang or nvcc
-Often, its useful to know whether the underlying compiler is hcc, hip-clang or nvcc. This knowledge can guard platform-specific code (features that only work on the nvcc, hip-clang or hcc path but not all) or aid in platform-specific performance tuning.   
+Often, it's useful to know whether the underlying compiler is hcc, hip-clang or nvcc. This knowledge can guard platform-specific code (features that only work on the nvcc, hip-clang or hcc path but not all) or aid in platform-specific performance tuning.   
 
 ```
 #ifdef __HCC__
@@ -164,7 +166,7 @@ Often, its useful to know whether the underlying compiler is hcc, hip-clang or
 // Compiled with nvcc (Cuda language extensions enabled) 
 ```
 
-hcc and hip-clang directly generates the host code (using the Clang x86 target) and passes the code to another host compiler. Thus, it lacks the equivalent of the \__CUDA_ACC define.
+hcc and hip-clang directly generates the host code (using the Clang x86 target) and passes the code to another host compiler. Thus, they have no equivalent of the \__CUDA_ACC define.
 
 The macro `__HIPCC__` is set if either `__HCC__`, `__HIP__` or `__CUDACC__` is defined. This configuration is useful in determining when code is being compiled using an accelerator-enabled compiler (hcc or nvcc) as opposed to a standard host compiler (GCC, ICC, Clang, etc.).
 
@@ -177,7 +179,7 @@ Both nvcc and hcc make two passes over the code: one for host code and one for d
 #if __HIP_DEVICE_COMPILE__
 ```
 
-Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 1 or undefined, and it doesnt represent the feature capability of the target device.  
+Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 1 or undefined, and it doesn't represent the feature capability of the target device.  


 ### Compiler Defines: Summary
@@ -212,7 +214,7 @@ Some Cuda code tests `__CUDA_ARCH__` for a specific value to determine whether t
 #if (__CUDA_ARCH__ >= 130) 
 // doubles are supported
 ```
-This type of code requires special attention, since hcc/AMD and nvcc/Cuda devices have different architectural capabilities. Moreover, you cant determine the presence of a feature using a simple comparison against an architectures version number. HIP provides a set of defines and device properties to query whether a specific architectural feature is supported.
+This type of code requires special attention, since hcc/AMD and nvcc/Cuda devices have different architectural capabilities. Moreover, you can't determine the presence of a feature using a simple comparison against an architecture's version number. HIP provides a set of defines and device properties to query whether a specific architectural feature is supported.

 The `__HIP_ARCH_*` defines can replace comparisons of `__CUDA_ARCH__` values: 
 ```
@@ -259,9 +261,8 @@ The table below shows the full set of architectural properties that HIP supports
 |`__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__`          |     hasFunnelShift             |Funnel shift two input words into one
 |Sync:                                    |                                      |
 |`__HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__`        |     hasThreadFenceSystem       |threadfence\_system
-|`__HIP_ARCH_HAS_SYNC_THREAD_EXT__`            |     hasSyncThreadsExt         |syncthreads\_count, syncthreads\_and, syncthreads\_or 
-|                                                       
-|Miscellaneous:                                     |                                     |                                                     
+|`__HIP_ARCH_HAS_SYNC_THREAD_EXT__`            |     hasSyncThreadsExt         |syncthreads\_count, syncthreads\_and, syncthreads\_or                                                      
+|Miscellaneous:                                |                                |                                                     
 |`__HIP_ARCH_HAS_SURFACE_FUNCS__`              |   hasSurfaceFuncs              |                                                     
 |`__HIP_ARCH_HAS_3DGRID__`                     |   has3dGrid                    | Grids and groups are 3D                                                     
 |`__HIP_ARCH_HAS_DYNAMIC_PARALLEL__`           |   hasDynamicParallelism        | 
@@ -343,7 +344,7 @@ It also uses a standard compiler (g++) for the rest of the application. nvcc is
 Code compiled using this tool can employ only the intersection of language features supported by both nvcc and the host compiler. 
 In some cases, you must take care to ensure the data types and alignment of the host compiler are identical to those of the device compiler. Only some host compilers are supported---for example, recent nvcc versions lack Clang host-compiler capability.  

-hcc generates both device and host code using the same Clang-based compiler. The code uses the same API as gcc, which allows code generated by different gcc-compatible compilers to be linked together. For example, code compiled using hcc can link with code compiled using "standard" compilers (such as gcc, ICC and Clang). You must take care to ensure all compilers use the same standard C++ header and library formats.
+hcc generates both device and host code using the same Clang-based compiler. The code uses the same API as gcc, which allows code generated by different gcc-compatible compilers to be linked together. For example, code compiled using hcc can link with code compiled using "standard" compilers (such as gcc, ICC and Clang). Take care to ensure all compilers use the same standard C++ header and library formats.


 ### libc++ and libstdc++
@@ -553,7 +554,7 @@ hipcc-cmd: /opt/hcc/bin/hcc  -hc -I/opt/hcc/include -stdlib=libc++ -I../../../..

 #### /usr/include/c++/v1/memory:5172:15: error: call to implicitly deleted default constructor of 'std::__1::bad_weak_ptr' throw bad_weak_ptr();

-If you pass a ".cu" file, hcc will attempt to compile it as a Cuda language file. You must tell hcc that its in fact a C++ file: use the "-x c++" option.
+If you pass a ".cu" file, hcc will attempt to compile it as a Cuda language file. You must tell hcc that it's in fact a C++ file: use the "-x c++" option.


 ### HIP Environment Variables
@@ -577,3 +578,20 @@ HIP_VISIBLE_DEVICES            =  0 : Only devices whose index is present in the

 ### Editor Highlighting
 See the utils/vim or utils/gedit directories to add handy highlighting to hip files.
+
+
+### Library Equivalents
+
+| CUDA Library | ROCm Library | Comment |
+|------- | ---------   | -----   |
+| cuBLAS        |    rocBLAS     | Basic Linear Algebra Subroutines 
+| cuFFT        |    rocFFT     | Fast Fourier Transfer Library   
+| cuSPARSE     |    rocSPARSE   | Sparse BLAS  + SPMV 
+| cuSolver     |    rocSolver   | Lapack library
+| AMG-X    |    rocALUTION   | Sparse iterative solvers and preconditioners with Geometric and Algebraic MultiGrid
+| Thrust    |    hipThrust | C++ parallel algorithms library
+| CUB     |    rocPRIM | Low Level Optimized Parallel Primitives
+| cuDNN    |    MIOpen | Deep learning Solver Library 
+| cuRAND    |    rocRAND | Random Number Generator Library
+| EIGEN    |    EIGEN – HIP port | C++ template library for linear algebra: matrices, vectors, numerical solvers, 
+| NCCL    |    RCCL  | Communications Primitives Library based on the MPI equivalents
@@ -7,7 +7,6 @@ namespace cl = llvm::cl;
 namespace ct = clang::tooling;

 extern cl::OptionCategory ToolTemplateCategory;
-
 extern cl::opt<std::string> OutputFilename;
 extern cl::opt<bool> Inplace;
 extern cl::opt<bool> NoBackup;
@@ -15,5 +14,4 @@ extern cl::opt<bool> NoOutput;
 extern cl::opt<bool> PrintStats;
 extern cl::opt<std::string> OutputStatsFilename;
 extern cl::opt<bool> Examine;
-
 extern cl::extrahelp CommonHelp;
@@ -3,26 +3,25 @@
 #include "llvm/ADT/StringRef.h"
 #include <set>
 #include <map>
-
 #include "Statistics.h"

 #define HIP_UNSUPPORTED true

-/// Maps cuda header names to hip header names.
+// Maps cuda header names to hip header names.
 extern const std::map<llvm::StringRef, hipCounter> CUDA_INCLUDE_MAP;

-/// Maps the names of CUDA types to the corresponding hip types.
+// Maps the names of CUDA types to the corresponding hip types.
 extern const std::map<llvm::StringRef, hipCounter> CUDA_TYPE_NAME_MAP;

-/// Map all other CUDA identifiers (function/macro names, enum values) to hip versions.
+// Map all other CUDA identifiers (function/macro names, enum values) to hip versions.
 extern const std::map<llvm::StringRef, hipCounter> CUDA_IDENTIFIER_MAP;

 /**
- * The union of all the above maps.
- *
- * This should be used rarely, but is still needed to convert macro definitions (which can
- * contain any combination of the above things). AST walkers can usually get away with just
- * looking in the lookup table for the type of element they are processing, however, saving
- * a great deal of time.
- */
+  * The union of all the above maps, except includes.
+  *
+  * This should be used rarely, but is still needed to convert macro definitions (which can
+  * contain any combination of the above things). AST walkers can usually get away with just
+  * looking in the lookup table for the type of element they are processing, however, saving
+  * a great deal of time.
+  */
 const std::map<llvm::StringRef, hipCounter>& CUDA_RENAMES_MAP();
@@ -1,12 +1,8 @@
 #include "HipifyAction.h"
-
-#include <memory>
-
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "clang/ASTMatchers/ASTMatchers.h"
-
 #include "LLVMCompat.h"
 #include "CUDA2HipMap.h"
 #include "StringUtils.h"
@@ -16,180 +12,167 @@ namespace ct = clang::tooling;
 namespace mat = clang::ast_matchers;

 void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
-    clang::SourceManager& SM = getCompilerInstance().getSourceManager();
-
-    size_t begin = 0;
-    while ((begin = s.find("cu", begin)) != StringRef::npos) {
-        const size_t end = s.find_first_of(" ", begin + 4);
-        StringRef name = s.slice(begin, end);
-        const auto found = CUDA_RENAMES_MAP().find(name);
-        if (found != CUDA_RENAMES_MAP().end()) {
-            StringRef repName = found->second.hipName;
-            hipCounter counter = {"[string literal]", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.unsupported};
-            Statistics::current().incrementCounter(counter, name.str());
-
-            if (!counter.unsupported) {
-                clang::SourceLocation sl = start.getLocWithOffset(begin + 1);
-                ct::Replacement Rep(SM, sl, name.size(), repName);
-                clang::FullSourceLoc fullSL(sl, SM);
-                insertReplacement(Rep, fullSL);
-            }
-        }
-
-        if (end == StringRef::npos) {
-            break;
-        }
-
-        begin = end + 1;
+  clang::SourceManager& SM = getCompilerInstance().getSourceManager();
+  size_t begin = 0;
+  while ((begin = s.find("cu", begin)) != StringRef::npos) {
+    const size_t end = s.find_first_of(" ", begin + 4);
+    StringRef name = s.slice(begin, end);
+    const auto found = CUDA_RENAMES_MAP().find(name);
+    if (found != CUDA_RENAMES_MAP().end()) {
+      StringRef repName = found->second.hipName;
+      hipCounter counter = {"[string literal]", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.unsupported};
+      Statistics::current().incrementCounter(counter, name.str());
+      if (!counter.unsupported) {
+        clang::SourceLocation sl = start.getLocWithOffset(begin + 1);
+        ct::Replacement Rep(SM, sl, name.size(), repName);
+        clang::FullSourceLoc fullSL(sl, SM);
+        insertReplacement(Rep, fullSL);
+      }
    }
+    if (end == StringRef::npos) {
+      break;
+    }
+    begin = end + 1;
+  }
 }

 /**
- * Look at, and consider altering, a given token.
- *
- * If it's not a CUDA identifier, nothing happens.
- * If it's an unsupported CUDA identifier, a warning is emitted.
- * Otherwise, the source file is updated with the corresponding hipification.
- */
+  * Look at, and consider altering, a given token. 
+  *
+  * If it's not a CUDA identifier, nothing happens.
+  * If it's an unsupported CUDA identifier, a warning is emitted.
+  * Otherwise, the source file is updated with the corresponding hipification.
+  */
 void HipifyAction::RewriteToken(const clang::Token& t) {
-    clang::SourceManager& SM = getCompilerInstance().getSourceManager();
-
-    // String literals containing CUDA references need fixing...
-    if (t.is(clang::tok::string_literal)) {
-        StringRef s(t.getLiteralData(), t.getLength());
-        RewriteString(unquoteStr(s), t.getLocation());
-        return;
-    } else if (!t.isAnyIdentifier()) {
-        // If it's neither a string nor an identifier, we don't care.
-        return;
-    }
-
-    StringRef name = t.getRawIdentifier();
-    const auto found = CUDA_RENAMES_MAP().find(name);
-    if (found == CUDA_RENAMES_MAP().end()) {
-        // So it's an identifier, but not CUDA? Boring.
-        return;
-    }
-
-    Statistics::current().incrementCounter(found->second, name.str());
-
-    clang::SourceLocation sl = t.getLocation();
-    if (found->second.unsupported) {
-        // An unsupported identifier? Curses! Warn the user.
-        clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
-        const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "CUDA identifier unsupported in hip");
-        DE.Report(sl, ID);
-        return;
-    }
-
-    StringRef repName = found->second.hipName;
-    ct::Replacement Rep(SM, sl, name.size(), repName);
-    clang::FullSourceLoc fullSL(sl, SM);
-    insertReplacement(Rep, fullSL);
+  clang::SourceManager& SM = getCompilerInstance().getSourceManager();
+  // String literals containing CUDA references need fixing...
+  if (t.is(clang::tok::string_literal)) {
+    StringRef s(t.getLiteralData(), t.getLength());
+    RewriteString(unquoteStr(s), t.getLocation());
+    return;
+  } else if (!t.isAnyIdentifier()) {
+    // If it's neither a string nor an identifier, we don't care.
+    return;
+  }
+  StringRef name = t.getRawIdentifier();
+  const auto found = CUDA_RENAMES_MAP().find(name);
+  if (found == CUDA_RENAMES_MAP().end()) {
+    // So it's an identifier, but not CUDA? Boring.
+    return;
+  }
+  Statistics::current().incrementCounter(found->second, name.str());
+  clang::SourceLocation sl = t.getLocation();
+  if (found->second.unsupported) {
+    // Warn the user about unsupported identifier.
+    clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
+    const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "CUDA identifier unsupported in hip");
+    DE.Report(sl, ID);
+    return;
+  }
+  StringRef repName = found->second.hipName;
+  ct::Replacement Rep(SM, sl, name.size(), repName);
+  clang::FullSourceLoc fullSL(sl, SM);
+  insertReplacement(Rep, fullSL);
 }

 namespace {

 clang::SourceRange getReadRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
-    clang::SourceLocation begin = exprRange.getBegin();
-    clang::SourceLocation end = exprRange.getEnd();
+  clang::SourceLocation begin = exprRange.getBegin();
+  clang::SourceLocation end = exprRange.getEnd();

-    bool beginSafe = !SM.isMacroBodyExpansion(begin) || clang::Lexer::isAtStartOfMacroExpansion(begin, SM, clang::LangOptions{});
-    bool endSafe = !SM.isMacroBodyExpansion(end) || clang::Lexer::isAtEndOfMacroExpansion(end, SM, clang::LangOptions{});
+  bool beginSafe = !SM.isMacroBodyExpansion(begin) || clang::Lexer::isAtStartOfMacroExpansion(begin, SM, clang::LangOptions{});
+  bool endSafe = !SM.isMacroBodyExpansion(end) || clang::Lexer::isAtEndOfMacroExpansion(end, SM, clang::LangOptions{});

-    if (beginSafe && endSafe) {
-        return {SM.getFileLoc(begin), SM.getFileLoc(end)};
-    } else {
-        return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)};
-    }
+  if (beginSafe && endSafe) {
+    return {SM.getFileLoc(begin), SM.getFileLoc(end)};
+  } else {
+    return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)};
+  }
 }

-
 clang::SourceRange getWriteRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
-    clang::SourceLocation begin = exprRange.getBegin();
-    clang::SourceLocation end = exprRange.getEnd();
-
-    // If the range is contained within a macro, update the macro definition.
-    // Otherwise, use the file location and hope for the best.
-    if (!SM.isMacroBodyExpansion(begin) || !SM.isMacroBodyExpansion(end)) {
-        return {SM.getFileLoc(begin), SM.getFileLoc(end)};
-    }
-
-    return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)};
+  clang::SourceLocation begin = exprRange.getBegin();
+  clang::SourceLocation end = exprRange.getEnd();
+  // If the range is contained within a macro, update the macro definition.
+  // Otherwise, use the file location and hope for the best.
+  if (!SM.isMacroBodyExpansion(begin) || !SM.isMacroBodyExpansion(end)) {
+    return {SM.getFileLoc(begin), SM.getFileLoc(end)};
+  }
+  return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)};
 }

-
 StringRef readSourceText(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
-    return clang::Lexer::getSourceText(clang::CharSourceRange::getTokenRange(getReadRange(SM, exprRange)), SM, clang::LangOptions(), nullptr);
+  return clang::Lexer::getSourceText(clang::CharSourceRange::getTokenRange(getReadRange(SM, exprRange)), SM, clang::LangOptions(), nullptr);
 }

 /**
- * Get a string representation of the expression `arg`, unless it's a defaulting function
- * call argument, in which case get a 0. Used for building argument lists to kernel calls.
- */
+  * Get a string representation of the expression `arg`, unless it's a defaulting function
+  * call argument, in which case get a 0. Used for building argument lists to kernel calls.
+  */
 std::string stringifyZeroDefaultedArg(clang::SourceManager& SM, const clang::Expr* arg) {
-    if (clang::isa<clang::CXXDefaultArgExpr>(arg)) {
-        return "0";
-    } else {
-        return readSourceText(SM, arg->getSourceRange());
-    }
+  if (clang::isa<clang::CXXDefaultArgExpr>(arg)) {
+    return "0";
+  } else {
+    return readSourceText(SM, arg->getSourceRange());
+  }
 }

 } // anonymous namespace

 bool HipifyAction::Exclude(const hipCounter & hipToken) {
-    switch (hipToken.type) {
-        case CONV_INCLUDE_CUDA_MAIN_H:
-            switch (hipToken.apiType) {
-                case API_DRIVER:
-                case API_RUNTIME:
-                    if (insertedRuntimeHeader) { return true; }
-                    insertedRuntimeHeader = true;
-                    return false;
-                case API_BLAS:
-                  if (insertedBLASHeader) { return true; }
-                    insertedBLASHeader = true;
-                    return false;
-                case API_RAND:
-                    if (hipToken.hipName == "hiprand_kernel.h") {
-                        if (insertedRAND_kernelHeader) { return true; }
-                        insertedRAND_kernelHeader = true;
-                        return false;
-                    } else if (hipToken.hipName == "hiprand.h") {
-                        if (insertedRANDHeader) { return true; }
-                        insertedRANDHeader = true;
-                        return false;
-                    }
-                case API_DNN:
-                  if (insertedDNNHeader) { return true; }
-                    insertedDNNHeader = true;
-                    return false;
-                case API_FFT:
-                  if (insertedFFTHeader) { return true; }
-                  insertedFFTHeader = true;
-                  return false;
-                case API_COMPLEX:
-                  if (insertedComplexHeader) { return true; }
-                  insertedComplexHeader = true;
-                  return false;
-                default:
-                    return false;
-            }
+  switch (hipToken.type) {
+    case CONV_INCLUDE_CUDA_MAIN_H:
+      switch (hipToken.apiType) {
+        case API_DRIVER:
+        case API_RUNTIME:
+          if (insertedRuntimeHeader) { return true; }
+          insertedRuntimeHeader = true;
+          return false;
+        case API_BLAS:
+          if (insertedBLASHeader) { return true; }
+          insertedBLASHeader = true;
+          return false;
+        case API_RAND:
+          if (hipToken.hipName == "hiprand_kernel.h") {
+            if (insertedRAND_kernelHeader) { return true; }
+            insertedRAND_kernelHeader = true;
            return false;
-        case CONV_INCLUDE:
-            switch (hipToken.apiType) {
-                case API_RAND:
-                    if (insertedRAND_kernelHeader) { return true; }
-                    insertedRAND_kernelHeader = true;
-                    return false;
-                default:
-                    return false;
-            }
+          } else if (hipToken.hipName == "hiprand.h") {
+            if (insertedRANDHeader) { return true; }
+            insertedRANDHeader = true;
            return false;
+          }
+        case API_DNN:
+          if (insertedDNNHeader) { return true; }
+          insertedDNNHeader = true;
+          return false;
+        case API_FFT:
+          if (insertedFFTHeader) { return true; }
+          insertedFFTHeader = true;
+          return false;
+        case API_COMPLEX:
+          if (insertedComplexHeader) { return true; }
+          insertedComplexHeader = true;
+          return false;
        default:
-            return false;
-    }
-    return false;
+          return false;
+      }
+      return false;
+    case CONV_INCLUDE:
+      switch (hipToken.apiType) {
+        case API_RAND:
+          if (insertedRAND_kernelHeader) { return true; }
+          insertedRAND_kernelHeader = true;
+          return false;
+        default:
+          return false;
+      }
+      return false;
+    default:
+      return false;
+  }
+  return false;
 }

 void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
@@ -199,286 +182,263 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
                                      clang::CharSourceRange filename_range,
                                      const clang::FileEntry*, StringRef,
                                      StringRef, const clang::Module*) {
-    clang::SourceManager& SM = getCompilerInstance().getSourceManager();
-    if (!SM.isWrittenInMainFile(hash_loc)) {
-        return;
-    }
-    if (!firstHeader) {
-        firstHeader = true;
-        firstHeaderLoc = hash_loc;
-    }
+  clang::SourceManager& SM = getCompilerInstance().getSourceManager();
+  if (!SM.isWrittenInMainFile(hash_loc)) {
+    return;
+  }
+  if (!firstHeader) {
+    firstHeader = true;
+    firstHeaderLoc = hash_loc;
+  }
+  const auto found = CUDA_INCLUDE_MAP.find(file_name);
+  if (found == CUDA_INCLUDE_MAP.end()) {
+    return;
+  }
+  bool exclude = Exclude(found->second);
+  Statistics::current().incrementCounter(found->second, file_name.str());

-    const auto found = CUDA_INCLUDE_MAP.find(file_name);
-    if (found == CUDA_INCLUDE_MAP.end()) {
-        return;
-    }
+  clang::SourceLocation sl = filename_range.getBegin();
+  if (found->second.unsupported) {
+    clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
+    DE.Report(sl, DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Unsupported CUDA header"));
+    return;
+  }

-    bool exclude = Exclude(found->second);
-
-    Statistics::current().incrementCounter(found->second, file_name.str());
-
-    clang::SourceLocation sl = filename_range.getBegin();
-    if (found->second.unsupported) {
-        clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
-        DE.Report(sl, DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Unsupported CUDA header"));
-        return;
-    }
-
-    clang::StringRef newInclude;
-
-    // Keep the same include type that the user gave.
-    if (!exclude) {
-        clang::SmallString<128> includeBuffer;
-        if (is_angled) {
-            newInclude = llvm::Twine("<" + found->second.hipName + ">").toStringRef(includeBuffer);
-        } else {
-            newInclude = llvm::Twine("\"" + found->second.hipName + "\"").toStringRef(includeBuffer);
-        }
+  clang::StringRef newInclude;
+  // Keep the same include type that the user gave.
+  if (!exclude) {
+    clang::SmallString<128> includeBuffer;
+    if (is_angled) {
+      newInclude = llvm::Twine("<" + found->second.hipName + ">").toStringRef(includeBuffer);
    } else {
-        // hashLoc is location of the '#', thus replacing the whole include directive by empty newInclude starting with '#'.
-        sl = hash_loc;
+      newInclude = llvm::Twine("\"" + found->second.hipName + "\"").toStringRef(includeBuffer);
    }
-    const char *B = SM.getCharacterData(sl);
-    const char *E = SM.getCharacterData(filename_range.getEnd());
-    ct::Replacement Rep(SM, sl, E - B, newInclude);
-    insertReplacement(Rep, clang::FullSourceLoc{sl, SM});
+  } else {
+    // hashLoc is location of the '#', thus replacing the whole include directive by empty newInclude starting with '#'.
+    sl = hash_loc;
+  }
+  const char *B = SM.getCharacterData(sl);
+  const char *E = SM.getCharacterData(filename_range.getEnd());
+  ct::Replacement Rep(SM, sl, E - B, newInclude);
+  insertReplacement(Rep, clang::FullSourceLoc{sl, SM});
 }

 void HipifyAction::PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) {
-    if (pragmaOnce) {
-        return;
-    }
-    clang::SourceManager& SM = getCompilerInstance().getSourceManager();
-    if (!SM.isWrittenInMainFile(Loc)) {
-        return;
-    }
-    clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
-    const clang::Token tok = PP.LookAhead(0);
-    StringRef Text(SM.getCharacterData(tok.getLocation()), tok.getLength());
-    if (Text == "once") {
-        pragmaOnce = true;
-        pragmaOnceLoc = PP.LookAhead(1).getLocation();
-    }
+  if (pragmaOnce) {
+    return;
+  }
+  clang::SourceManager& SM = getCompilerInstance().getSourceManager();
+  if (!SM.isWrittenInMainFile(Loc)) {
+    return;
+  }
+  clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
+  const clang::Token tok = PP.LookAhead(0);
+  StringRef Text(SM.getCharacterData(tok.getLocation()), tok.getLength());
+  if (Text == "once") {
+    pragmaOnce = true;
+    pragmaOnceLoc = PP.LookAhead(1).getLocation();
+  }
 }

 bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
-    StringRef refName = "cudaLaunchKernel";
+  StringRef refName = "cudaLaunchKernel";
+  const auto* launchKernel = Result.Nodes.getNodeAs<clang::CUDAKernelCallExpr>(refName);
+  if (!launchKernel) {
+    return false;
+  }
+  clang::SmallString<40> XStr;
+  llvm::raw_svector_ostream OS(XStr);
+  clang::LangOptions DefaultLangOptions;
+  clang::SourceManager* SM = Result.SourceManager;

-    const auto* launchKernel = Result.Nodes.getNodeAs<clang::CUDAKernelCallExpr>(refName);
-    if (!launchKernel) {
-        return false;
-    }
+  const clang::Expr& calleeExpr = *(launchKernel->getCallee());
+  OS << "hipLaunchKernelGGL(" << readSourceText(*SM, calleeExpr.getSourceRange()) << ", ";

-    clang::SmallString<40> XStr;
-    llvm::raw_svector_ostream OS(XStr);
+  // Next up are the four kernel configuration parameters, the last two of which are optional and default to zero.
+  const clang::CallExpr& config = *(launchKernel->getConfig());

-    clang::LangOptions DefaultLangOptions;
-    clang::SourceManager* SM = Result.SourceManager;
+  // Copy the two dimensional arguments verbatim.
+  OS << "dim3(" << readSourceText(*SM, config.getArg(0)->getSourceRange()) << "), ";
+  OS << "dim3(" << readSourceText(*SM, config.getArg(1)->getSourceRange()) << "), ";

-    const clang::Expr& calleeExpr = *(launchKernel->getCallee());
-    OS << "hipLaunchKernelGGL(" << readSourceText(*SM, calleeExpr.getSourceRange()) << ", ";
+  // The stream/memory arguments default to zero if omitted.
+  OS << stringifyZeroDefaultedArg(*SM, config.getArg(2)) << ", ";
+  OS << stringifyZeroDefaultedArg(*SM, config.getArg(3));

-    // Next up are the four kernel configuration parameters, the last two of which are optional and default to zero.
-    const clang::CallExpr& config = *(launchKernel->getConfig());
+  // If there are ordinary arguments to the kernel, just copy them verbatim into our new call.
+  int numArgs = launchKernel->getNumArgs();
+  if (numArgs > 0) {
+    OS << ", ";
+    // Start of the first argument.
+    clang::SourceLocation argStart = launchKernel->getArg(0)->getLocStart();
+    // End of the last argument.
+    clang::SourceLocation argEnd = launchKernel->getArg(numArgs - 1)->getLocEnd();
+    OS << readSourceText(*SM, {argStart, argEnd});
+  }
+  OS << ")";

-    // Copy the two dimensional arguments verbatim.
-    OS << "dim3(" << readSourceText(*SM, config.getArg(0)->getSourceRange()) << "), ";
-    OS << "dim3(" << readSourceText(*SM, config.getArg(1)->getSourceRange()) << "), ";
-
-    // The stream/memory arguments default to zero if omitted.
-    OS << stringifyZeroDefaultedArg(*SM, config.getArg(2)) << ", ";
-    OS << stringifyZeroDefaultedArg(*SM, config.getArg(3));
-
-    // If there are ordinary arguments to the kernel, just copy them verbatim into our new call.
-    int numArgs = launchKernel->getNumArgs();
-    if (numArgs > 0) {
-        OS << ", ";
-
-        // Start of the first argument.
-        clang::SourceLocation argStart = launchKernel->getArg(0)->getLocStart();
-
-        // End of the last argument.
-        clang::SourceLocation argEnd = launchKernel->getArg(numArgs - 1)->getLocEnd();
-
-        OS << readSourceText(*SM, {argStart, argEnd});
-    }
-
-    OS << ")";
-
-    clang::SourceRange replacementRange = getWriteRange(*SM, {launchKernel->getLocStart(), launchKernel->getLocEnd()});
-    clang::SourceLocation launchStart = replacementRange.getBegin();
-    clang::SourceLocation launchEnd = replacementRange.getEnd();
-
-    size_t length = SM->getCharacterData(clang::Lexer::getLocForEndOfToken(launchEnd, 0, *SM, DefaultLangOptions)) - SM->getCharacterData(launchStart);
-
-    ct::Replacement Rep(*SM, launchStart, length, OS.str());
-    clang::FullSourceLoc fullSL(launchStart, *SM);
-    insertReplacement(Rep, fullSL);
-    hipCounter counter = {"hipLaunchKernelGGL", ConvTypes::CONV_KERN, ApiTypes::API_RUNTIME};
-    Statistics::current().incrementCounter(counter, refName.str());
-
-    return true;
+  clang::SourceRange replacementRange = getWriteRange(*SM, {launchKernel->getLocStart(), launchKernel->getLocEnd()});
+  clang::SourceLocation launchStart = replacementRange.getBegin();
+  clang::SourceLocation launchEnd = replacementRange.getEnd();
+  size_t length = SM->getCharacterData(clang::Lexer::getLocForEndOfToken(launchEnd, 0, *SM, DefaultLangOptions)) - SM->getCharacterData(launchStart);
+  ct::Replacement Rep(*SM, launchStart, length, OS.str());
+  clang::FullSourceLoc fullSL(launchStart, *SM);
+  insertReplacement(Rep, fullSL);
+  hipCounter counter = {"hipLaunchKernelGGL", ConvTypes::CONV_KERN, ApiTypes::API_RUNTIME};
+  Statistics::current().incrementCounter(counter, refName.str());
+  return true;
 }

 bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
-    StringRef refName = "cudaSharedIncompleteArrayVar";
-    auto* sharedVar = Result.Nodes.getNodeAs<clang::VarDecl>(refName);
-    if (!sharedVar) {
-        return false;
-    }
+  StringRef refName = "cudaSharedIncompleteArrayVar";
+  auto* sharedVar = Result.Nodes.getNodeAs<clang::VarDecl>(refName);
+  if (!sharedVar) {
+    return false;
+  }
+  // Example: extern __shared__ uint sRadix1[];
+  if (!sharedVar->hasExternalFormalLinkage()) {
+    return false;
+  }

-    // Example: extern __shared__ uint sRadix1[];
-    if (!sharedVar->hasExternalFormalLinkage()) {
-        return false;
+  clang::QualType QT = sharedVar->getType();
+  std::string typeName;
+  if (QT->isIncompleteArrayType()) {
+    const clang::ArrayType* AT = QT.getTypePtr()->getAsArrayTypeUnsafe();
+    QT = AT->getElementType();
+    if (QT.getTypePtr()->isBuiltinType()) {
+      QT = QT.getCanonicalType();
+      const auto* BT = clang::dyn_cast<clang::BuiltinType>(QT);
+      if (BT) {
+        clang::LangOptions LO;
+        LO.CUDA = true;
+        clang::PrintingPolicy policy(LO);
+        typeName = BT->getName(policy);
+      }
+    } else {
+      typeName = QT.getAsString();
    }
+  }

-    clang::QualType QT = sharedVar->getType();
-    std::string typeName;
-    if (QT->isIncompleteArrayType()) {
-        const clang::ArrayType* AT = QT.getTypePtr()->getAsArrayTypeUnsafe();
-        QT = AT->getElementType();
-        if (QT.getTypePtr()->isBuiltinType()) {
-            QT = QT.getCanonicalType();
-            const auto* BT = clang::dyn_cast<clang::BuiltinType>(QT);
-            if (BT) {
-                clang::LangOptions LO;
-                LO.CUDA = true;
-                clang::PrintingPolicy policy(LO);
-                typeName = BT->getName(policy);
-            }
-        } else {
-            typeName = QT.getAsString();
-        }
-    }
-
-    if (!typeName.empty()) {
-        clang::SourceLocation slStart = sharedVar->getLocStart();
-        clang::SourceLocation slEnd = sharedVar->getLocEnd();
-        clang::SourceManager* SM = Result.SourceManager;
-        size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1;
-        std::string varName = sharedVar->getNameAsString();
-        std::string repName = "HIP_DYNAMIC_SHARED(" + typeName + ", " + varName + ")";
-        ct::Replacement Rep(*SM, slStart, repLength, repName);
-        clang::FullSourceLoc fullSL(slStart, *SM);
-        insertReplacement(Rep, fullSL);
-        hipCounter counter = {"HIP_DYNAMIC_SHARED", ConvTypes::CONV_MEM, ApiTypes::API_RUNTIME};
-        Statistics::current().incrementCounter(counter, refName.str());
-    }
-
-    return true;
+  if (!typeName.empty()) {
+    clang::SourceLocation slStart = sharedVar->getLocStart();
+    clang::SourceLocation slEnd = sharedVar->getLocEnd();
+    clang::SourceManager* SM = Result.SourceManager;
+    size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1;
+    std::string varName = sharedVar->getNameAsString();
+    std::string repName = "HIP_DYNAMIC_SHARED(" + typeName + ", " + varName + ")";
+    ct::Replacement Rep(*SM, slStart, repLength, repName);
+    clang::FullSourceLoc fullSL(slStart, *SM);
+    insertReplacement(Rep, fullSL);
+    hipCounter counter = {"HIP_DYNAMIC_SHARED", ConvTypes::CONV_MEM, ApiTypes::API_RUNTIME};
+    Statistics::current().incrementCounter(counter, refName.str());
+  }
+  return true;
 }

 void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL) {
-    llcompat::insertReplacement(*replacements, rep);
-    if (PrintStats) {
-        rep.getLength();
-        Statistics::current().lineTouched(fullSL.getExpansionLineNumber());
-        Statistics::current().bytesChanged(rep.getLength());
-    }
+  llcompat::insertReplacement(*replacements, rep);
+  if (PrintStats) {
+    rep.getLength();
+    Statistics::current().lineTouched(fullSL.getExpansionLineNumber());
+    Statistics::current().bytesChanged(rep.getLength());
+  }
 }

 std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::CompilerInstance& CI, llvm::StringRef) {
-    Finder.reset(new clang::ast_matchers::MatchFinder);
-
-    // Replace the <<<...>>> language extension with a hip kernel launch
-    Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind("cudaLaunchKernel"), this);
-
-    Finder->addMatcher(
-        mat::varDecl(
-            mat::isExpansionInMainFile(),
-            mat::allOf(
-                mat::hasAttr(clang::attr::CUDAShared),
-                mat::hasType(mat::incompleteArrayType())
-            )
-        ).bind("cudaSharedIncompleteArrayVar"),
-        this
-    );
-
-    // Ownership is transferred to the caller...
-    return Finder->newASTConsumer();
+  Finder.reset(new clang::ast_matchers::MatchFinder);
+  // Replace the <<<...>>> language extension with a hip kernel launch
+  Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind("cudaLaunchKernel"), this);
+  Finder->addMatcher(
+    mat::varDecl(
+      mat::isExpansionInMainFile(),
+      mat::allOf(
+        mat::hasAttr(clang::attr::CUDAShared),
+        mat::hasType(mat::incompleteArrayType())
+      )
+    ).bind("cudaSharedIncompleteArrayVar"),
+    this
+  );
+  // Ownership is transferred to the caller...
+  return Finder->newASTConsumer();
 }

 void HipifyAction::EndSourceFileAction() {
-    // Insert the hip header, if we didn't already do it by accident during substitution.
-    if (!insertedRuntimeHeader) {
-        // It's not sufficient to just replace CUDA headers with hip ones, because numerous CUDA headers are
-        // implicitly included by the compiler. Instead, we _delete_ CUDA headers, and unconditionally insert
-        // one copy of the hip include into every file.
-        clang::SourceManager& SM = getCompilerInstance().getSourceManager();
-        clang::SourceLocation sl;
-        if (pragmaOnce) {
-            sl = pragmaOnceLoc;
-        } else if (firstHeader) {
-            sl = firstHeaderLoc;
-        } else {
-            sl = SM.getLocForStartOfFile(SM.getMainFileID());
-        }
-        clang::FullSourceLoc fullSL(sl, SM);
-        ct::Replacement Rep(SM, sl, 0, "\n#include <hip/hip_runtime.h>\n");
-        insertReplacement(Rep, fullSL);
+  // Insert the hip header, if we didn't already do it by accident during substitution.
+  if (!insertedRuntimeHeader) {
+    // It's not sufficient to just replace CUDA headers with hip ones, because numerous CUDA headers are
+    // implicitly included by the compiler. Instead, we _delete_ CUDA headers, and unconditionally insert
+    // one copy of the hip include into every file.
+    clang::SourceManager& SM = getCompilerInstance().getSourceManager();
+    clang::SourceLocation sl;
+    if (pragmaOnce) {
+      sl = pragmaOnceLoc;
+    } else if (firstHeader) {
+      sl = firstHeaderLoc;
+    } else {
+      sl = SM.getLocForStartOfFile(SM.getMainFileID());
    }
-
-    clang::ASTFrontendAction::EndSourceFileAction();
+    clang::FullSourceLoc fullSL(sl, SM);
+    ct::Replacement Rep(SM, sl, 0, "\n#include <hip/hip_runtime.h>\n");
+    insertReplacement(Rep, fullSL);
+  }
+  clang::ASTFrontendAction::EndSourceFileAction();
 }

-
 namespace {

 /**
- * A silly little class to proxy PPCallbacks back to the HipifyAction class.
- */
+  * A silly little class to proxy PPCallbacks back to the HipifyAction class.
+  */
 class PPCallbackProxy : public clang::PPCallbacks {
-    HipifyAction& hipifyAction;
+  HipifyAction& hipifyAction;

 public:
-    explicit PPCallbackProxy(HipifyAction& action): hipifyAction(action) {}
+  explicit PPCallbackProxy(HipifyAction& action): hipifyAction(action) {}

-    void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token& include_token,
-                            StringRef file_name, bool is_angled, clang::CharSourceRange filename_range,
-                            const clang::FileEntry* file, StringRef search_path, StringRef relative_path,
-                            const clang::Module* imported
+  void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token& include_token,
+                          StringRef file_name, bool is_angled, clang::CharSourceRange filename_range,
+                          const clang::FileEntry* file, StringRef search_path, StringRef relative_path,
+                          const clang::Module* imported
 #if LLVM_VERSION_MAJOR > 6
-                          , clang::SrcMgr::CharacteristicKind FileType
+                        , clang::SrcMgr::CharacteristicKind FileType
 #endif
-                           ) override {
-        hipifyAction.InclusionDirective(hash_loc, include_token, file_name, is_angled, filename_range, file, search_path, relative_path, imported);
-    }
+                         ) override {
+    hipifyAction.InclusionDirective(hash_loc, include_token, file_name, is_angled, filename_range, file, search_path, relative_path, imported);
+  }

-    void PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) override {
-        hipifyAction.PragmaDirective(Loc, Introducer);
-    }
+  void PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) override {
+    hipifyAction.PragmaDirective(Loc, Introducer);
+  }
 };

 }

 void HipifyAction::ExecuteAction() {
-    clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
-    clang::SourceManager& SM = getCompilerInstance().getSourceManager();
+  clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
+  clang::SourceManager& SM = getCompilerInstance().getSourceManager();

-    // Start lexing the specified input file.
-    const llvm::MemoryBuffer* FromFile = SM.getBuffer(SM.getMainFileID());
-    clang::Lexer RawLex(SM.getMainFileID(), FromFile, SM, PP.getLangOpts());
-    RawLex.SetKeepWhitespaceMode(true);
+  // Start lexing the specified input file.
+  const llvm::MemoryBuffer* FromFile = SM.getBuffer(SM.getMainFileID());
+  clang::Lexer RawLex(SM.getMainFileID(), FromFile, SM, PP.getLangOpts());
+  RawLex.SetKeepWhitespaceMode(true);

-    // Perform a token-level rewrite of CUDA identifiers to hip ones. The raw-mode lexer gives us enough
-    // information to tell the difference between identifiers, string literals, and "other stuff". It also
-    // ignores preprocessor directives, so this transformation will operate inside preprocessor-deleted
-    // code.
-    clang::Token RawTok;
+  // Perform a token-level rewrite of CUDA identifiers to hip ones. The raw-mode lexer gives us enough
+  // information to tell the difference between identifiers, string literals, and "other stuff". It also
+  // ignores preprocessor directives, so this transformation will operate inside preprocessor-deleted code.
+  clang::Token RawTok;
+  RawLex.LexFromRawLexer(RawTok);
+  while (RawTok.isNot(clang::tok::eof)) {
+    RewriteToken(RawTok);
    RawLex.LexFromRawLexer(RawTok);
-    while (RawTok.isNot(clang::tok::eof)) {
-        RewriteToken(RawTok);
-        RawLex.LexFromRawLexer(RawTok);
-    }
+  }

-    // Register yourself as the preprocessor callback, by proxy.
-    PP.addPPCallbacks(std::unique_ptr<PPCallbackProxy>(new PPCallbackProxy(*this)));
-
-    // Now we're done futzing with the lexer, have the subclass proceeed with Sema and AST matching.
-    clang::ASTFrontendAction::ExecuteAction();
+  // Register yourself as the preprocessor callback, by proxy.
+  PP.addPPCallbacks(std::unique_ptr<PPCallbackProxy>(new PPCallbackProxy(*this)));
+  // Now we're done futzing with the lexer, have the subclass proceeed with Sema and AST matching.
+  clang::ASTFrontendAction::ExecuteAction();
 }

 void HipifyAction::run(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
-    if (cudaLaunchKernel(Result)) return;
-    if (cudaSharedIncompleteArrayVar(Result)) return;
+  if (cudaLaunchKernel(Result)) return;
+  if (cudaSharedIncompleteArrayVar(Result)) return;
 }
@@ -2,8 +2,8 @@

 #include "clang/Lex/PPCallbacks.h"
 #include "clang/Tooling/Tooling.h"
-#include "clang/Frontend/FrontendAction.h"
 #include "clang/Tooling/Core/Replacement.h"
+#include "clang/Frontend/FrontendAction.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "ReplacementsFrontendActionFactory.h"
 #include "Statistics.h"
@@ -11,91 +11,62 @@
 namespace ct = clang::tooling;

 /**
- * A FrontendAction that hipifies CUDA programs.
- */
+  * A FrontendAction that hipifies CUDA programs.
+  */
 class HipifyAction : public clang::ASTFrontendAction,
                     public clang::ast_matchers::MatchFinder::MatchCallback {
 private:
-    ct::Replacements* replacements;
-    std::unique_ptr<clang::ast_matchers::MatchFinder> Finder;
-
-    /// CUDA implicitly adds its runtime header. We rewrite explicitly-provided CUDA includes with equivalent
-    // ones, and track - using this flag - if the result led to us including the hip runtime header. If it did
-    // not, we insert it at the top of the file when we finish processing it.
-    // This approach means we do the best it's possible to do w.r.t preserving the user's include order.
-    bool insertedRuntimeHeader = false;
-    bool insertedBLASHeader = false;
-    bool insertedRANDHeader = false;
-    bool insertedRAND_kernelHeader = false;
-    bool insertedDNNHeader = false;
-    bool insertedFFTHeader = false;
-    bool insertedComplexHeader = false;
-    bool firstHeader = false;
-    bool pragmaOnce = false;
-    clang::SourceLocation firstHeaderLoc;
-    clang::SourceLocation pragmaOnceLoc;
-
-    /**
-     * Rewrite a string literal to refer to hip, not CUDA.
-     */
-    void RewriteString(StringRef s, clang::SourceLocation start);
-
-    /**
-     * Replace a CUDA identifier with the corresponding hip identifier, if applicable.
-     */
-    void RewriteToken(const clang::Token &t);
+  ct::Replacements* replacements;
+  std::unique_ptr<clang::ast_matchers::MatchFinder> Finder;
+  // CUDA implicitly adds its runtime header. We rewrite explicitly-provided CUDA includes with equivalent
+  // ones, and track - using this flag - if the result led to us including the hip runtime header. If it did
+  // not, we insert it at the top of the file when we finish processing it.
+  // This approach means we do the best it's possible to do w.r.t preserving the user's include order.
+  bool insertedRuntimeHeader = false;
+  bool insertedBLASHeader = false;
+  bool insertedRANDHeader = false;
+  bool insertedRAND_kernelHeader = false;
+  bool insertedDNNHeader = false;
+  bool insertedFFTHeader = false;
+  bool insertedComplexHeader = false;
+  bool firstHeader = false;
+  bool pragmaOnce = false;
+  clang::SourceLocation firstHeaderLoc;
+  clang::SourceLocation pragmaOnceLoc;
+  // Rewrite a string literal to refer to hip, not CUDA.
+  void RewriteString(StringRef s, clang::SourceLocation start);
+  // Replace a CUDA identifier with the corresponding hip identifier, if applicable.
+  void RewriteToken(const clang::Token &t);

 public:
-    explicit HipifyAction(ct::Replacements *replacements):
-        clang::ASTFrontendAction(),
-        replacements(replacements) {}
-
-    // MatchCallback listeners
-    bool cudaBuiltin(const clang::ast_matchers::MatchFinder::MatchResult& Result);
-    bool cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result);
-    bool cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result);
-
-    /**
-     * Called by the preprocessor for each include directive during the non-raw lexing pass.
-     */
-    void InclusionDirective(clang::SourceLocation hash_loc,
-                            const clang::Token &include_token,
-                            StringRef file_name,
-                            bool is_angled,
-                            clang::CharSourceRange filename_range,
-                            const clang::FileEntry *file,
-                            StringRef search_path,
-                            StringRef relative_path,
-                            const clang::Module *imported);
-
-    /**
-    * Called by the preprocessor for each pragma directive during the non-raw lexing pass.
-    */
-    void PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer);
+  explicit HipifyAction(ct::Replacements *replacements): clang::ASTFrontendAction(),
+    replacements(replacements) {}
+  // MatchCallback listeners
+  bool cudaBuiltin(const clang::ast_matchers::MatchFinder::MatchResult& Result);
+  bool cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result);
+  bool cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result);
+  // Called by the preprocessor for each include directive during the non-raw lexing pass.
+  void InclusionDirective(clang::SourceLocation hash_loc,
+                          const clang::Token &include_token,
+                          StringRef file_name,
+                          bool is_angled,
+                          clang::CharSourceRange filename_range,
+                          const clang::FileEntry *file,
+                          StringRef search_path,
+                          StringRef relative_path,
+                          const clang::Module *imported);
+  // Called by the preprocessor for each pragma directive during the non-raw lexing pass.
+  void PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer);

 protected:
-    /**
-     * Add a Replacement for the current file. These will all be applied after executing the FrontendAction.
-     */
-    void insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL);
-
-    /**
-     * FrontendAction entry point.
-     */
-    void ExecuteAction() override;
-
-    /**
-     * Called at the start of each new file to process.
-     */
-    void EndSourceFileAction() override;
-
-    /**
-     * MatchCallback API entry point. Called by the AST visitor while searching the AST for things we registered an
-     * interest for.
-     */
-    void run(const clang::ast_matchers::MatchFinder::MatchResult& Result) override;
-
-    std::unique_ptr<clang::ASTConsumer> CreateASTConsumer(clang::CompilerInstance &CI, llvm::StringRef InFile) override;
-
-    bool Exclude(const hipCounter & hipToken);
+  // Add a Replacement for the current file. These will all be applied after executing the FrontendAction.
+  void insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL);
+  // FrontendAction entry point.
+  void ExecuteAction() override;
+  // Called at the start of each new file to process.
+  void EndSourceFileAction() override;
+  // MatchCallback API entry point. Called by the AST visitor while searching the AST for things we registered an interest for.
+  void run(const clang::ast_matchers::MatchFinder::MatchResult& Result) override;
+  std::unique_ptr<clang::ASTConsumer> CreateASTConsumer(clang::CompilerInstance &CI, llvm::StringRef InFile) override;
+  bool Exclude(const hipCounter & hipToken);
 };
@@ -3,40 +3,40 @@
 namespace llcompat {

 void PrintStackTraceOnErrorSignal() {
-    // The signature of PrintStackTraceOnErrorSignal changed in llvm 3.9. We don't support
-    // anything older than 3.8, so let's specifically detect the one old version we support.
+  // The signature of PrintStackTraceOnErrorSignal changed in llvm 3.9. We don't support
+  // anything older than 3.8, so let's specifically detect the one old version we support.
 #if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR == 8)
-    llvm::sys::PrintStackTraceOnErrorSignal();
+  llvm::sys::PrintStackTraceOnErrorSignal();
 #else
-    llvm::sys::PrintStackTraceOnErrorSignal(clang::StringRef());
+  llvm::sys::PrintStackTraceOnErrorSignal(clang::StringRef());
 #endif
 }

 ct::Replacements& getReplacements(ct::RefactoringTool& Tool, clang::StringRef file) {
 #if LLVM_VERSION_MAJOR > 3
-    // getReplacements() now returns a map from filename to Replacements - so create an entry
-    // for this source file and return a reference to it.
-    return Tool.getReplacements()[file];
+  // getReplacements() now returns a map from filename to Replacements - so create an entry
+  // for this source file and return a reference to it.
+  return Tool.getReplacements()[file];
 #else
-    return Tool.getReplacements();
+  return Tool.getReplacements();
 #endif
 }

 void insertReplacement(ct::Replacements& replacements, const ct::Replacement& rep) {
 #if LLVM_VERSION_MAJOR > 3
-    // New clang added error checking to Replacements, and *insists* that you explicitly check it.
-    llvm::consumeError(replacements.add(rep));
+  // New clang added error checking to Replacements, and *insists* that you explicitly check it.
+  llvm::consumeError(replacements.add(rep));
 #else
-    // In older versions, it's literally an std::set<Replacement>
-    replacements.insert(rep);
+  // In older versions, it's literally an std::set<Replacement>
+  replacements.insert(rep);
 #endif
 }

 void EnterPreprocessorTokenStream(clang::Preprocessor& _pp, const clang::Token *start, size_t len, bool DisableMacroExpansion) {
 #if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR == 8)
-    _pp.EnterTokenStream(start, len, false, DisableMacroExpansion);
+  _pp.EnterTokenStream(start, len, false, DisableMacroExpansion);
 #else
-    _pp.EnterTokenStream(clang::ArrayRef<clang::Token>{start, len}, DisableMacroExpansion);
+  _pp.EnterTokenStream(clang::ArrayRef<clang::Token>{start, len}, DisableMacroExpansion);
 #endif
 }

@@ -11,40 +11,38 @@ namespace ct = clang::tooling;
 // Things for papering over the differences between different LLVM versions.

 namespace llcompat {
-
-
 /**
- * The getNumArgs function on macros was rather unhelpfully renamed in clang 4.0. Its semantics
- * remain unchanged, so let's be slightly ugly about it here. :D
- */
+  * The getNumArgs function on macros was rather unhelpfully renamed in clang 4.0. Its semantics
+  * remain unchanged, so let's be slightly ugly about it here. :D
+  */
 #if LLVM_VERSION_MAJOR > 4
-    #define GET_NUM_ARGS() getNumParams()
+  #define GET_NUM_ARGS() getNumParams()
 #else
-    #define GET_NUM_ARGS() getNumArgs()
+  #define GET_NUM_ARGS() getNumArgs()
 #endif

 #if LLVM_VERSION_MAJOR < 7
-    #define LLVM_DEBUG(X) DEBUG(X)
+  #define LLVM_DEBUG(X) DEBUG(X)
 #endif

 void PrintStackTraceOnErrorSignal();

 /**
- * Get the replacement map for a given filename in a RefactoringTool.
- *
- * Older LLVM versions don't actually support multiple filenames, so everything all gets
- * smushed together. It is the caller's responsibility to cope with this.
- */
+  * Get the replacement map for a given filename in a RefactoringTool.
+  *
+  * Older LLVM versions don't actually support multiple filenames, so everything all gets
+  * smushed together. It is the caller's responsibility to cope with this.
+  */
 ct::Replacements& getReplacements(ct::RefactoringTool& Tool, clang::StringRef file);

 /**
- * Add a Replacement to a Replacements.
- */
+  * Add a Replacement to a Replacements.
+  */
 void insertReplacement(ct::Replacements& replacements, const ct::Replacement& rep);

 /**
- * Version-agnostic version of Preprocessor::EnterTokenStream().
- */
+  * Version-agnostic version of Preprocessor::EnterTokenStream().
+  */
 void EnterPreprocessorTokenStream(clang::Preprocessor& _pp,
                                  const clang::Token *start,
                                  size_t len,
@@ -6,23 +6,22 @@

 namespace ct = clang::tooling;

-
 /**
- * A FrontendActionFactory that propagates a set of Replacements into the FrontendAction.
- * This is necessary boilerplate for using a custom FrontendAction with a RefactoringTool.
- *
- * @tparam T The FrontendAction to create.
- */
+  * A FrontendActionFactory that propagates a set of Replacements into the FrontendAction.
+  * This is necessary boilerplate for using a custom FrontendAction with a RefactoringTool.
+  *
+  * @tparam T The FrontendAction to create.
+  */
 template <typename T>
 class ReplacementsFrontendActionFactory : public ct::FrontendActionFactory {
-    ct::Replacements* replacements;
+  ct::Replacements* replacements;

 public:
-    explicit ReplacementsFrontendActionFactory(ct::Replacements* r):
-        ct::FrontendActionFactory(),
-        replacements(r) {}
+  explicit ReplacementsFrontendActionFactory(ct::Replacements* r):
+    ct::FrontendActionFactory(),
+    replacements(r) {}

-    clang::FrontendAction* create() override {
-        return new T(replacements);
-    }
+  clang::FrontendAction* create() override {
+    return new T(replacements);
+  }
 };
@@ -3,18 +3,17 @@
 #include <sstream>
 #include <iomanip>

-
 const char *counterNames[NUM_CONV_TYPES] = {
-    "version", "init", "device", "mem", "kern", "coord_func", "math_func", "device_func",
-    "special_func", "stream", "event", "occupancy", "ctx", "peer", "module",
-    "cache", "exec", "external_resource_interop", "graph", "err", "def", "tex", "gl", "graphics",
-    "surface", "jit", "d3d9", "d3d10", "d3d11", "vdpau", "egl", "complex",
-    "thread", "other", "include", "include_cuda_main_header", "type", "literal",
-    "numeric_literal"
+  "version", "init", "device", "mem", "kern", "coord_func", "math_func", "device_func",
+  "special_func", "stream", "event", "occupancy", "ctx", "peer", "module",
+  "cache", "exec", "external_resource_interop", "graph", "err", "def", "tex", "gl", "graphics",
+  "surface", "jit", "d3d9", "d3d10", "d3d11", "vdpau", "egl", "complex",
+  "thread", "other", "include", "include_cuda_main_header", "type", "literal",
+  "numeric_literal"
 };

 const char *apiNames[NUM_API_TYPES] = {
-    "CUDA Driver API", "CUDA RT API", "CUBLAS API", "CURAND API", "CUDNN API", "CUFFT API", "cuComplex API"
+  "CUDA Driver API", "CUDA RT API", "CUBLAS API", "CURAND API", "CUDNN API", "CUFFT API", "cuComplex API"
 };

 namespace {
@@ -24,203 +23,174 @@ void conditionalPrint(ST *stream1,
                      ST2* stream2,
                      const std::string& s1,
                      const std::string& s2) {
-    if (stream1) {
-        *stream1 << s1;
-    }
-
-    if (stream2) {
-        *stream2 << s2;
-    }
+  if (stream1) {
+    *stream1 << s1;
+  }
+  if (stream2) {
+    *stream2 << s2;
+  }
 }

-
-/**
- * Print a named stat value to both the terminal and the CSV file.
- */
+// Print a named stat value to both the terminal and the CSV file.
 template<typename T>
 void printStat(std::ostream *csv, llvm::raw_ostream* printOut, const std::string &name, T value) {
-    if (printOut) {
-        *printOut << "  " << name << ": " << value << "\n";
-    }
-
-    if (csv) {
-        *csv << name << ";" << value << "\n";
-    }
+  if (printOut) {
+    *printOut << "  " << name << ": " << value << "\n";
+  }
+  if (csv) {
+    *csv << name << ";" << value << "\n";
+  }
 }

-
 } // Anonymous namespace

 void StatCounter::incrementCounter(const hipCounter& counter, std::string name) {
-    counters[name]++;
-    apiCounters[(int) counter.apiType]++;
-    convTypeCounters[(int) counter.type]++;
+  counters[name]++;
+  apiCounters[(int) counter.apiType]++;
+  convTypeCounters[(int) counter.type]++;
 }

 void StatCounter::add(const StatCounter& other) {
-    for (const auto& p : other.counters) {
-        counters[p.first] += p.second;
-    }
-
-    for (int i = 0; i < NUM_API_TYPES; i++) {
-        apiCounters[i] += other.apiCounters[i];
-    }
-
-    for (int i = 0; i < NUM_CONV_TYPES; i++) {
-        convTypeCounters[i] += other.convTypeCounters[i];
-    }
+  for (const auto& p : other.counters) {
+    counters[p.first] += p.second;
+  }
+  for (int i = 0; i < NUM_API_TYPES; i++) {
+    apiCounters[i] += other.apiCounters[i];
+  }
+  for (int i = 0; i < NUM_CONV_TYPES; i++) {
+    convTypeCounters[i] += other.convTypeCounters[i];
+  }
 }

 int StatCounter::getConvSum() {
-    int acc = 0;
-    for (const int& i : convTypeCounters) {
-        acc += i;
-    }
-
-    return acc;
+  int acc = 0;
+  for (const int& i : convTypeCounters) {
+    acc += i;
+  }
+  return acc;
 }

 void StatCounter::print(std::ostream* csv, llvm::raw_ostream* printOut, std::string prefix) {
-    conditionalPrint(csv, printOut, "\nCUDA ref type;Count\n", "[HIPIFY] info: " + prefix + " refs by type:\n");
-    for (int i = 0; i < NUM_CONV_TYPES; i++) {
-        if (convTypeCounters[i] > 0) {
-            printStat(csv, printOut, counterNames[i], convTypeCounters[i]);
-        }
-    }
-
-    conditionalPrint(csv, printOut, "\nCUDA API;Count\n", "[HIPIFY] info: " + prefix + " refs by API:\n");
-    for (int i = 0; i < NUM_API_TYPES; i++) {
-        printStat(csv, printOut, apiNames[i], apiCounters[i]);
-    }
-
-    conditionalPrint(csv, printOut, "\nCUDA ref name;Count\n", "[HIPIFY] info: " + prefix + " refs by names:\n");
-    for (const auto &it : counters) {
-        printStat(csv, printOut, it.first, it.second);
+  conditionalPrint(csv, printOut, "\nCUDA ref type;Count\n", "[HIPIFY] info: " + prefix + " refs by type:\n");
+  for (int i = 0; i < NUM_CONV_TYPES; i++) {
+    if (convTypeCounters[i] > 0) {
+      printStat(csv, printOut, counterNames[i], convTypeCounters[i]);
    }
+  }
+  conditionalPrint(csv, printOut, "\nCUDA API;Count\n", "[HIPIFY] info: " + prefix + " refs by API:\n");
+  for (int i = 0; i < NUM_API_TYPES; i++) {
+    printStat(csv, printOut, apiNames[i], apiCounters[i]);
+  }
+  conditionalPrint(csv, printOut, "\nCUDA ref name;Count\n", "[HIPIFY] info: " + prefix + " refs by names:\n");
+  for (const auto &it : counters) {
+    printStat(csv, printOut, it.first, it.second);
+  }
 }

-
 Statistics::Statistics(std::string name): fileName(name) {
-    // Compute the total bytes/lines in the input file.
-    std::ifstream src_file(name, std::ios::binary | std::ios::ate);
-    src_file.clear();
-    src_file.seekg(0);
-    totalLines = (int) std::count(std::istreambuf_iterator<char>(src_file), std::istreambuf_iterator<char>(), '\n');
-    totalBytes = (int) src_file.tellg();
-
-    // Mark the start time...
-    startTime = chr::steady_clock::now();
-};
-
+  // Compute the total bytes/lines in the input file.
+  std::ifstream src_file(name, std::ios::binary | std::ios::ate);
+  src_file.clear();
+  src_file.seekg(0);
+  totalLines = (int) std::count(std::istreambuf_iterator<char>(src_file), std::istreambuf_iterator<char>(), '\n');
+  totalBytes = (int) src_file.tellg();
+  startTime = chr::steady_clock::now();
+}

 ///////// Counter update routines //////////

 void Statistics::incrementCounter(const hipCounter &counter, std::string name) {
-    if (counter.unsupported) {
-        unsupported.incrementCounter(counter, name);
-    } else {
-        supported.incrementCounter(counter, name);
-    }
+  if (counter.unsupported) {
+    unsupported.incrementCounter(counter, name);
+  } else {
+    supported.incrementCounter(counter, name);
+  }
 }

 void Statistics::add(const Statistics &other) {
-    supported.add(other.supported);
-    unsupported.add(other.unsupported);
-    totalBytes += other.totalBytes;
-    totalLines += other.totalLines;
-    touchedBytes += other.touchedBytes;
+  supported.add(other.supported);
+  unsupported.add(other.unsupported);
+  totalBytes += other.totalBytes;
+  totalLines += other.totalLines;
+  touchedBytes += other.touchedBytes;
 }

 void Statistics::lineTouched(int lineNumber) {
-    touchedLines.insert(lineNumber);
+  touchedLines.insert(lineNumber);
 }
 void Statistics::bytesChanged(int bytes) {
-    touchedBytes += bytes;
+  touchedBytes += bytes;
 }
 void Statistics::markCompletion() {
-    completionTime = chr::steady_clock::now();
+  completionTime = chr::steady_clock::now();
 }

-
 ///////// Output functions //////////

 void Statistics::print(std::ostream* csv, llvm::raw_ostream* printOut, bool skipHeader) {
-   if (!skipHeader) {
-       std::string str = "file \'" + fileName + "\' statistics:\n";
-       conditionalPrint(csv, printOut, "\n" + str, "\n[HIPIFY] info: " + str);
-   }
-
-   size_t changedLines = touchedLines.size();
-
-   // Total number of (un)supported refs that were converted.
-   int supportedSum = supported.getConvSum();
-   int unsupportedSum = unsupported.getConvSum();
-
-   printStat(csv, printOut, "CONVERTED refs count", supportedSum);
-   printStat(csv, printOut, "UNCONVERTED refs count", unsupportedSum);
-   printStat(csv, printOut, "CONVERSION %", 100 - std::lround(double(unsupportedSum * 100) / double(supportedSum + unsupportedSum)));
-   printStat(csv, printOut, "REPLACED bytes", touchedBytes);
-   printStat(csv, printOut, "TOTAL bytes", totalBytes);
-   printStat(csv, printOut, "CHANGED lines of code", changedLines);
-   printStat(csv, printOut, "TOTAL lines of code", totalLines);
-
-   if (totalBytes > 0) {
-       printStat(csv, printOut, "CODE CHANGED (in bytes) %", std::lround(double(touchedBytes * 100) / double(totalBytes)));
-   }
-
-   if (totalLines > 0) {
-       printStat(csv, printOut, "CODE CHANGED (in lines) %", std::lround(double(changedLines * 100) / double(totalLines)));
-   }
-
-   typedef std::chrono::duration<double, std::milli> duration;
-   duration elapsed = completionTime - startTime;
-   std::stringstream stream;
-   stream << std::fixed << std::setprecision(2) << elapsed.count() / 1000;
-   printStat(csv, printOut, "TIME ELAPSED s", stream.str());
-
-   supported.print(csv, printOut, "CONVERTED");
-   unsupported.print(csv, printOut, "UNCONVERTED");
+  if (!skipHeader) {
+    std::string str = "file \'" + fileName + "\' statistics:\n";
+    conditionalPrint(csv, printOut, "\n" + str, "\n[HIPIFY] info: " + str);
+  }
+  size_t changedLines = touchedLines.size();
+  // Total number of (un)supported refs that were converted.
+  int supportedSum = supported.getConvSum();
+  int unsupportedSum = unsupported.getConvSum();
+  printStat(csv, printOut, "CONVERTED refs count", supportedSum);
+  printStat(csv, printOut, "UNCONVERTED refs count", unsupportedSum);
+  printStat(csv, printOut, "CONVERSION %", 100 - std::lround(double(unsupportedSum * 100) / double(supportedSum + unsupportedSum)));
+  printStat(csv, printOut, "REPLACED bytes", touchedBytes);
+  printStat(csv, printOut, "TOTAL bytes", totalBytes);
+  printStat(csv, printOut, "CHANGED lines of code", changedLines);
+  printStat(csv, printOut, "TOTAL lines of code", totalLines);
+  if (totalBytes > 0) {
+    printStat(csv, printOut, "CODE CHANGED (in bytes) %", std::lround(double(touchedBytes * 100) / double(totalBytes)));
+  }
+  if (totalLines > 0) {
+    printStat(csv, printOut, "CODE CHANGED (in lines) %", std::lround(double(changedLines * 100) / double(totalLines)));
+  }
+  typedef std::chrono::duration<double, std::milli> duration;
+  duration elapsed = completionTime - startTime;
+  std::stringstream stream;
+  stream << std::fixed << std::setprecision(2) << elapsed.count() / 1000;
+  printStat(csv, printOut, "TIME ELAPSED s", stream.str());
+  supported.print(csv, printOut, "CONVERTED");
+  unsupported.print(csv, printOut, "UNCONVERTED");
 }

 void Statistics::printAggregate(std::ostream *csv, llvm::raw_ostream* printOut) {
-    Statistics globalStats = getAggregate();
-
-    conditionalPrint(csv, printOut, "\nTOTAL statistics:\n", "\n[HIPIFY] info: TOTAL statistics:\n");
-
-    // A file is considered "converted" if we made any changes to it.
-    int convertedFiles = 0;
-    for (const auto& p : stats) {
-        if (!p.second.touchedLines.empty()) {
-            convertedFiles++;
-        }
+  Statistics globalStats = getAggregate();
+  conditionalPrint(csv, printOut, "\nTOTAL statistics:\n", "\n[HIPIFY] info: TOTAL statistics:\n");
+  // A file is considered "converted" if we made any changes to it.
+  int convertedFiles = 0;
+  for (const auto& p : stats) {
+    if (!p.second.touchedLines.empty()) {
+      convertedFiles++;
    }
-
-    printStat(csv, printOut, "CONVERTED files", convertedFiles);
-    printStat(csv, printOut, "PROCESSED files", stats.size());
-
-    globalStats.print(csv, printOut);
+  }
+  printStat(csv, printOut, "CONVERTED files", convertedFiles);
+  printStat(csv, printOut, "PROCESSED files", stats.size());
+  globalStats.print(csv, printOut);
 }

 //// Static state management ////

 Statistics Statistics::getAggregate() {
-    Statistics globalStats("global");
-
-    for (const auto& p : stats) {
-        globalStats.add(p.second);
-    }
-
-    return globalStats;
+  Statistics globalStats("global");
+  for (const auto& p : stats) {
+    globalStats.add(p.second);
+  }
+  return globalStats;
 }

 Statistics& Statistics::current() {
-    assert(Statistics::currentStatistics);
-    return *Statistics::currentStatistics;
+  assert(Statistics::currentStatistics);
+  return *Statistics::currentStatistics;
 }

 void Statistics::setActive(std::string name) {
-    stats.emplace(std::make_pair(name, Statistics{name}));
-    Statistics::currentStatistics = &stats.at(name);
+  stats.emplace(std::make_pair(name, Statistics{name}));
+  Statistics::currentStatistics = &stats.at(name);
 }

 std::map<std::string, Statistics> Statistics::stats = {};
@@ -3,66 +3,66 @@
 #include <chrono>
 #include <string>
 #include <fstream>
-#include <llvm/ADT/StringRef.h>
 #include <map>
 #include <set>
+#include <llvm/ADT/StringRef.h>
 #include <llvm/Support/raw_ostream.h>

 namespace chr = std::chrono;

 enum ConvTypes {
-    CONV_VERSION = 0,
-    CONV_INIT,
-    CONV_DEVICE,
-    CONV_MEM,
-    CONV_KERN,
-    CONV_COORD_FUNC,
-    CONV_MATH_FUNC,
-    CONV_DEVICE_FUNC,
-    CONV_SPECIAL_FUNC,
-    CONV_STREAM,
-    CONV_EVENT,
-    CONV_OCCUPANCY,
-    CONV_CONTEXT,
-    CONV_PEER,
-    CONV_MODULE,
-    CONV_CACHE,
-    CONV_EXEC,
-    CONV_EXTERNAL_RES,
-    CONV_GRAPH,
-    CONV_ERROR,
-    CONV_DEF,
-    CONV_TEX,
-    CONV_GL,
-    CONV_GRAPHICS,
-    CONV_SURFACE,
-    CONV_JIT,
-    CONV_D3D9,
-    CONV_D3D10,
-    CONV_D3D11,
-    CONV_VDPAU,
-    CONV_EGL,
-    CONV_COMPLEX,
-    CONV_THREAD,
-    CONV_OTHER,
-    CONV_INCLUDE,
-    CONV_INCLUDE_CUDA_MAIN_H,
-    CONV_TYPE,
-    CONV_LITERAL,
-    CONV_NUMERIC_LITERAL,
-    CONV_LAST
+  CONV_VERSION = 0,
+  CONV_INIT,
+  CONV_DEVICE,
+  CONV_MEM,
+  CONV_KERN,
+  CONV_COORD_FUNC,
+  CONV_MATH_FUNC,
+  CONV_DEVICE_FUNC,
+  CONV_SPECIAL_FUNC,
+  CONV_STREAM,
+  CONV_EVENT,
+  CONV_OCCUPANCY,
+  CONV_CONTEXT,
+  CONV_PEER,
+  CONV_MODULE,
+  CONV_CACHE,
+  CONV_EXEC,
+  CONV_EXTERNAL_RES,
+  CONV_GRAPH,
+  CONV_ERROR,
+  CONV_DEF,
+  CONV_TEX,
+  CONV_GL,
+  CONV_GRAPHICS,
+  CONV_SURFACE,
+  CONV_JIT,
+  CONV_D3D9,
+  CONV_D3D10,
+  CONV_D3D11,
+  CONV_VDPAU,
+  CONV_EGL,
+  CONV_COMPLEX,
+  CONV_THREAD,
+  CONV_OTHER,
+  CONV_INCLUDE,
+  CONV_INCLUDE_CUDA_MAIN_H,
+  CONV_TYPE,
+  CONV_LITERAL,
+  CONV_NUMERIC_LITERAL,
+  CONV_LAST
 };
 constexpr int NUM_CONV_TYPES = (int) ConvTypes::CONV_LAST;

 enum ApiTypes {
-    API_DRIVER = 0,
-    API_RUNTIME,
-    API_BLAS,
-    API_RAND,
-    API_DNN,
-    API_FFT,
-    API_COMPLEX,
-    API_LAST
+  API_DRIVER = 0,
+  API_RUNTIME,
+  API_BLAS,
+  API_RAND,
+  API_DNN,
+  API_FFT,
+  API_COMPLEX,
+  API_LAST
 };
 constexpr int NUM_API_TYPES = (int) ApiTypes::API_LAST;

@@ -70,113 +70,81 @@ constexpr int NUM_API_TYPES = (int) ApiTypes::API_LAST;
 extern const char *counterNames[NUM_CONV_TYPES];
 extern const char *apiNames[NUM_API_TYPES];

-
 struct hipCounter {
-    llvm::StringRef hipName;
-    ConvTypes type;
-    ApiTypes apiType;
-    bool unsupported;
+  llvm::StringRef hipName;
+  ConvTypes type;
+  ApiTypes apiType;
+  bool unsupported;
 };

-
 /**
- * Tracks a set of named counters, as well as counters for each of the type enums defined above.
- */
+  * Tracks a set of named counters, as well as counters for each of the type enums defined above.
+  */
 class StatCounter {
 private:
-    // Each thing we track is either "supported" or "unsupported"...
-    std::map<std::string, int> counters;
-
-    int apiCounters[NUM_API_TYPES] = {};
-    int convTypeCounters[NUM_CONV_TYPES] = {};
+  // Each thing we track is either "supported" or "unsupported"...
+  std::map<std::string, int> counters;
+  int apiCounters[NUM_API_TYPES] = {};
+  int convTypeCounters[NUM_CONV_TYPES] = {};

 public:
-    void incrementCounter(const hipCounter& counter, std::string name);
-
-    /**
-     * Add the counters from `other` onto the counters of this object.
-     */
-    void add(const StatCounter& other);
-
-    int getConvSum();
-
-    void print(std::ostream* csv, llvm::raw_ostream* printOut, std::string prefix);
+  void incrementCounter(const hipCounter& counter, std::string name);
+  // Add the counters from `other` onto the counters of this object.
+  void add(const StatCounter& other);
+  int getConvSum();
+  void print(std::ostream* csv, llvm::raw_ostream* printOut, std::string prefix);
 };

 /**
- * Tracks the statistics for a single input file.
- */
+  * Tracks the statistics for a single input file.
+  */
 class Statistics {
-    StatCounter supported;
-    StatCounter unsupported;
-
-    std::string fileName;
-
-    std::set<int> touchedLines = {};
-    int touchedBytes = 0;
-
-    int totalLines = 0;
-    int totalBytes = 0;
-
-    chr::steady_clock::time_point startTime;
-    chr::steady_clock::time_point completionTime;
+  StatCounter supported;
+  StatCounter unsupported;
+  std::string fileName;
+  std::set<int> touchedLines = {};
+  int touchedBytes = 0;
+  int totalLines = 0;
+  int totalBytes = 0;
+  chr::steady_clock::time_point startTime;
+  chr::steady_clock::time_point completionTime;

 public:
-    Statistics(std::string name);
-
-    void incrementCounter(const hipCounter &counter, std::string name);
-
-    /**
-     * Add the counters from `other` onto the counters of this object.
-     */
-    void add(const Statistics &other);
-
-    void lineTouched(int lineNumber);
-    void bytesChanged(int bytes);
-
-    /**
-     * Set the completion timestamp to now.
-     */
-    void markCompletion();
-
-    /////// Output functions ///////
+  Statistics(std::string name);
+  void incrementCounter(const hipCounter &counter, std::string name);
+  // Add the counters from `other` onto the counters of this object.
+  void add(const Statistics &other);
+  void lineTouched(int lineNumber);
+  void bytesChanged(int bytes);
+  // Set the completion timestamp to now.
+  void markCompletion();

 public:
-   /**
-     * Pretty-print the statistics stored in this object.
-     *
-     * @param csv Pointer to an output stream for the CSV to write. If null, no CSV is written
-     * @param printOut Pointer to an output stream to print human-readable textual stats to. If null, no
-     *                 such stats are produced.
-     */
-    void print(std::ostream* csv, llvm::raw_ostream* printOut, bool skipHeader = false);
-
-    /// Print aggregated statistics for all registered counters.
-    static void printAggregate(std::ostream *csv, llvm::raw_ostream* printOut);
-
-    /////// Static nonsense ///////
-
-    // The Statistics for each input file.
-    static std::map<std::string, Statistics> stats;
-
-    // The Statistics objects for the currently-being-processed input file.
-    static Statistics* currentStatistics;
-
-    /**
-     * Aggregate statistics over all entries in `stats` and return the resulting Statistics object.
-     */
-    static Statistics getAggregate();
-
-    /**
-     * Convenient global entry point for updating the "active" Statistics. Since we operate single-threadedly
-     * processing one file at a time, this allows us to simply expose the stats for the current file globally,
-     * simplifying things.
-     */
-    static Statistics& current();
-
-    /**
-     * Set the active Statistics object to the named one, creating it if necessary, and write the completion
-     * timestamp into the currently active one.
-     */
-    static void setActive(std::string name);
+  /**
+    * Pretty-print the statistics stored in this object.
+    *
+    * @param csv Pointer to an output stream for the CSV to write. If null, no CSV is written
+    * @param printOut Pointer to an output stream to print human-readable textual stats to. If null, no
+    *                 such stats are produced.
+    */
+  void print(std::ostream* csv, llvm::raw_ostream* printOut, bool skipHeader = false);
+  // Print aggregated statistics for all registered counters.
+  static void printAggregate(std::ostream *csv, llvm::raw_ostream* printOut);
+  // The Statistics for each input file.
+  static std::map<std::string, Statistics> stats;
+  // The Statistics objects for the currently-being-processed input file.
+  static Statistics* currentStatistics;
+  // Aggregate statistics over all entries in `stats` and return the resulting Statistics object.
+  static Statistics getAggregate();
+  /**
+    * Convenient global entry point for updating the "active" Statistics. Since we operate single-threadedly
+    * processing one file at a time, this allows us to simply expose the stats for the current file globally,
+    * simplifying things.
+    */
+  static Statistics& current();
+  /**
+    * Set the active Statistics object to the named one, creating it if necessary, and write the completion
+    * timestamp into the currently active one.
+    */
+  static void setActive(std::string name);
 };
@@ -1,17 +1,15 @@
 #include "StringUtils.h"

 llvm::StringRef unquoteStr(llvm::StringRef s) {
-    if (s.size() > 1 && s.front() == '"' && s.back() == '"') {
-        return s.substr(1, s.size() - 2);
-    }
-
-    return s;
+  if (s.size() > 1 && s.front() == '"' && s.back() == '"') {
+    return s.substr(1, s.size() - 2);
+  }
+  return s;
 }

 void removePrefixIfPresent(std::string &s, std::string prefix) {
-    if (s.find(prefix) != 0) {
-        return;
-    }
-
-    s.erase(0, prefix.size());
+  if (s.find(prefix) != 0) {
+    return;
+  }
+  s.erase(0, prefix.size());
 }
@@ -4,11 +4,11 @@
 #include "llvm/ADT/StringRef.h"

 /**
- * Remove double-quotes from the start/end of a string, if present.
- */
+  * Remove double-quotes from the start/end of a string, if present.
+  */
 llvm::StringRef unquoteStr(llvm::StringRef s);

 /**
- * If `s` starts with `prefix`, remove it. Otherwise, does nothing.
- */
+  * If `s` starts with `prefix`, remove it. Otherwise, does nothing.
+  */
 void removePrefixIfPresent(std::string &s, std::string prefix);
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2018 Advanced Micro Devices, Inc. All rights reserved.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -20,10 +20,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 /**
- * @file Cuda2Hip.cpp
- *
- * This file is compiled and linked into clang based hipify tool.
- */
+  * @file Cuda2Hip.cpp
+  *
+  * This file is compiled and linked into clang based hipify tool.
+  */
+
 #include <cstdio>
 #include <fstream>
 #include <set>
@@ -31,7 +32,6 @@ THE SOFTWARE.
 #include <chrono>
 #include <iomanip>
 #include <sstream>
-
 #include "CUDA2HipMap.h"
 #include "LLVMCompat.h"
 #include "HipifyAction.h"
@@ -42,7 +42,6 @@ THE SOFTWARE.

 namespace ct = clang::tooling;

-
 namespace {

 void copyFile(const std::string& src, const std::string& dst) {
@@ -55,7 +54,6 @@ void copyFile(const std::string& src, const std::string& dst) {

 int main(int argc, const char **argv) {
  llcompat::PrintStackTraceOnErrorSignal();
-
  ct::CommonOptionsParser OptionsParser(argc, argv, ToolTemplateCategory, llvm::cl::OneOrMore);
  std::vector<std::string> fileSources = OptionsParser.getSourcePathList();
  std::string dst = OutputFilename;
@@ -63,7 +61,6 @@ int main(int argc, const char **argv) {
    llvm::errs() << "[HIPIFY] conflict: -o and multiple source files are specified.\n";
    return 1;
  }
-
  if (NoOutput) {
    if (Inplace) {
      llvm::errs() << "[HIPIFY] conflict: both -no-output and -inplace options are specified.\n";
@@ -74,13 +71,10 @@ int main(int argc, const char **argv) {
      return 1;
    }
  }
-
  if (Examine) {
    NoOutput = PrintStats = true;
  }
-
  int Result = 0;
-
  // Arguments for the Statistics print routines.
  std::unique_ptr<std::ostream> csv = nullptr;
  llvm::raw_ostream* statPrint = nullptr;
@@ -90,7 +84,6 @@ int main(int argc, const char **argv) {
  if (PrintStats) {
    statPrint = &llvm::errs();
  }
-
  for (const auto & src : fileSources) {
    if (dst.empty()) {
      if (Inplace) {
@@ -102,55 +95,42 @@ int main(int argc, const char **argv) {
      llvm::errs() << "[HIPIFY] conflict: both -o and -inplace options are specified.\n";
      return 1;
    }
-
    std::string tmpFile = src + ".hipify-tmp";
-
    // Create a copy of the file to work on. When we're done, we'll move this onto the
    // output (which may mean overwriting the input, if we're in-place).
    // Should we fail for some reason, we'll just leak this file and not corrupt the input.
    copyFile(src, tmpFile);
-
    // Initialise the statistics counters for this file.
    Statistics::setActive(src);
-
    // RefactoringTool operates on the file in-place. Giving it the output path is no good,
    // because that'll break relative includes, and we don't want to overwrite the input file.
    // So what we do is operate on a copy, which we then move to the output.
    ct::RefactoringTool Tool(OptionsParser.getCompilations(), tmpFile);
    ct::Replacements& replacementsToUse = llcompat::getReplacements(Tool, tmpFile);
-
    ReplacementsFrontendActionFactory<HipifyAction> actionFactory(&replacementsToUse);
-
    Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("--cuda-host-only", ct::ArgumentInsertPosition::BEGIN));
-
    // Ensure at least c++11 is used.
    Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-std=c++11", ct::ArgumentInsertPosition::BEGIN));
 #if defined(HIPIFY_CLANG_RES)
    Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-resource-dir=" HIPIFY_CLANG_RES));
 #endif
    Tool.appendArgumentsAdjuster(ct::getClangSyntaxOnlyAdjuster());
-
    // Hipify _all_ the things!
    if (Tool.runAndSave(&actionFactory)) {
      LLVM_DEBUG(llvm::dbgs() << "Skipped some replacements.\n");
    }
-
    // Either move the tmpfile to the output, or remove it.
    if (!NoOutput) {
      rename(tmpFile.c_str(), dst.c_str());
    } else {
      remove(tmpFile.c_str());
    }
-
    Statistics::current().markCompletion();
    Statistics::current().print(csv.get(), statPrint);
-
    dst.clear();
  }
-
  if (fileSources.size() > 1) {
    Statistics::printAggregate(csv.get(), statPrint);
  }
-
  return Result;
 }
@@ -752,80 +752,6 @@ void *__amdgcn_get_dynamicgroupbaseptr() {
    return __get_dynamicgroupbaseptr();
 }

-
-
-// hip.amdgcn.bc - sync threads
-#define __CLK_LOCAL_MEM_FENCE    0x01
-typedef unsigned __cl_mem_fence_flags;
-
-typedef enum __memory_scope {
-  __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
-  __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
-  __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
-  __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
-  __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
-} __memory_scope;
-
-// enum values aligned with what clang uses in EmitAtomicExpr()
-typedef enum __memory_order
-{
-  __memory_order_relaxed = __ATOMIC_RELAXED,
-  __memory_order_acquire = __ATOMIC_ACQUIRE,
-  __memory_order_release = __ATOMIC_RELEASE,
-  __memory_order_acq_rel = __ATOMIC_ACQ_REL,
-  __memory_order_seq_cst = __ATOMIC_SEQ_CST
-} __memory_order;
-
-__device__
-inline
-static void
-__atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope)
-{
-    // We're tying global-happens-before and local-happens-before together as does HSA
-    if (order != __memory_order_relaxed) {
-        switch (scope) {
-        case __memory_scope_work_item:
-            break;
-        case __memory_scope_sub_group:
-            switch (order) {
-            case __memory_order_relaxed: break;
-            case __memory_order_acquire: __llvm_fence_acq_sg(); break;
-            case __memory_order_release: __llvm_fence_rel_sg(); break;
-            case __memory_order_acq_rel: __llvm_fence_ar_sg(); break;
-            case __memory_order_seq_cst: __llvm_fence_sc_sg(); break;
-            }
-            break;
-        case __memory_scope_work_group:
-            switch (order) {
-            case __memory_order_relaxed: break;
-            case __memory_order_acquire: __llvm_fence_acq_wg(); break;
-            case __memory_order_release: __llvm_fence_rel_wg(); break;
-            case __memory_order_acq_rel: __llvm_fence_ar_wg(); break;
-            case __memory_order_seq_cst: __llvm_fence_sc_wg(); break;
-            }
-            break;
-        case __memory_scope_device:
-            switch (order) {
-            case __memory_order_relaxed: break;
-            case __memory_order_acquire: __llvm_fence_acq_dev(); break;
-            case __memory_order_release: __llvm_fence_rel_dev(); break;
-            case __memory_order_acq_rel: __llvm_fence_ar_dev(); break;
-            case __memory_order_seq_cst: __llvm_fence_sc_dev(); break;
-            }
-            break;
-        case __memory_scope_all_svm_devices:
-            switch (order) {
-            case __memory_order_relaxed: break;
-            case __memory_order_acquire: __llvm_fence_acq_sys(); break;
-            case __memory_order_release: __llvm_fence_rel_sys(); break;
-            case __memory_order_acq_rel: __llvm_fence_ar_sys(); break;
-            case __memory_order_seq_cst: __llvm_fence_sc_sys(); break;
-            }
-            break;
-        }
-    }
-}
-
 // Memory Fence Functions
 __device__
 inline
@@ -65,26 +65,30 @@ extern "C" __device__ __attribute__((const)) float __ocml_fmax_f32(float, float)
 __device__ inline static __local void* __to_local(unsigned x) { return (__local void*)x; }
 #endif //__HIP_DEVICE_COMPILE__

-// __llvm_fence* functions from device-libs/irif/src/fence.ll
-extern "C" __device__ void __llvm_fence_acq_sg(void);
-extern "C" __device__ void __llvm_fence_acq_wg(void);
-extern "C" __device__ void __llvm_fence_acq_dev(void);
-extern "C" __device__ void __llvm_fence_acq_sys(void);
+// Using hip.amdgcn.bc - sync threads
+#define __CLK_LOCAL_MEM_FENCE    0x01
+typedef unsigned __cl_mem_fence_flags;

-extern "C" __device__ void __llvm_fence_rel_sg(void);
-extern "C" __device__ void __llvm_fence_rel_wg(void);
-extern "C" __device__ void __llvm_fence_rel_dev(void);
-extern "C" __device__ void __llvm_fence_rel_sys(void);
+typedef enum __memory_scope {
+  __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
+  __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
+  __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
+  __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
+  __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
+} __memory_scope;

-extern "C" __device__ void __llvm_fence_ar_sg(void);
-extern "C" __device__ void __llvm_fence_ar_wg(void);
-extern "C" __device__ void __llvm_fence_ar_dev(void);
-extern "C" __device__ void __llvm_fence_ar_sys(void);
+// enum values aligned with what clang uses in EmitAtomicExpr()
+typedef enum __memory_order
+{
+  __memory_order_relaxed = __ATOMIC_RELAXED,
+  __memory_order_acquire = __ATOMIC_ACQUIRE,
+  __memory_order_release = __ATOMIC_RELEASE,
+  __memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  __memory_order_seq_cst = __ATOMIC_SEQ_CST
+} __memory_order;

-
-extern "C" __device__ void __llvm_fence_sc_sg(void);
-extern "C" __device__ void __llvm_fence_sc_wg(void);
-extern "C" __device__ void __llvm_fence_sc_dev(void);
-extern "C" __device__ void __llvm_fence_sc_sys(void);
+// Linked from hip.amdgcn.bc
+extern "C" __device__ void
+__atomic_work_item_fence(__cl_mem_fence_flags, __memory_order, __memory_scope);

 #endif
@@ -590,8 +590,6 @@ const char* hipGetErrorString(hipError_t hipError);
 *
 *  The following Stream APIs are not (yet) supported in HIP:
 *  - cudaStreamAttachMemAsync
- *  - cudaStreamCreateWithPriority
- *  - cudaStreamGetPriority
 */


@@ -609,7 +607,7 @@ const char* hipGetErrorString(hipError_t hipError);
 *
 * @return #hipSuccess, #hipErrorInvalidValue
 *
- * @see hipStreamCreateWithFlags, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
+ * @see hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
 */
 hipError_t hipStreamCreate(hipStream_t* stream);

@@ -628,12 +626,50 @@ hipError_t hipStreamCreate(hipStream_t* stream);
 * stream.  See #hipStreamDefault, #hipStreamNonBlocking.
 *
 *
- * @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
+ * @see hipStreamCreate, hipStreamCreateWithPriority, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
 */

 hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags);


+/**
+ * @brief Create an asynchronous stream with the specified priority.
+ *
+ * @param[in, out] stream Pointer to new stream
+ * @param[in ] flags to control stream creation.
+ * @param[in ] priority of the stream. Lower numbers represent higher priorities.
+ * @return #hipSuccess, #hipErrorInvalidValue
+ *
+ * Create a new asynchronous stream with the specified priority.  @p stream returns an opaque handle
+ * that can be used to reference the newly created stream in subsequent hipStream* commands.  The
+ * stream is allocated on the heap and will remain allocated even if the handle goes out-of-scope.
+ * To release the memory used by the stream, applicaiton must call hipStreamDestroy. Flags controls
+ * behavior of the stream.  See #hipStreamDefault, #hipStreamNonBlocking.
+ *
+ *
+ * @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
+ */
+
+hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority);
+
+
+/**
+ * @brief Returns numerical values that correspond to the least and greatest stream priority.
+ *
+ * @param[in, out] leastPriority pointer in which value corresponding to least priority is returned.
+ * @param[in, out] greatestPriority pointer in which value corresponding to greatest priority is returned.
+ *
+ * Returns in *leastPriority and *greatestPriority the numerical values that correspond to the least
+ * and greatest stream priority respectively. Stream priorities follow a convention where lower numbers
+ * imply greater priorities. The range of meaningful stream priorities is given by
+ * [*greatestPriority, *leastPriority]. If the user attempts to create a stream with a priority value
+ * that is outside the the meaningful range as specified by this API, the priority is automatically
+ * clamped to within the valid range.
+ */
+
+hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority);
+
+
 /**
 * @brief Destroys the specified stream.
 *
@@ -649,7 +685,7 @@ hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags);
 * The queue may be destroyed while some commands are still inflight, or may wait for all commands
 * queued to the stream before destroying it.
 *
- * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamQuery, hipStreamWaitEvent,
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamQuery, hipStreamWaitEvent,
 * hipStreamSynchronize
 */
 hipError_t hipStreamDestroy(hipStream_t stream);
@@ -667,7 +703,7 @@ hipError_t hipStreamDestroy(hipStream_t stream);
 * host threads are sending work to the stream, the status may change immediately after the function
 * is called.  It is typically used for debug.
 *
- * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamWaitEvent, hipStreamSynchronize,
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamWaitEvent, hipStreamSynchronize,
 * hipStreamDestroy
 */
 hipError_t hipStreamQuery(hipStream_t stream);
@@ -689,7 +725,7 @@ hipError_t hipStreamQuery(hipStream_t stream);
 * This command honors the hipDeviceLaunchBlocking flag, which controls whether the wait is active
 * or blocking.
 *
- * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamWaitEvent, hipStreamDestroy
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamWaitEvent, hipStreamDestroy
 *
 */
 hipError_t hipStreamSynchronize(hipStream_t stream);
@@ -712,7 +748,7 @@ hipError_t hipStreamSynchronize(hipStream_t stream);
 * does not impliciy wait for commands in the default stream to complete, even if the specified
 * stream is created with hipStreamNonBlocking = 0.
 *
- * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamSynchronize, hipStreamDestroy
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamSynchronize, hipStreamDestroy
 */
 hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags);

@@ -732,6 +768,23 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int
 */
 hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int* flags);

+
+/**
+ * @brief Query the priority of a stream.
+ *
+ * @param[in] stream stream to be queried
+ * @param[in,out] priority Pointer to an unsigned integer in which the stream's priority is returned
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidResourceHandle
+ *
+ * @returns #hipSuccess #hipErrorInvalidValue #hipErrorInvalidResourceHandle
+ *
+ * Query the priority of a stream. The priority is returned in in priority.
+ *
+ * @see hipStreamCreateWithFlags
+ */
+hipError_t hipStreamGetPriority(hipStream_t stream, int* priority);
+
+
 /**
 * Stream CallBack struct
 */
@@ -749,7 +802,7 @@ typedef void (*hipStreamCallback_t)(hipStream_t stream, hipError_t status, void*
 * @return #hipSuccess, #hipErrorInvalidResourceHandle, #hipErrorNotSupported
 *
 * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamQuery, hipStreamSynchronize,
- * hipStreamWaitEvent, hipStreamDestroy
+ * hipStreamWaitEvent, hipStreamDestroy, hipStreamCreateWithPriority
 *
 */
 hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData,
@@ -93,11 +93,12 @@ public:
    }
 };

-const std::unordered_map<hsa_agent_t, std::vector<hsa_executable_t>>& executables();
+const std::unordered_map<hsa_agent_t, std::vector<hsa_executable_t>>& executables(
+    bool rebuild = false);
 const std::unordered_map<std::uintptr_t, std::vector<std::pair<hsa_agent_t, Kernel_descriptor>>>&
-functions();
-const std::unordered_map<std::uintptr_t, std::string>& function_names();
-std::unordered_map<std::string, void*>& globals();
+functions(bool rebuild = false);
+const std::unordered_map<std::uintptr_t, std::string>& function_names(bool rebuild = false);
+std::unordered_map<std::string, void*>& globals(bool rebuild = false);

 hsa_executable_t load_executable(const std::string& file, hsa_executable_t executable,
                                 hsa_agent_t agent);
@@ -105,8 +105,10 @@ typedef enum hipChannelFormatKind {
 #define hipHostMallocCoherent 0x0
 #define hipHostMallocNonCoherent 0x0

+#define hipHostRegisterDefault cudaHostRegisterDefault
 #define hipHostRegisterPortable cudaHostRegisterPortable
 #define hipHostRegisterMapped cudaHostRegisterMapped
+#define hipHostRegisterIoMemory cudaHostRegisterIoMemory

 #define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER
 #define HIP_LAUNCH_PARAM_BUFFER_SIZE CU_LAUNCH_PARAM_BUFFER_SIZE
@@ -890,6 +892,13 @@ inline static hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned
    return hipCUDAErrorTohipError(cudaStreamCreateWithFlags(stream, flags));
 }

+inline static hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority) {
+    return hipCUDAErrorTohipError(cudaStreamCreateWithPriority(stream, flags, priority));
+}
+
+inline static hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority) {
+    return hipCUDAErrorTohipError(cudaDeviceGetStreamPriorityRange(leastPriority, greatestPriority));
+}

 inline static hipError_t hipStreamCreate(hipStream_t* stream) {
    return hipCUDAErrorTohipError(cudaStreamCreate(stream));
@@ -903,6 +912,13 @@ inline static hipError_t hipStreamDestroy(hipStream_t stream) {
    return hipCUDAErrorTohipError(cudaStreamDestroy(stream));
 }

+inline static hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int *flags) {
+    return hipCUDAErrorTohipError(cudaStreamGetFlags(stream, flags));
+}
+
+inline static hipError_t hipStreamGetPriority(hipStream_t stream, int *priority) {
+    return hipCUDAErrorTohipError(cudaStreamGetPriority(stream, priority));
+}

 inline static hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event,
                                            unsigned int flags) {
@@ -38,7 +38,7 @@ THE SOFTWARE.
        }                                                                                          \
    }

-__global__ void bit_extract_kernel(hipLaunchParm lp, uint32_t* C_d, const uint32_t* A_d, size_t N) {
+__global__ void bit_extract_kernel(uint32_t* C_d, const uint32_t* A_d, size_t N) {
    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
    size_t stride = hipBlockDim_x * hipGridDim_x;

@@ -85,7 +85,7 @@ int main(int argc, char* argv[]) {
    printf("info: launch 'bit_extract_kernel' \n");
    const unsigned blocks = 512;
    const unsigned threadsPerBlock = 256;
-    hipLaunchKernel(bit_extract_kernel, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
+    hipLaunchKernelGGL(bit_extract_kernel, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);

    printf("info: copy Device2Host\n");
    CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
@@ -22,7 +22,7 @@ THE SOFTWARE.

 #include "hip/hip_runtime.h"

-__global__ void vadd_hip(hipLaunchParm lp, const float* a, const float* b, float* c, int N) {
+__global__ void vadd_hip(const float* a, const float* b, float* c, int N) {
    int idx = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);

    if (idx < N) {
@@ -60,7 +60,7 @@ int main(int argc, char* argv[]) {
    // Launch kernel onto default accelerator
    int blockSize = 256;                                      // pick arbitrary block size
    int blocks = (sizeElements + blockSize - 1) / blockSize;  // round up to launch enough blocks
-    hipLaunchKernel(vadd_hip, dim3(blocks), dim3(blockSize), 0, 0, A_d, B_d, C_d, sizeElements);
+    hipLaunchKernelGGL(vadd_hip, dim3(blocks), dim3(blockSize), 0, 0, A_d, B_d, C_d, sizeElements);

    // D2H Copy
    hipMemcpy(C_h, C_d, sizeBytes, hipMemcpyDeviceToHost);
@@ -37,7 +37,7 @@ THE SOFTWARE.
 * Square each element in the array A and write to array C.
 */
 template <typename T>
-__global__ void vector_square(hipLaunchParm lp, T* C_d, const T* A_d, size_t N) {
+__global__ void vector_square(T* C_d, const T* A_d, size_t N) {
    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
    size_t stride = hipBlockDim_x * hipGridDim_x;

@@ -81,7 +81,7 @@ int main(int argc, char* argv[]) {
    const unsigned threadsPerBlock = 256;

    printf("info: launch 'vector_square' kernel\n");
-    hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
+    hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);

    printf("info: copy Device2Host\n");
    CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
@@ -434,7 +434,7 @@ class KernelCommand : public Command {

        switch (_kind) {
            case Null:
-                hipLaunchKernel(NullKernel, dim3(gridX / groupX), dim3(gridX), 0, _stream, nullptr);
+                hipLaunchKernelGGL(NullKernel, dim3(gridX / groupX), dim3(gridX), 0, _stream, nullptr);
                break;
            case VectorAdd:
                assert(0);  // TODO
@@ -1,6 +1,6 @@
 #include "hip/hip_runtime.h"

-extern "C" __global__ void NullKernel(hipLaunchParm lp, float* Ad) {
+extern "C" __global__ void NullKernel(float* Ad) {
    if (Ad) {
        Ad[0] = 42;
    }
@@ -3,7 +3,7 @@
 static const int BLOCKSIZEX = 32;
 static const int BLOCKSIZEY = 16;

-__global__ void fails(hipLaunchParm lp, float* pErrorI) {
+__global__ void fails(float* pErrorI) {
    if (pErrorI != 0) {
        pErrorI[0] = 1;
    }
@@ -14,5 +14,5 @@ int main() {
    dim3 threads(BLOCKSIZEX, BLOCKSIZEY);
    float error;

-    hipLaunchKernel(HIP_KERNEL_NAME(fails), blocks, threads, 0, 0, &error);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(fails), blocks, threads, 0, 0, &error);
 }
@@ -48,7 +48,7 @@ const unsigned p_tests = 0xfffffff;


 // HCC optimizes away fully NULL kernel calls, so run one that is nearly null:
-__global__ void NearlyNull(hipLaunchParm lp, float* Ad) {
+__global__ void NearlyNull(float* Ad) {
    if (Ad) {
        Ad[0] = 42;
    }
@@ -94,14 +94,14 @@ int main() {

    if (p_tests & 0x1) {
        hipEventRecord(start);
-        hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+        hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
        stopTest(start, stop, "FirstKernelLaunch", 1);
    }


    if (p_tests & 0x2) {
        hipEventRecord(start);
-        hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+        hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
        stopTest(start, stop, "SecondKernelLaunch", 1);
    }

@@ -110,7 +110,7 @@ int main() {
        for (int t = 0; t < TEST_ITERS; t++) {
            hipEventRecord(start);
            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
                hipEventRecord(sync);
                hipEventSynchronize(sync);
            }
@@ -123,7 +123,7 @@ int main() {
        for (int t = 0; t < TEST_ITERS; t++) {
            hipEventRecord(start);
            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
                hipEventRecord(sync);
                hipEventSynchronize(sync);
            }
@@ -137,7 +137,7 @@ int main() {
        for (int t = 0; t < TEST_ITERS; t++) {
            hipEventRecord(start);
            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
            }
            stopTest(start, stop, "NullStreamASyncDispatchNoWait", DISPATCHES_PER_TEST);
        }
@@ -147,7 +147,7 @@ int main() {
        for (int t = 0; t < TEST_ITERS; t++) {
            hipEventRecord(start);
            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
            }
            stopTest(start, stop, "StreamASyncDispatchNoWait", DISPATCHES_PER_TEST);
        }
@@ -36,8 +36,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -86,7 +85,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                    gpuMatrix, WIDTH);

@@ -21,8 +21,7 @@ In order to use the HIP framework, we need to add the "hip_runtime.h" header fil
 ## Device-side code
 We will work on device side code first, Here is simple example showing a snippet of HIP device side code:

-`__global__ void matrixTranspose(hipLaunchParm lp,            `
-`                                float *out,                  `
+`__global__ void matrixTranspose(float *out,                  `
 `                                float *in,                   `
 `                                const int width,             `
 `                                const int height)            `
@@ -41,11 +40,9 @@ other function-type qualifiers are:
 `__host__` can combine with `__device__`, in which case the function compiles for both the host and device. These functions cannot use the HIP grid coordinate functions (for example, "hipThreadIdx_x", will talk about it latter). A possible workaround is to pass the necessary coordinate info as an argument to the function.
 `__host__` cannot combine with `__global__`.

-`__global__` functions are often referred to as *kernels, and calling one is termed *launching the kernel*. 
+`__global__` functions are often referred to as *kernels*, and calling one is termed *launching the kernel*. 

-Next keyword is `void`. HIP `__global__` functions must have a `void` return type, and the first parameter to a HIP `__global__` function must have the type `hipLaunchParm`, which is for execution configuration. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute.
-
-After `hipLaunchParm`, Kernel arguments follows next(i.e.,  `float *out, float *in, const int width, const int height`).
+Next keyword is `void`. HIP `__global__` functions must have a `void` return type. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute.

 The kernel function begins with 
 `    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;`
@@ -63,15 +60,15 @@ We allocated memory to the Matrix on host side by using malloc and initiallized
 here the first parameter is the destination pointer, second is the source pointer, third is the size of memory copy and the last specify the direction on memory copy(which is in this case froom host to device). While in order to transfer memory from device to host, use `hipMemcpyDeviceToHost` and for device to device memory copy use `hipMemcpyDeviceToDevice`.

 Now, we'll see how to launch the kernel.
-`  hipLaunchKernel(matrixTranspose,                                                         `
+`  hipLaunchKernelGGL(matrixTranspose,                                                      `
 `                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),             `
 `                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),                          `
 `                  0, 0,                                                                    `
 `                  gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT);                          `

 HIP introduces a standard C++ calling convention to pass the execution configuration to the kernel (this convention replaces the `Cuda <<< >>>` syntax). In HIP,
- Kernels launch with the `"hipLaunchKernel"` function
- The first five parameters to hipLaunchKernel are the following:
+- Kernels launch with the `"hipLaunchKernelGGL"` function
+- The first five parameters to hipLaunchKernelGGL are the following:
   - **symbol kernelName**: the name of the kernel to launch.  To support template kernels which contains "," use the HIP_KERNEL_NAME macro. In current application it's "matrixTranspose".
   - **dim3 gridDim**: 3D-grid dimensions specifying the number of blocks to launch. In MatrixTranspose sample, it's "dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y)". 
   - **dim3 blockDim**: 3D-block dimensions specifying the number of threads in each block.In MatrixTranspose sample, it's "dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y)". 
@@ -34,8 +34,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -103,7 +102,7 @@ int main() {
    hipEventRecord(start, NULL);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                    gpuMatrix, WIDTH);

@@ -36,8 +36,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -86,7 +85,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                    gpuMatrix, WIDTH);

@@ -41,7 +41,7 @@ Now, we'll have the operation for which we need to compute the time taken. For t
 `  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);`

 and for kernel execution time we'll use `hipKernelLaunch`:
-`  hipLaunchKernel(matrixTranspose,                                                    `
+`  hipLaunchKernelGGL(matrixTranspose,                                                    `
 `                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),        `
 `                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),                     `
 `                  0, 0,                                                               `
@@ -34,8 +34,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -103,7 +102,7 @@ int main() {
    hipEventRecord(start, NULL);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                    gpuMatrix, WIDTH);

@@ -41,8 +41,7 @@ int startTriggerIteration = -1;
 int stopTriggerIteration = -1;

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -98,7 +97,7 @@ void runGPU(float* Matrix, float* TransposeMatrix, float* gpuMatrix, float* gpuT
        hipEventRecord(start, NULL);

        // Lauching kernel from host
-        hipLaunchKernel(matrixTranspose,
+        hipLaunchKernelGGL(matrixTranspose,
                        dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                        dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                        gpuMatrix, WIDTH);
@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    __shared__ float sharedMem[WIDTH * WIDTH];

    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
@@ -91,7 +90,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                    gpuMatrix, WIDTH);

@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;

    float val = in[x];
@@ -88,7 +87,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
+    hipLaunchKernelGGL(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
                    gpuTransposeMatrix, gpuMatrix, WIDTH);

    // Memory transfer from device to host
@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
    float val = in[y * width + x];
@@ -86,7 +85,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0,
+    hipLaunchKernelGGL(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0,
                    gpuTransposeMatrix, gpuMatrix, WIDTH);

    // Memory transfer from device to host
@@ -25,7 +25,7 @@ Shared memory is way more faster than that of global and constant memory and acc
 here the first parameter is the data type while the second one is the variable name.

 The other important change is:
-`  hipLaunchKernel(matrixTranspose,                                                   `
+`  hipLaunchKernelGGL(matrixTranspose,                                                   `
                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                  sizeof(float)*WIDTH*WIDTH, 0,
@@ -33,8 +33,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    // declare dynamic shared memory
    HIP_DYNAMIC_SHARED(float, sharedMem);

@@ -90,7 +89,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
                    0, gpuTransposeMatrix, gpuMatrix, WIDTH);

@@ -26,15 +26,15 @@ and create stream using `hipStreamCreate` as follows:
 `  for(int i=0;i<num_streams;i++)                                                `
 `  hipStreamCreate(&streams[i]);                                                 `

-and while kernel launch, we make the following changes in 5th parameter to hipLaunchKernel(having 0 as the default stream value):
+and while kernel launch, we make the following changes in 5th parameter to hipLaunchKernelGGL(having 0 as the default stream value):

-`  hipLaunchKernel(matrixTranspose_static_shared,                               `
+`  hipLaunchKernelGGL(matrixTranspose_static_shared,                             `
                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                    0, streams[0],
                    gpuTransposeMatrix[0], data[0], width);

-`  hipLaunchKernel(matrixTranspose_dynamic_shared,                              `
+`  hipLaunchKernelGGL(matrixTranspose_dynamic_shared,                            `
                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                    sizeof(float)*WIDTH*WIDTH, streams[1],
@@ -30,7 +30,7 @@ THE SOFTWARE.

 using namespace std;

-__global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_static_shared(float* out, float* in,
                                              const int width) {
    __shared__ float sharedMem[WIDTH * WIDTH];

@@ -44,7 +44,7 @@ __global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, floa
    out[y * width + x] = sharedMem[y * width + x];
 }

-__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_dynamic_shared(float* out, float* in,
                                               const int width) {
    // declare dynamic shared memory
    HIP_DYNAMIC_SHARED(float, sharedMem)
@@ -71,12 +71,12 @@ void MultipleStream(float** data, float* randArray, float** gpuTransposeMatrix,
        hipMemcpyAsync(data[i], randArray, NUM * sizeof(float), hipMemcpyHostToDevice, streams[i]);
    }

-    hipLaunchKernel(matrixTranspose_static_shared,
+    hipLaunchKernelGGL(matrixTranspose_static_shared,
                    dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, streams[0],
                    gpuTransposeMatrix[0], data[0], width);

-    hipLaunchKernel(matrixTranspose_dynamic_shared,
+    hipLaunchKernelGGL(matrixTranspose_dynamic_shared,
                    dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
                    streams[1], gpuTransposeMatrix[1], data[1], width);
@@ -105,7 +105,7 @@ void disablePeer2Peer(int currentGpu, int peerGpu) {
 }


-__global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_static_shared(float* out, float* in,
                                              const int width) {
    __shared__ float sharedMem[WIDTH * WIDTH];

@@ -119,7 +119,7 @@ __global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, floa
    out[y * width + x] = sharedMem[y * width + x];
 }

-__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_dynamic_shared(float* out, float* in,
                                               const int width) {
    // declare dynamic shared memory
    HIP_DYNAMIC_SHARED(float, sharedMem)
@@ -170,7 +170,7 @@ int main() {
    hipMalloc((void**)&data[0], NUM * sizeof(float));
    hipMemcpy(data[0], randArray, NUM * sizeof(float), hipMemcpyHostToDevice);

-    hipLaunchKernel(matrixTranspose_static_shared,
+    hipLaunchKernelGGL(matrixTranspose_static_shared,
                    dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix[0],
                    data[0], width);
@@ -181,7 +181,7 @@ int main() {
    hipMalloc((void**)&data[1], NUM * sizeof(float));
    hipMemcpy(data[1], gpuTransposeMatrix[0], NUM * sizeof(float), hipMemcpyDeviceToDevice);

-    hipLaunchKernel(matrixTranspose_dynamic_shared,
+    hipLaunchKernelGGL(matrixTranspose_dynamic_shared,
                    dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
                    0, gpuTransposeMatrix[1], data[1], width);
@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    float val = in[x];

@@ -88,7 +87,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
+    hipLaunchKernelGGL(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
                    gpuTransposeMatrix, gpuMatrix, WIDTH);

    // Memory transfer from device to host
@@ -92,13 +92,19 @@ namespace hip_impl
        hipStream_t stream,
        void** kernarg)
    {
-        const auto it0 = functions().find(function_address);
+        auto it0 = functions().find(function_address);

        if (it0 == functions().cend()) {
-            throw runtime_error{
-                "No device code available for function: " +
-                name(function_address)
-            };
+            // Re-init device code maps once again to help locate kernels
+            // loaded after HIP runtime initialization via means such as
+            // dlopen().
+            it0 = functions(true).find(function_address);
+            if (it0 == functions().cend()) {
+                throw runtime_error{
+                    "No device code available for function: " +
+                    name(function_address)
+                };
+            }
        }

        auto agent = target_agent(stream);
@@ -31,9 +31,15 @@ THE SOFTWARE.
 //-------------------------------------------------------------------------------------------------
 // Stream
 //
+enum queue_priority
+{
+    priority_high = Kalmar::priority_high,
+    priority_normal = Kalmar::priority_normal,
+    priority_low = Kalmar::priority_low
+};

 //---
-hipError_t ihipStreamCreate(hipStream_t* stream, unsigned int flags) {
+hipError_t ihipStreamCreate(hipStream_t* stream, unsigned int flags, int priority) {
    ihipCtx_t* ctx = ihipGetTlsDefaultCtx();

    hipError_t e = hipSuccess;
@@ -53,7 +59,7 @@ hipError_t ihipStreamCreate(hipStream_t* stream, unsigned int flags) {
                // Obtain mutex access to the device critical data, release by destructor
                LockedAccessor_CtxCrit_t ctxCrit(ctx->criticalData());

-                auto istream = new ihipStream_t(ctx, acc.create_view(), flags);
+                auto istream = new ihipStream_t(ctx, acc.create_view(Kalmar::execute_in_order, Kalmar::queuing_mode_automatic, (Kalmar::queue_priority)priority), flags);

                ctxCrit->addStream(istream);
                *stream = istream;
@@ -73,16 +79,33 @@ hipError_t ihipStreamCreate(hipStream_t* stream, unsigned int flags) {
 hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags) {
    HIP_INIT_API(stream, flags);

-    return ihipLogStatus(ihipStreamCreate(stream, flags));
+    return ihipLogStatus(ihipStreamCreate(stream, flags, priority_normal));
 }

 //---
 hipError_t hipStreamCreate(hipStream_t* stream) {
    HIP_INIT_API(stream);

-    return ihipLogStatus(ihipStreamCreate(stream, hipStreamDefault));
+    return ihipLogStatus(ihipStreamCreate(stream, hipStreamDefault, priority_normal));
 }

+//---
+hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority) {
+    HIP_INIT_API(stream, flags, priority);
+
+    // clamp priority to range [priority_high:priority_low]
+    priority = (priority < priority_high ? priority_high : (priority > priority_low ? priority_low : priority));
+    return ihipLogStatus(ihipStreamCreate(stream, flags, priority));
+}
+
+//---
+hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority) {
+    HIP_INIT_API(leastPriority, greatestPriority);
+
+    if (leastPriority != NULL) *leastPriority = priority_low;
+    if (greatestPriority != NULL) *greatestPriority = priority_high;
+    return ihipLogStatus(hipSuccess);
+}

 hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags) {
    HIP_INIT_SPECIAL_API(TRACE_SYNC, stream, event, flags);
@@ -191,6 +214,22 @@ hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int* flags) {
 }


+//--
+hipError_t hipStreamGetPriority(hipStream_t stream, int* priority) {
+    HIP_INIT_API(stream, priority);
+
+    if (priority == NULL) {
+        return ihipLogStatus(hipErrorInvalidValue);
+    } else if (stream == hipStreamNull) {
+        return ihipLogStatus(hipErrorInvalidResourceHandle);
+    } else {
+        LockedAccessor_StreamCrit_t crit(stream->_criticalData);
+        *priority = crit->_av.get_queue_priority();
+        return ihipLogStatus(hipSuccess);
+    }
+}
+
+
 //---
 hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData,
                                unsigned int flags) {
@@ -263,7 +263,7 @@ hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResou
                break;
            case hipResourceTypeLinear:
                devPtr = pResDesc->res.linear.devPtr;
-                imageDescriptor.width = pResDesc->res.linear.sizeInBytes;
+                imageDescriptor.width = pResDesc->res.linear.sizeInBytes/((pResDesc->res.linear.desc.x + pResDesc->res.linear.desc.y + pResDesc->res.linear.desc.z + pResDesc->res.linear.desc.w)/8);
                imageDescriptor.height = 1;
                imageDescriptor.depth = 0;
                imageDescriptor.array_size = 0;
@@ -74,11 +74,15 @@ vector<string> copy_names_of_undefined_symbols(const symbol_section_accessor& se
 }

 const std::unordered_map<std::string, std::pair<ELFIO::Elf64_Addr, ELFIO::Elf_Xword>>&
-symbol_addresses() {
+symbol_addresses(bool rebuild = false) {
    static unordered_map<string, pair<Elf64_Addr, Elf_Xword>> r;
    static once_flag f;

-    call_once(f, []() {
+    auto cons = [rebuild]() {
+        if (rebuild) {
+            r.clear();
+        }
+
        dl_iterate_phdr(
            [](dl_phdr_info* info, size_t, void*) {
                static constexpr const char self[] = "/proc/self/exe";
@@ -108,7 +112,12 @@ symbol_addresses() {
                return 0;
            },
            nullptr);
-    });
+    };
+
+    call_once(f, cons);
+    if (rebuild) {
+        cons();
+    }

    return r;
 }
@@ -166,21 +175,34 @@ vector<char> code_object_blob_for_process() {
    return r;
 }

-const unordered_map<hsa_isa_t, vector<vector<char>>>& code_object_blobs() {
+const unordered_map<hsa_isa_t, vector<vector<char>>>& code_object_blobs(bool rebuild = false) {
    static unordered_map<hsa_isa_t, vector<vector<char>>> r;
    static once_flag f;

-    call_once(f, []() {
+    auto cons = [rebuild]() {
+        // names of shared libraries who .kernel sections already loaded
+        static unordered_set<string> lib_names;
        static vector<vector<char>> blobs{code_object_blob_for_process()};

+        if (rebuild) {
+            r.clear();
+            blobs.clear();
+        }
+
        dl_iterate_phdr(
            [](dl_phdr_info* info, std::size_t, void*) {
                elfio tmp;
-                if (tmp.load(info->dlpi_name)) {
+                if ((lib_names.find(info->dlpi_name) == lib_names.end()) &&
+                    (tmp.load(info->dlpi_name))) {
                    const auto it = find_section_if(
                        tmp, [](const section* x) { return x->get_name() == ".kernel"; });

-                    if (it) blobs.emplace_back(it->get_data(), it->get_data() + it->get_size());
+                    if (it) {
+                        blobs.emplace_back(
+                            it->get_data(), it->get_data() + it->get_size());
+                        // register the shared library as already loaded
+                        lib_names.emplace(info->dlpi_name);
+                    }
                }
                return 0;
            },
@@ -194,7 +216,13 @@ const unordered_map<hsa_isa_t, vector<vector<char>>>& code_object_blobs() {
                }
            }
        }
-    });
+    };
+
+
+    call_once(f, cons);
+    if (rebuild) {
+        cons();
+    }

    return r;
 }
@@ -216,13 +244,13 @@ vector<pair<uintptr_t, string>> function_names_for(const elfio& reader, section*
    return r;
 }

-const vector<pair<uintptr_t, string>>& function_names_for_process() {
+const vector<pair<uintptr_t, string>>& function_names_for_process(bool rebuild = false) {
    static constexpr const char self[] = "/proc/self/exe";

    static vector<pair<uintptr_t, string>> r;
    static once_flag f;

-    call_once(f, []() {
+    auto cons = [rebuild]() {
        elfio reader;

        if (!reader.load(self)) {
@@ -233,16 +261,26 @@ const vector<pair<uintptr_t, string>>& function_names_for_process() {
            find_section_if(reader, [](const section* x) { return x->get_type() == SHT_SYMTAB; });

        if (symtab) r = function_names_for(reader, symtab);
-    });
+    };
+
+    call_once(f, cons);
+    if (rebuild) {
+        cons();
+    }

    return r;
 }

-const unordered_map<string, vector<hsa_executable_symbol_t>>& kernels() {
+const unordered_map<string, vector<hsa_executable_symbol_t>>& kernels(bool rebuild = false) {
    static unordered_map<string, vector<hsa_executable_symbol_t>> r;
    static once_flag f;

-    call_once(f, []() {
+    auto cons = [rebuild]() {
+        if (rebuild) {
+            r.clear();
+            executables(rebuild);
+        }
+
        static const auto copy_kernels = [](hsa_executable_t, hsa_agent_t,
                                            hsa_executable_symbol_t s, void*) {
            if (type(s) == HSA_SYMBOL_KIND_KERNEL) r[name(s)].push_back(s);
@@ -256,7 +294,12 @@ const unordered_map<string, vector<hsa_executable_symbol_t>>& kernels() {
                                                     copy_kernels, nullptr);
            }
        }
-    });
+    };
+
+    call_once(f, cons);
+    if (rebuild) {
+        cons();
+    }

    return r;
 }
@@ -295,13 +338,19 @@ void load_code_object_and_freeze_executable(

 namespace hip_impl {
 const unordered_map<hsa_agent_t, vector<hsa_executable_t>>&
-executables() {  // TODO: This leaks the hsa_executable_ts, it should use RAII.
+executables(bool rebuild) {  // TODO: This leaks the hsa_executable_ts, it should use RAII.
    static unordered_map<hsa_agent_t, vector<hsa_executable_t>> r;
    static once_flag f;

-    call_once(f, []() {
+    auto cons = [rebuild]() {
        static const auto accelerators = hc::accelerator::get_all();

+        if (rebuild) {
+            // do NOT clear r so we reuse instances of hsa_executable_t
+            // created previously
+            code_object_blobs(rebuild);
+        }
+
        for (auto&& acc : accelerators) {
            auto agent = static_cast<hsa_agent_t*>(acc.get_hsa_agent());

@@ -335,17 +384,29 @@ executables() {  // TODO: This leaks the hsa_executable_ts, it should use RAII.
                                   },
                                   agent);
        }
-    });
+    };
+
+    call_once(f, cons);
+    if (rebuild) {
+        cons();
+    }

    return r;
 }

-const unordered_map<uintptr_t, string>& function_names() {
+const unordered_map<uintptr_t, string>& function_names(bool rebuild) {
    static unordered_map<uintptr_t, string> r{function_names_for_process().cbegin(),
                                              function_names_for_process().cend()};
    static once_flag f;

-    call_once(f, []() {
+    auto cons = [rebuild]() {
+        if (rebuild) {
+            r.clear();
+            function_names_for_process(rebuild);
+            r.insert(function_names_for_process().cbegin(),
+                     function_names_for_process().cend());
+        }
+
        dl_iterate_phdr(
            [](dl_phdr_info* info, size_t, void*) {
                elfio tmp;
@@ -365,16 +426,32 @@ const unordered_map<uintptr_t, string>& function_names() {
                return 0;
            },
            nullptr);
-    });
+    };
+
+    call_once(f, cons);
+    if (rebuild) {
+        static mutex mtx;
+        lock_guard<mutex> lck{mtx};
+        cons();
+    }

    return r;
 }

-const unordered_map<uintptr_t, vector<pair<hsa_agent_t, Kernel_descriptor>>>& functions() {
+const unordered_map<uintptr_t, vector<pair<hsa_agent_t, Kernel_descriptor>>>& functions(bool rebuild) {
    static unordered_map<uintptr_t, vector<pair<hsa_agent_t, Kernel_descriptor>>> r;
    static once_flag f;

-    call_once(f, []() {
+    auto cons = [rebuild]() {
+        if (rebuild) {
+            // do NOT clear r so we reuse instances of pair<hsa_agent_t, Kernel_descriptor>
+            // created previously
+
+            function_names(rebuild);
+            kernels(rebuild);
+            globals(rebuild);
+        }
+
        for (auto&& function : function_names()) {
            const auto it = kernels().find(function.second);

@@ -386,15 +463,34 @@ const unordered_map<uintptr_t, vector<pair<hsa_agent_t, Kernel_descriptor>>>& fu
                }
            }
        }
-    });
+    };
+
+    call_once(f, cons);
+    if (rebuild) {
+        static mutex mtx;
+        lock_guard<mutex> lck{mtx};
+        cons();
+    }

    return r;
 }

-unordered_map<string, void*>& globals() {
+unordered_map<string, void*>& globals(bool rebuild) {
    static unordered_map<string, void*> r;
    static once_flag f;
-    call_once(f, []() { r.reserve(symbol_addresses().size()); });
+    auto cons =[rebuild]() {
+        if (rebuild) {
+            r.clear();
+            symbol_addresses(rebuild);
+        }
+
+        r.reserve(symbol_addresses().size());
+    };
+
+    call_once(f, cons);
+    if (rebuild) {
+        cons();
+    }

    return r;
 }
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #define _SIZE sizeof(int) * 1024 * 1024
 #define NUM_STREAMS 2

-__global__ void Iter(hipLaunchParm lp, int* Ad, int num) {
+__global__ void Iter(int* Ad, int num) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    // Kernel loop designed to execute very slowly... ... ...   so we can test timing-related
    // behavior below
@@ -58,7 +58,7 @@ int main() {
        HIPCHECK(hipMemcpyAsync(Ad[i], A[i], _SIZE, hipMemcpyHostToDevice, stream[i]));
    }
    for (int i = 0; i < NUM_STREAMS; i++) {
-        hipLaunchKernel(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, stream[i], Ad[i], 1 << 30);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, stream[i], Ad[i], 1 << 30);
    }
    for (int i = 0; i < NUM_STREAMS; i++) {
        HIPCHECK(hipMemcpyAsync(A[i], Ad[i], _SIZE, hipMemcpyDeviceToHost, stream[i]));
@@ -14,12 +14,12 @@
 */


-__global__ void cpy(hipLaunchParm lp, uint32_t* Out, uint32_t* In) {
+__global__ void cpy(uint32_t* Out, uint32_t* In) {
    int tx = threadIdx.x;
    memcpy(Out + tx, In + tx, sizeof(uint32_t));
 }

-__global__ void set(hipLaunchParm lp, uint32_t* ptr, uint8_t val, size_t size) {
+__global__ void set(uint32_t* ptr, uint8_t val, size_t size) {
    int tx = threadIdx.x;
    memset(ptr + tx, val, sizeof(uint32_t));
 }
@@ -39,7 +39,7 @@ int main() {
    hipMalloc((void**)&Bd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);

-    hipLaunchKernel(cpy, dim3(1), dim3(LEN), 0, 0, Bd, Ad);
+    hipLaunchKernelGGL(cpy, dim3(1), dim3(LEN), 0, 0, Bd, Ad);

    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    for (int i = LEN - 16; i < LEN; i++) {
@@ -47,7 +47,7 @@ int main() {
            return 0;
        }
    }
-    hipLaunchKernel(set, dim3(1), dim3(LEN), 0, 0, Bd, 0x1, LEN);
+    hipLaunchKernelGGL(set, dim3(1), dim3(LEN), 0, 0, Bd, 0x1, LEN);

    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    for (int i = LEN - 16; i < LEN; i++) {
@@ -64,11 +64,11 @@ __device__ void double_precision_intrinsics() {
    __fma_rz(1.0, 2.0, 3.0);
 }

-__global__ void compileDoublePrecisionIntrinsics(hipLaunchParm lp, int ignored) {
+__global__ void compileDoublePrecisionIntrinsics(int ignored) {
    double_precision_intrinsics();
 }

 int main() {
-    hipLaunchKernel(compileDoublePrecisionIntrinsics, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
+    hipLaunchKernelGGL(compileDoublePrecisionIntrinsics, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
    passed();
 }
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #define SIZE LEN << 2


-__global__ void floatMath(hipLaunchParm lp, float* In, float* Out) {
+__global__ void floatMath(float* In, float* Out) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    Out[tid] = __cosf(In[tid]);
    Out[tid] = __exp10f(Out[tid]);
@@ -57,6 +57,6 @@ int main() {
    float *Ind, *Outd;
    hipMalloc((void**)&Ind, SIZE);
    hipMalloc((void**)&Outd, SIZE);
-    hipLaunchKernel(floatMath, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0, Ind, Outd);
+    hipLaunchKernelGGL(floatMath, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0, Ind, Outd);
    passed();
 }
@@ -66,9 +66,9 @@ __device__ void integer_intrinsics() {
    assert(1);
 }

-__global__ void compileIntegerIntrinsics(hipLaunchParm lp, int ignored) { integer_intrinsics(); }
+__global__ void compileIntegerIntrinsics(int ignored) { integer_intrinsics(); }

 int main() {
-    hipLaunchKernel(compileIntegerIntrinsics, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
+    hipLaunchKernelGGL(compileIntegerIntrinsics, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
    passed();
 }
@@ -31,12 +31,12 @@ THE SOFTWARE.

 #if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__

-__global__ void kernel_abs_int64(hipLaunchParm lp, long long *input, long long *output) {
+__global__ void kernel_abs_int64(long long *input, long long *output) {
    int tx = threadIdx.x;
    output[tx] = abs(input[tx]);
 }

-__global__ void kernel_lgamma_double(hipLaunchParm lp, double *input, double *output) {
+__global__ void kernel_lgamma_double(double *input, double *output) {
    int tx = threadIdx.x;
    output[tx] = lgamma(input[tx]);
 }
@@ -79,7 +79,7 @@ void check_lgamma_double() {
  hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);

  // launch kernel
-  hipLaunchKernel(kernel_lgamma_double, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
+  hipLaunchKernelGGL(kernel_lgamma_double, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);

  // copy outputs from device
  hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
@@ -127,7 +127,7 @@ void check_abs_int64() {
  hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);

  // launch kernel
-  hipLaunchKernel(kernel_abs_int64, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
+  hipLaunchKernelGGL(kernel_abs_int64, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);

  // copy outputs from device
  hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
@@ -80,12 +80,12 @@ __device__ void single_precision_intrinsics() {
 }


-__global__ void compileSinglePrecisionIntrinsics(hipLaunchParm lp, int ignored) {
+__global__ void compileSinglePrecisionIntrinsics(int ignored) {
    single_precision_intrinsics();
 }


 int main() {
-    hipLaunchKernel(compileSinglePrecisionIntrinsics, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
+    hipLaunchKernelGGL(compileSinglePrecisionIntrinsics, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
    passed();
 }
@@ -129,11 +129,11 @@ __device__ void single_precision_math_functions() {
    ynf(1, 1.0f);
 }

-__global__ void compileSinglePrecisionMathOnDevice(hipLaunchParm lp, int ignored) {
+__global__ void compileSinglePrecisionMathOnDevice(int ignored) {
    single_precision_math_functions();
 }

 int main() {
-    hipLaunchKernel(compileSinglePrecisionMathOnDevice, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
+    hipLaunchKernelGGL(compileSinglePrecisionMathOnDevice, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
    passed();
 }
@@ -84,7 +84,7 @@ __device__ __host__ std::complex<FloatT> calc(std::complex<FloatT> A,
 }

 template<typename FloatT>
-__global__ void kernel(hipLaunchParm lp, std::complex<FloatT>* A,
+__global__ void kernel(std::complex<FloatT>* A,
                       std::complex<FloatT>* B, std::complex<FloatT>* C,
                       enum CalcKind CK) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
@@ -114,7 +114,7 @@ void test() {
    // Run kernel for a calculation kind and verify by comparing with host
    // calculation result. Returns false if fails.
    auto test_fun = [&](enum CalcKind CK) {
-      hipLaunchKernel(kernel<FloatT>, dim3(1), dim3(LEN), 0, 0, Ad, Bd, Cd, CK);
+      hipLaunchKernelGGL(kernel<FloatT>, dim3(1), dim3(LEN), 0, 0, Ad, Bd, Cd, CK);
      hipMemcpy(C, Cd, sizeof(ComplexT)*LEN, hipMemcpyDeviceToHost);
      for (int i = 0; i < LEN; i++) {
        ComplexT Expected = calc(A[i], B[i], CK);
@@ -31,74 +31,74 @@ THE SOFTWARE.
 #define N 512
 #define SIZE N * sizeof(double)

-__global__ void test_sincos(hipLaunchParm lp, double* a, double* b, double* c) {
+__global__ void test_sincos(double* a, double* b, double* c) {
    int tid = threadIdx.x;
    sincos(a[tid], b + tid, c + tid);
 }

-__global__ void test_sincospi(hipLaunchParm lp, double* a, double* b, double* c) {
+__global__ void test_sincospi(double* a, double* b, double* c) {
    int tid = threadIdx.x;
    sincospi(a[tid], b + tid, c + tid);
 }

-__global__ void test_llrint(hipLaunchParm lp, double* a, long long int* b) {
+__global__ void test_llrint(double* a, long long int* b) {
    int tid = threadIdx.x;
    b[tid] = llrint(a[tid]);
 }

-__global__ void test_lrint(hipLaunchParm lp, double* a, long int* b) {
+__global__ void test_lrint(double* a, long int* b) {
    int tid = threadIdx.x;
    b[tid] = lrint(a[tid]);
 }

-__global__ void test_rint(hipLaunchParm lp, double* a, double* b) {
+__global__ void test_rint(double* a, double* b) {
    int tid = threadIdx.x;
    b[tid] = rint(a[tid]);
 }

-__global__ void test_llround(hipLaunchParm lp, double* a, long long int* b) {
+__global__ void test_llround(double* a, long long int* b) {
    int tid = threadIdx.x;
    b[tid] = llround(a[tid]);
 }

-__global__ void test_lround(hipLaunchParm lp, double* a, long int* b) {
+__global__ void test_lround(double* a, long int* b) {
    int tid = threadIdx.x;
    b[tid] = lround(a[tid]);
 }

-__global__ void test_rhypot(hipLaunchParm lp, double* a, double* b, double* c) {
+__global__ void test_rhypot(double* a, double* b, double* c) {
    int tid = threadIdx.x;
    c[tid] = rhypot(a[tid], b[tid]);
 }

-__global__ void test_norm3d(hipLaunchParm lp, double* a, double* b, double* c, double* d) {
+__global__ void test_norm3d(double* a, double* b, double* c, double* d) {
    int tid = threadIdx.x;
    d[tid] = norm3d(a[tid], b[tid], c[tid]);
 }

-__global__ void test_norm4d(hipLaunchParm lp, double* a, double* b, double* c, double* d,
+__global__ void test_norm4d(double* a, double* b, double* c, double* d,
                            double* e) {
    int tid = threadIdx.x;
    e[tid] = norm4d(a[tid], b[tid], c[tid], d[tid]);
 }

-__global__ void test_rnorm3d(hipLaunchParm lp, double* a, double* b, double* c, double* d) {
+__global__ void test_rnorm3d(double* a, double* b, double* c, double* d) {
    int tid = threadIdx.x;
    d[tid] = rnorm3d(a[tid], b[tid], c[tid]);
 }

-__global__ void test_rnorm4d(hipLaunchParm lp, double* a, double* b, double* c, double* d,
+__global__ void test_rnorm4d(double* a, double* b, double* c, double* d,
                             double* e) {
    int tid = threadIdx.x;
    e[tid] = rnorm4d(a[tid], b[tid], c[tid], d[tid]);
 }

-__global__ void test_rnorm(hipLaunchParm lp, double* a, double* b) {
+__global__ void test_rnorm(double* a, double* b) {
    int tid = threadIdx.x;
    b[tid] = rnorm(N, a);
 }

-__global__ void test_erfinv(hipLaunchParm lp, double* a, double* b) {
+__global__ void test_erfinv(double* a, double* b) {
    int tid = threadIdx.x;
    b[tid] = erf(erfinv(a[tid]));
 }
@@ -115,7 +115,7 @@ bool run_sincos() {
    hipMalloc((void**)&Bd, SIZE);
    hipMalloc((void**)&Cd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_sincos, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
+    hipLaunchKernelGGL(test_sincos, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
@@ -157,7 +157,7 @@ bool run_sincospi() {
    hipMalloc((void**)&Bd, SIZE);
    hipMalloc((void**)&Cd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_sincospi, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
+    hipLaunchKernelGGL(test_sincospi, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
@@ -199,7 +199,7 @@ bool run_llrint() {
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, N * sizeof(long long int));
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_llrint, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_llrint, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, N * sizeof(long long int), hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -233,7 +233,7 @@ bool run_lrint() {
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, N * sizeof(long int));
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_lrint, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_lrint, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, N * sizeof(long int), hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -266,7 +266,7 @@ bool run_rint() {
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rint, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_rint, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -300,7 +300,7 @@ bool run_llround() {
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, N * sizeof(long long int));
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_llround, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_llround, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, N * sizeof(long long int), hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -333,7 +333,7 @@ bool run_lround() {
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, N * sizeof(long int));
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_lround, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_lround, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, N * sizeof(long int), hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -376,7 +376,7 @@ bool run_norm3d() {
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_norm3d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
+    hipLaunchKernelGGL(test_norm3d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
    hipMemcpy(D, Dd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -425,7 +425,7 @@ bool run_norm4d() {
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Dd, D, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_norm4d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
+    hipLaunchKernelGGL(test_norm4d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
    hipMemcpy(E, Ed, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -469,7 +469,7 @@ bool run_rhypot() {
    hipMalloc((void**)&Cd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rhypot, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
+    hipLaunchKernelGGL(test_rhypot, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
    hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -512,7 +512,7 @@ bool run_rnorm3d() {
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rnorm3d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
+    hipLaunchKernelGGL(test_rnorm3d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
    hipMemcpy(D, Dd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -561,7 +561,7 @@ bool run_rnorm4d() {
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Dd, D, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rnorm4d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
+    hipLaunchKernelGGL(test_rnorm4d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
    hipMemcpy(E, Ed, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -602,7 +602,7 @@ bool run_rnorm() {
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rnorm, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_rnorm, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -634,7 +634,7 @@ bool run_erfinv() {
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_erfinv, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_erfinv, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -34,7 +34,7 @@ THE SOFTWARE.
 __device__ int globalIn[NUM];
 __device__ int globalOut[NUM];

-__global__ void Assign(hipLaunchParm lp, int* Out) {
+__global__ void Assign(int* Out) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    Out[tid] = globalIn[tid];
    globalOut[tid] = globalIn[tid];
@@ -63,7 +63,7 @@ int main() {
    hipStreamCreate(&stream);
    hipMemcpyToSymbolAsync(HIP_SYMBOL(globalIn), Am, SIZE, 0, hipMemcpyHostToDevice, stream);
    hipStreamSynchronize(stream);
-    hipLaunchKernel(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
+    hipLaunchKernelGGL(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
    hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost);
    hipMemcpyFromSymbolAsync(Cm, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost, stream);
    hipStreamSynchronize(stream);
@@ -78,7 +78,7 @@ int main() {
    }

    hipMemcpyToSymbol(HIP_SYMBOL(globalIn), A, SIZE, 0, hipMemcpyHostToDevice);
-    hipLaunchKernel(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
+    hipLaunchKernelGGL(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
    hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost);
    hipMemcpyFromSymbol(C, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost);
    for (int i = 0; i < NUM; i++) {
@@ -93,7 +93,7 @@ int main() {

    hipMemcpyToSymbolAsync(HIP_SYMBOL(globalIn), A, SIZE, 0, hipMemcpyHostToDevice, stream);
    hipStreamSynchronize(stream);
-    hipLaunchKernel(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
+    hipLaunchKernelGGL(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
    hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost);
    hipMemcpyFromSymbolAsync(C, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost, stream);
    hipStreamSynchronize(stream);
@@ -31,7 +31,7 @@ THE SOFTWARE.
 #define NUM 1024
 #define SIZE NUM * sizeof(float)

-__global__ void vAdd(hipLaunchParm lp, float* In1, float* In2, float* In3, float* In4, float* Out) {
+__global__ void vAdd(float* In1, float* In2, float* In3, float* In4, float* Out) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    In4[tid] = In1[tid] + In2[tid];
    __threadfence();
@@ -66,7 +66,7 @@ int main() {
    hipMemcpy(In3d, In3, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(In4d, In4, SIZE, hipMemcpyHostToDevice);

-    hipLaunchKernel(vAdd, dim3(32, 1, 1), dim3(32, 1, 1), 0, 0, In1d, In2d, In3d, In4d, Outd);
+    hipLaunchKernelGGL(vAdd, dim3(32, 1, 1), dim3(32, 1, 1), 0, 0, In1d, In2d, In3d, In4d, Outd);
    hipMemcpy(Out, Outd, SIZE, hipMemcpyDeviceToHost);
    assert(Out[10] == 2 * In1[10] + 2 * In2[10] + In3[10]);
    passed();
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #include <hip/device_functions.h>
 #define HIP_ASSERT(x) (assert((x) == hipSuccess))

-__global__ void warpvote(hipLaunchParm lp, int* device_any, int* device_all,
+__global__ void warpvote(int* device_any, int* device_all,
                         int Num_Warps_per_Block, int pshift) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    device_any[threadIdx.x >> pshift] = __any(tid - 77);
@@ -73,7 +73,7 @@ int main(int argc, char* argv[]) {
    HIP_ASSERT(hipMemcpy(device_any, host_any, sizeof(int), hipMemcpyHostToDevice));
    HIP_ASSERT(hipMemcpy(device_all, host_all, sizeof(int), hipMemcpyHostToDevice));

-    hipLaunchKernel(warpvote, dim3(Num_Blocks_per_Grid), dim3(Num_Threads_per_Block), 0, 0,
+    hipLaunchKernelGGL(warpvote, dim3(Num_Blocks_per_Grid), dim3(Num_Threads_per_Block), 0, 0,
                    device_any, device_all, Num_Warps_per_Block, pshift);


@@ -30,7 +30,7 @@ THE SOFTWARE.

 #define HIP_ASSERT(x) (assert((x) == hipSuccess))

-__global__ void gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,
+__global__ void gpu_ballot(unsigned int* device_ballot, int Num_Warps_per_Block,
                           int pshift) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int warp_num = threadIdx.x >> pshift;
@@ -69,7 +69,7 @@ int main(int argc, char* argv[]) {
    HIP_ASSERT(hipMemcpy(device_ballot, host_ballot, Num_Warps_per_Grid * sizeof(unsigned int),
                         hipMemcpyHostToDevice));

-    hipLaunchKernel(gpu_ballot, dim3(Num_Blocks_per_Grid), dim3(Num_Threads_per_Block), 0, 0,
+    hipLaunchKernelGGL(gpu_ballot, dim3(Num_Blocks_per_Grid), dim3(Num_Threads_per_Block), 0, 0,
                    device_ballot, Num_Warps_per_Block, pshift);


@@ -53,8 +53,7 @@ T bit_extract(T src0, unsigned int src1, unsigned int src2) {
  }
 }

-__global__ void HIP_kernel(hipLaunchParm lp,
-                           unsigned int* out32, unsigned int* in32_0,
+__global__ void HIP_kernel(unsigned int* out32, unsigned int* in32_0,
                           unsigned int* in32_1, unsigned int* in32_2,
                           unsigned long long int* out64, unsigned long long int* in64_0,
                           unsigned int* in64_1, unsigned int* in64_2) {
@@ -150,7 +149,7 @@ int main() {
    HIP_ASSERT(hipMemcpy(deviceSrc264, hostSrc264, NUM * sizeof(unsigned int), hipMemcpyHostToDevice));


-    hipLaunchKernel(HIP_kernel, dim3(num_blocks), dim3(num_threads_per_block),
+    hipLaunchKernelGGL(HIP_kernel, dim3(num_blocks), dim3(num_threads_per_block),
                       0, 0,
                       deviceOut32, deviceSrc032, deviceSrc132, deviceSrc232,
                       deviceOut64, deviceSrc064, deviceSrc164, deviceSrc264);
@@ -50,7 +50,7 @@ T bit_insert(T src0, T src1, unsigned int src2, unsigned int src3) {
  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
 }

-__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* out32,
+__global__ void HIP_kernel(unsigned int* out32,
                           unsigned int* in32_0, unsigned int* in32_1,
                           unsigned int* in32_2, unsigned int* in32_3,
                           unsigned long long int* out64, unsigned long long int* in64_0,
@@ -161,7 +161,7 @@ int main() {
    HIP_ASSERT(hipMemcpy(deviceSrc364, hostSrc364, NUM * sizeof(unsigned int), hipMemcpyHostToDevice));


-    hipLaunchKernel(HIP_kernel, dim3(num_blocks), dim3(num_threads_per_block),
+    hipLaunchKernelGGL(HIP_kernel, dim3(num_blocks), dim3(num_threads_per_block),
                       0, 0,
                       deviceOut32, deviceSrc032, deviceSrc132, deviceSrc232, deviceSrc332,
                       deviceOut64, deviceSrc064, deviceSrc164, deviceSrc264, deviceSrc364);
@@ -64,7 +64,7 @@ T bitreverse(T num) {
    return reverse_num;
 }

-__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,
+__global__ void HIP_kernel(unsigned int* a, unsigned int* b,
                           unsigned long long int* c, unsigned long long int* d, int width,
                           int height) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
@@ -124,7 +124,7 @@ int main() {
        hipMemcpy(deviceD, hostD, NUM * sizeof(unsigned long long int), hipMemcpyHostToDevice));


-    hipLaunchKernel(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB, deviceC,
                    deviceD, WIDTH, HEIGHT);

@@ -82,7 +82,7 @@ __device__ void test_ambiguity() {
  __clzll(ui);
 }

-__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b, unsigned int* c,
+__global__ void HIP_kernel(unsigned int* a, unsigned int* b, unsigned int* c,
                           unsigned long long int* d, int width, int height) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -138,7 +138,7 @@ int main() {
    HIP_ASSERT(
        hipMemcpy(deviceD, hostD, NUM * sizeof(unsigned long long int), hipMemcpyHostToDevice));

-    hipLaunchKernel(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB, deviceC,
                    deviceD, WIDTH, HEIGHT);

@@ -59,7 +59,7 @@ int lastbit(T a) {
 }


-__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b, unsigned int* c,
+__global__ void HIP_kernel(unsigned int* a, unsigned int* b, unsigned int* c,
                           unsigned long long int* d, int width, int height) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -117,7 +117,7 @@ int main() {
    HIP_ASSERT(
        hipMemcpy(deviceD, hostD, NUM * sizeof(unsigned long long int), hipMemcpyHostToDevice));

-    hipLaunchKernel(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB, deviceC,
                    deviceD, WIDTH, HEIGHT);

@@ -36,7 +36,7 @@ THE SOFTWARE.

 #define HIP_ASSERT(x) (assert((x) == hipSuccess))

-__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* mbcnt_lo, unsigned int* mbcnt_hi, unsigned int* lane_id) {
+__global__ void HIP_kernel(unsigned int* mbcnt_lo, unsigned int* mbcnt_hi, unsigned int* lane_id) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    mbcnt_lo[x] = __mbcnt_lo(0xFFFFFFFF, 0);
    mbcnt_hi[x] = __mbcnt_hi(0xFFFFFFFF, 0);
@@ -70,7 +70,7 @@ int main() {
    HIP_ASSERT(hipMalloc((void**)&device_mbcnt_hi, buffer_size));
    HIP_ASSERT(hipMalloc((void**)&device_lane_id, buffer_size));

-    hipLaunchKernel(HIP_kernel, dim3(num_blocks),
+    hipLaunchKernelGGL(HIP_kernel, dim3(num_blocks),
                    dim3(num_threads_per_block), 0, 0, device_mbcnt_lo, device_mbcnt_hi, device_lane_id);

    unsigned int* host_mbcnt_lo = (unsigned int*) malloc(buffer_size);
@@ -58,7 +58,7 @@ unsigned int popcountCPU(T value) {
    return ret;
 }

-__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b, unsigned int* c,
+__global__ void HIP_kernel(unsigned int* a, unsigned int* b, unsigned int* c,
                           unsigned long long int* d, int width, int height) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -117,7 +117,7 @@ int main() {
        hipMemcpy(deviceD, hostD, NUM * sizeof(unsigned long long int), hipMemcpyHostToDevice));


-    hipLaunchKernel(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB, deviceC,
                    deviceD, WIDTH, HEIGHT);

@@ -52,7 +52,7 @@ THE SOFTWARE.
 using namespace std;

 template <typename T>
-__global__ void vectoradd_float(hipLaunchParm lp, T* a, const T* bm, int width, int height)
+__global__ void vectoradd_float(T* a, const T* bm, int width, int height)

 {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
@@ -120,7 +120,7 @@ bool dataTypesRun() {
    HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM * sizeof(T), hipMemcpyHostToDevice));


-    hipLaunchKernel(vectoradd_float,
+    hipLaunchKernelGGL(vectoradd_float,
                    dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA,
                    static_cast<const T*>(deviceB), WIDTH, HEIGHT);
@@ -178,7 +178,7 @@ bool dataTypesRun2() {
    HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));

    HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM * sizeof(T), hipMemcpyHostToDevice));
-    hipLaunchKernel(vectoradd_float,
+    hipLaunchKernelGGL(vectoradd_float,
                    dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA,
                    static_cast<const T*>(deviceB), WIDTH, HEIGHT);
@@ -236,7 +236,7 @@ bool dataTypesRun4() {
    HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM * sizeof(T), hipMemcpyHostToDevice));


-    hipLaunchKernel(vectoradd_float,
+    hipLaunchKernelGGL(vectoradd_float,
                    dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA,
                    static_cast<const T*>(deviceB), WIDTH, HEIGHT);
@@ -40,7 +40,7 @@ THE SOFTWARE.

 #define TEST_DEBUG (0)

-__global__ void kernel_trig(hipLaunchParm lp, float* In, float* sin_d, float* cos_d, float* tan_d,
+__global__ void kernel_trig(float* In, float* sin_d, float* cos_d, float* tan_d,
                            float* sin_pd, float* cos_pd) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    sin_d[tid] = sinf(In[tid]);
@@ -75,7 +75,7 @@ int main() {
    HIP_ASSERT(hipMalloc((void**)&cos_pd, SIZE));

    hipMemcpy(In_d, In, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(kernel_trig, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0,
+    hipLaunchKernelGGL(kernel_trig, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0,
                    In_d, sin_d, cos_d, tan_d,
                    sin_pd, cos_pd);
    hipMemcpy(sin_h, sin_d, SIZE, hipMemcpyDeviceToHost);
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #define ITER 1<<20
 #define SIZE 1024*1024*sizeof(int)

-__global__ void Iter(hipLaunchParm lp, int *Ad){
+__global__ void Iter(int *Ad){
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    if(tx == 0){
        for(int i=0;i<ITER;i++){
@@ -49,7 +49,7 @@ int main(){
    dim3 dimGrid, dimBlock;
    dimGrid.x = 1, dimGrid.y =1, dimGrid.z = 1;
    dimBlock.x = 1, dimBlock.y = 1, dimGrid.z = 1;
-    hipLaunchKernel(HIP_KERNEL_NAME(Iter), dimGrid, dimBlock, 0, 0, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Iter), dimGrid, dimBlock, 0, 0, Ad);
    hipMemcpy(&A, Ad, SIZE, hipMemcpyDeviceToHost);
    passed();
 }
@@ -30,7 +30,7 @@ THE SOFTWARE.
 #include "test_common.h"

 template <typename T>
-__global__ void testExternSharedKernel(hipLaunchParm lp, const T* A_d, const T* B_d, T* C_d,
+__global__ void testExternSharedKernel(const T* A_d, const T* B_d, T* C_d,
                                       size_t numElements, size_t groupElements) {
    // declare dynamic shared memory
 #if defined(__HIP_PLATFORM_HCC__)
@@ -114,7 +114,7 @@ void testExternShared(size_t N, size_t groupElements) {
    size_t groupMemBytes = groupElements * sizeof(T);

    // launch kernel with dynamic shared memory
-    hipLaunchKernel(HIP_KERNEL_NAME(testExternSharedKernel<T>), dim3(blocks), dim3(threadsPerBlock),
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(testExternSharedKernel<T>), dim3(blocks), dim3(threadsPerBlock),
                    groupMemBytes, 0, A_d, B_d, C_d, N, groupElements);

    HIPCHECK(hipDeviceSynchronize());
@@ -32,7 +32,7 @@ THE SOFTWARE.
 #define LEN 16 * 1024
 #define SIZE LEN * 4

-__global__ void vectorAdd(hipLaunchParm lp, float* Ad, float* Bd) {
+__global__ void vectorAdd(float* Ad, float* Bd) {
    HIP_DYNAMIC_SHARED(float, sBd);
    int tx = threadIdx.x;
    for (int i = 0; i < LEN / 64; i++) {
@@ -53,7 +53,7 @@ int main() {
    hipMalloc(&Bd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(vectorAdd, dim3(1, 1, 1), dim3(64, 1, 1), SIZE, 0, Ad, Bd);
+    hipLaunchKernelGGL(vectorAdd, dim3(1, 1, 1), dim3(64, 1, 1), SIZE, 0, Ad, Bd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    for (int i = 0; i < LEN; i++) {
        assert(B[i] > 1.0f && B[i] < 3.0f);
@@ -25,10 +25,10 @@ THE SOFTWARE.

 #include "test_common.h"

-__global__ void Empty(hipLaunchParm lp, int param) {}
+__global__ void Empty(int param) {}

 int main() {
-    hipLaunchKernel(HIP_KERNEL_NAME(Empty), dim3(1), dim3(1), 0, 0, 0);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Empty), dim3(1), dim3(1), 0, 0, 0);
    hipDeviceSynchronize();
    passed();
 }
@@ -37,7 +37,7 @@ __device__ int foo(int i) { return i + 1; }
 //---
 // Syntax we would like to support with GRID_LAUNCH enabled:
 template <typename T>
-__global__ void vectorADD2(hipLaunchParm lp, T* A_d, T* B_d, T* C_d, size_t N) {
+__global__ void vectorADD2(T* A_d, T* B_d, T* C_d, size_t N) {
    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
    size_t stride = blockDim.x * gridDim.x;

@@ -63,7 +63,7 @@ int test_gl2(size_t N) {
    HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
    HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));

-    hipLaunchKernel(vectorADD2, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);
+    hipLaunchKernelGGL(vectorADD2, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);

    HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));

@@ -32,7 +32,7 @@ THE SOFTWARE.
 #include <test_common.h>

 #ifdef __HCC__
-    #include <hc.hpp>
+#include <hc.hpp>
 #endif

 // cudaA
@@ -916,7 +916,7 @@ int main() {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(vAdd), dim3(1024), 1, 0, 0, Ad);
    hipLaunchKernelGGL(HIP_KERNEL_NAME(vAdd), dim3(1024), dim3(1), 0, 0, Ad);

-    // Test: Passing hipLaunchKernel inside another macro:
+    // Test: Passing hipLaunchKernelGGL inside another macro:
    float e0;
    GPU_PRINT_TIME(hipLaunchKernelGGL(vAdd, dim3(1024),
                   dim3(1), 0, 0, Ad), e0, j);
@@ -924,7 +924,7 @@ int main() {
                   dim3(1), 0, 0, Ad)), e0, j);

 #ifdef EXTRA_PARENS_1
-    // Don't wrap hipLaunchKernel in extra set of parens:
+    // Don't wrap hipLaunchKernelGGL in extra set of parens:
    GPU_PRINT_TIME((hipLaunchKernelGGL(vAdd, dim3(1024),
                    dim3(1), 0, 0, Ad)), e0, j);
 #endif
@@ -27,10 +27,10 @@ THE SOFTWARE.

 #include "test_common.h"

-__global__ void run_printf(hipLaunchParm lp) { printf("Hello World\n"); }
+__global__ void run_printf() { printf("Hello World\n"); }

 int main() {
-    hipLaunchKernel(HIP_KERNEL_NAME(run_printf), dim3(1), dim3(1), 0, 0);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(run_printf), dim3(1), dim3(1), 0, 0);
    hipDeviceSynchronize();
    passed();
 }
@@ -35,7 +35,7 @@ THE SOFTWARE.

 __constant__ int Value[LEN];

-__global__ void Get(hipLaunchParm lp, int* Ad) {
+__global__ void Get(int* Ad) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    Ad[tid] = Value[tid];
 }
@@ -52,7 +52,7 @@ int main() {
    HIP_ASSERT(hipMalloc((void**)&Ad, SIZE));

    HIP_ASSERT(hipMemcpyToSymbol(HIP_SYMBOL(Value), A, SIZE, 0, hipMemcpyHostToDevice));
-    hipLaunchKernel(Get, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
+    hipLaunchKernelGGL(Get, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
    HIP_ASSERT(hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost));

    for (unsigned i = 0; i < LEN; i++) {
@@ -32,12 +32,12 @@ THE SOFTWARE.
 #define NUM 1024
 #define SIZE NUM * 8

-__global__ void Alloc(hipLaunchParm lp, uint64_t* Ptr) {
+__global__ void Alloc(uint64_t* Ptr) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    Ptr[tid] = (uint64_t)malloc(128);
 }

-__global__ void Free(hipLaunchParm lp, uint64_t* Ptr) {
+__global__ void Free(uint64_t* Ptr) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    free((void*)Ptr[tid]);
 }
@@ -54,10 +54,10 @@ int main() {
        HIP_ASSERT(hipSetDevice(i));
        HIP_ASSERT(hipMalloc((void**)&dPtr, SIZE));
        HIP_ASSERT(hipMemcpy(dPtr, hPtr, SIZE, hipMemcpyHostToDevice));
-        hipLaunchKernel(Alloc, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, dPtr);
+        hipLaunchKernelGGL(Alloc, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, dPtr);
        HIP_ASSERT(hipMemcpy(hPtr, dPtr, SIZE, hipMemcpyDeviceToHost));
        assert(hPtr[0] != 0);
-        hipLaunchKernel(Free, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, dPtr);
+        hipLaunchKernelGGL(Free, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, dPtr);
        HIP_ASSERT(hipFree(dPtr));
        for (uint32_t i = 1; i < NUM; i++) {
            assert(hPtr[i] == hPtr[i - 1] + 4096);
@@ -34,52 +34,52 @@ THE SOFTWARE.
 #define LEN11 11 * 4
 #define LEN12 12 * 4

-__global__ void MemCpy8(hipLaunchParm lp, uint8_t* In, uint8_t* Out) {
+__global__ void MemCpy8(uint8_t* In, uint8_t* Out) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memcpy(Out + tid * 8, In + tid * 8, 8);
 }

-__global__ void MemCpy9(hipLaunchParm lp, uint8_t* In, uint8_t* Out) {
+__global__ void MemCpy9(uint8_t* In, uint8_t* Out) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memcpy(Out + tid * 9, In + tid * 9, 9);
 }

-__global__ void MemCpy10(hipLaunchParm lp, uint8_t* In, uint8_t* Out) {
+__global__ void MemCpy10(uint8_t* In, uint8_t* Out) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memcpy(Out + tid * 10, In + tid * 10, 10);
 }

-__global__ void MemCpy11(hipLaunchParm lp, uint8_t* In, uint8_t* Out) {
+__global__ void MemCpy11(uint8_t* In, uint8_t* Out) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memcpy(Out + tid * 11, In + tid * 11, 11);
 }

-__global__ void MemCpy12(hipLaunchParm lp, uint8_t* In, uint8_t* Out) {
+__global__ void MemCpy12(uint8_t* In, uint8_t* Out) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memcpy(Out + tid * 12, In + tid * 12, 12);
 }

-__global__ void MemSet8(hipLaunchParm lp, uint8_t* In) {
+__global__ void MemSet8(uint8_t* In) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memset(In + tid * 8, 1, 8);
 }

-__global__ void MemSet9(hipLaunchParm lp, uint8_t* In) {
+__global__ void MemSet9(uint8_t* In) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memset(In + tid * 9, 1, 9);
 }

-__global__ void MemSet10(hipLaunchParm lp, uint8_t* In) {
+__global__ void MemSet10(uint8_t* In) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memset(In + tid * 10, 1, 10);
 }

-__global__ void MemSet11(hipLaunchParm lp, uint8_t* In) {
+__global__ void MemSet11(uint8_t* In) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memset(In + tid * 11, 1, 11);
 }

-__global__ void MemSet12(hipLaunchParm lp, uint8_t* In) {
+__global__ void MemSet12(uint8_t* In) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memset(In + tid * 12, 1, 12);
 }
@@ -98,8 +98,8 @@ int main() {
    hipMalloc((void**)&Bd, LEN8);
    hipMalloc((void**)&Cd, LEN8);
    hipMemcpy(Ad, A, LEN8, hipMemcpyHostToDevice);
-    hipLaunchKernel(MemCpy8, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
-    hipLaunchKernel(MemSet8, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
+    hipLaunchKernelGGL(MemCpy8, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(MemSet8, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
    hipMemcpy(B, Bd, LEN8, hipMemcpyDeviceToHost);
    hipMemcpy(C, Cd, LEN8, hipMemcpyDeviceToHost);
    for (uint32_t i = 0; i < LEN8; i++) {
@@ -126,8 +126,8 @@ int main() {
    hipMalloc((void**)&Bd, LEN9);
    hipMalloc((void**)&Cd, LEN9);
    hipMemcpy(Ad, A, LEN9, hipMemcpyHostToDevice);
-    hipLaunchKernel(MemCpy9, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
-    hipLaunchKernel(MemSet9, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
+    hipLaunchKernelGGL(MemCpy9, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(MemSet9, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
    hipMemcpy(B, Bd, LEN9, hipMemcpyDeviceToHost);
    hipMemcpy(C, Cd, LEN9, hipMemcpyDeviceToHost);
    for (uint32_t i = 0; i < LEN9; i++) {
@@ -154,8 +154,8 @@ int main() {
    hipMalloc((void**)&Bd, LEN10);
    hipMalloc((void**)&Cd, LEN10);
    hipMemcpy(Ad, A, LEN10, hipMemcpyHostToDevice);
-    hipLaunchKernel(MemCpy10, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
-    hipLaunchKernel(MemSet10, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
+    hipLaunchKernelGGL(MemCpy10, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(MemSet10, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
    hipMemcpy(B, Bd, LEN10, hipMemcpyDeviceToHost);
    hipMemcpy(C, Cd, LEN10, hipMemcpyDeviceToHost);
    for (uint32_t i = 0; i < LEN10; i++) {
@@ -182,8 +182,8 @@ int main() {
    hipMalloc((void**)&Bd, LEN11);
    hipMalloc((void**)&Cd, LEN11);
    hipMemcpy(Ad, A, LEN11, hipMemcpyHostToDevice);
-    hipLaunchKernel(MemCpy11, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
-    hipLaunchKernel(MemSet11, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
+    hipLaunchKernelGGL(MemCpy11, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(MemSet11, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
    hipMemcpy(B, Bd, LEN11, hipMemcpyDeviceToHost);
    hipMemcpy(C, Cd, LEN11, hipMemcpyDeviceToHost);
    for (uint32_t i = 0; i < LEN11; i++) {
@@ -210,8 +210,8 @@ int main() {
    hipMalloc((void**)&Bd, LEN12);
    hipMalloc((void**)&Cd, LEN12);
    hipMemcpy(Ad, A, LEN12, hipMemcpyHostToDevice);
-    hipLaunchKernel(MemCpy12, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
-    hipLaunchKernel(MemSet12, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
+    hipLaunchKernelGGL(MemCpy12, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(MemSet12, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
    hipMemcpy(B, Bd, LEN12, hipMemcpyDeviceToHost);
    hipMemcpy(C, Cd, LEN12, hipMemcpyDeviceToHost);
    for (uint32_t i = 0; i < LEN12; i++) {
@@ -33,7 +33,7 @@ OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWA

 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void vadd_asm(hipLaunchParm lp, float* out, float* in) {
+__global__ void vadd_asm(float* out, float* in) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;

 #ifdef __HIP_PLATFORM_NVCC__
@@ -82,7 +82,7 @@ int main() {
    hipMemcpy(gpuResultVector, VectorB, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(vadd_asm, dim3(NUM / THREADS_PER_BLOCK_X), dim3(THREADS_PER_BLOCK_X), 0, 0,
+    hipLaunchKernelGGL(vadd_asm, dim3(NUM / THREADS_PER_BLOCK_X), dim3(THREADS_PER_BLOCK_X), 0, 0,
                    gpuResultVector, gpuVector);

    // Memory transfer from device to host
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #define _SIZE sizeof(int) * 1024 * 1024
 #define NUM_STREAMS 2

-__global__ void Iter(hipLaunchParm lp, int* Ad, int num) {
+__global__ void Iter(int* Ad, int num) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    // Kernel loop designed to execute very slowly... ... ...   so we can test timing-related
    // behavior below
@@ -58,7 +58,7 @@ int main() {
        HIPCHECK(hipMemcpyAsync(Ad[i], A[i], _SIZE, hipMemcpyHostToDevice, stream[i]));
    }
    for (int i = 0; i < NUM_STREAMS; i++) {
-        hipLaunchKernel(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, stream[i], Ad[i], 1 << 30);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, stream[i], Ad[i], 1 << 30);
    }
    for (int i = 0; i < NUM_STREAMS; i++) {
        HIPCHECK(hipMemcpyAsync(A[i], Ad[i], _SIZE, hipMemcpyDeviceToHost, stream[i]));
@@ -66,7 +66,7 @@ int main(int argc, char* argv[]) {
        // Record the start event
        HIPCHECK(hipEventRecord(start, NULL));

-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const float*>(A_d), static_cast<const float*>(B_d), C_d, N);


@@ -67,7 +67,7 @@ void memcpy2Dtest(size_t numW, size_t numH, bool usePinnedHost) {
    HIPCHECK(hipMemcpy2D(A_d, pitch_A, A_h, width, width, numH, hipMemcpyHostToDevice));
    HIPCHECK(hipMemcpy2D(B_d, pitch_B, B_h, width, width, numH, hipMemcpyHostToDevice));

-    hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d,
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d,
                    (pitch_C / sizeof(T)) * numH);

    HIPCHECK(hipMemcpy2D(C_h, width, C_d, pitch_C, width, numH, hipMemcpyDeviceToHost));
@@ -117,7 +117,7 @@ void memcpyArraytest(size_t numW, size_t numH, bool usePinnedHost, bool usePitch
        HIPCHECK(hipMemcpyToArray(A_d, 0, 0, (void*)A_h, width, hipMemcpyHostToDevice));
        HIPCHECK(hipMemcpyToArray(B_d, 0, 0, (void*)B_h, width, hipMemcpyHostToDevice));

-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        (T*)A_d->data, (T*)B_d->data, (T*)C_d->data, numW);

        HIPCHECK(hipMemcpy(C_h, C_d->data, width, hipMemcpyDeviceToHost));
@@ -156,7 +156,7 @@ void memcpyArraytest(size_t numW, size_t numH, bool usePinnedHost, bool usePitch
                                        hipMemcpyHostToDevice));
        }

-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        (T*)A_d->data, (T*)B_d->data, (T*)C_d->data, numW * numH);

        HIPCHECK(hipMemcpy2D((void*)C_h, width, (void*)C_d->data, width, width, numH,
@@ -32,7 +32,7 @@ THE SOFTWARE.
 #define LEN 1024 * 1024
 #define SIZE LEN * sizeof(float)

-__global__ void Add(hipLaunchParm lp, float* Ad, float* Bd, float* Cd) {
+__global__ void Add(float* Ad, float* Bd, float* Cd) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    Cd[tx] = Ad[tx] + Bd[tx];
 }
@@ -74,7 +74,7 @@ int main() {
    dim3 dimGrid(LEN / 512, 1, 1);
    dim3 dimBlock(512, 1, 1);

-    hipLaunchKernel(HIP_KERNEL_NAME(Add), dimGrid, dimBlock, 0, 0, Ad, Bd, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Add), dimGrid, dimBlock, 0, 0, Ad, Bd, Cd);

    HIPCHECK(
        hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost));  // Note this really HostToHost not
@@ -28,7 +28,7 @@ THE SOFTWARE.
 #include "test_common.h"
 #include <malloc.h>

-__global__ void Inc(hipLaunchParm lp, float* Ad) {
+__global__ void Inc(float* Ad) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    Ad[tx] = Ad[tx] + float(1);
 }
@@ -99,7 +99,7 @@ int main(int argc, char* argv[]) {
        // Reference the registered device pointer Ad from inside the kernel:
        for (int i = 0; i < num_devices; i++) {
            HIPCHECK(hipSetDevice(i));
-            hipLaunchKernel(Inc, dim3(N / 512), dim3(512), 0, 0, Ad[i]);
+            hipLaunchKernelGGL(Inc, dim3(N / 512), dim3(512), 0, 0, Ad[i]);

            HIPCHECK(hipDeviceSynchronize());
        }
@@ -230,7 +230,7 @@ void memcpytest2(DeviceMemory<T>* dmem, HostMemory<T>* hmem, size_t numElements,
                           useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
    }

-    hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                    static_cast<const T*>(dmem->A_d()), static_cast<const T*>(dmem->B_d()),
                    dmem->C_d(), numElements);

@@ -51,7 +51,7 @@ int main() {
        HIPCHECK(hipSetDevice(0));
        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
        HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
        HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
@@ -62,7 +62,7 @@ int main() {
        HIPCHECK(hipMemcpyDtoD((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes));
        HIPCHECK(hipMemcpyDtoD((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes));

-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
        HIPCHECK(hipMemcpyDtoH(C_h, (hipDeviceptr_t)Z_d, Nbytes));
        HIPCHECK(hipDeviceSynchronize());
@@ -52,7 +52,7 @@ int main() {
        HIPCHECK(hipSetDevice(0));
        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
        HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
        HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
@@ -63,7 +63,7 @@ int main() {
        HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes, s));
        HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes, s));

-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
        HIPCHECK(hipMemcpyDtoHAsync(C_h, (hipDeviceptr_t)Z_d, Nbytes, s));
        HIPCHECK(hipStreamSynchronize(s));
@@ -50,7 +50,7 @@ int main() {
        HIPCHECK(hipSetDevice(0));
        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
        HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
        HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
@@ -62,7 +62,7 @@ int main() {
            Nbytes);  // this call is eqv to hipMemcpy(hipMemcpyD2D) which goes via stg bufs.
        hipMemcpyPeer(Y_d, 1, B_d, 0, Nbytes);

-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
        HIPCHECK(hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
@@ -54,7 +54,7 @@ int main() {
        HIPCHECK(hipSetDevice(0));
        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
        HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
        HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
@@ -65,7 +65,7 @@ int main() {
        HIPCHECK(hipMemcpyPeerAsync(X_d, 1, A_d, 0, Nbytes, s));
        HIPCHECK(hipMemcpyPeerAsync(Y_d, 1, B_d, 0, Nbytes, s));

-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
        HIPCHECK(hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
@@ -61,7 +61,7 @@ void simpleTest1() {
    HIPCHECK(memcopy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
    HIPCHECK(memcopy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));

-    hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                    static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);

    HIPCHECK(memcopy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include <cstdio>
 #include "hip/hip_runtime.h"

-__global__ void Kernel(hipLaunchParm lp, volatile float* hostRes) {
+__global__ void Kernel(volatile float* hostRes) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    hostRes[tid] = tid + 1;
    __threadfence_system();
@@ -45,7 +45,7 @@ int main() {
    hipHostMalloc((void**)&hostRes, blocks * sizeof(float), hipHostMallocMapped);
    hostRes[0] = 0;
    hostRes[1] = 0;
-    hipLaunchKernel(HIP_KERNEL_NAME(Kernel), dim3(1), dim3(blocks), 0, 0, hostRes);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Kernel), dim3(1), dim3(blocks), 0, 0, hostRes);
    int eleCounter = 0;
    while (eleCounter < blocks) {
        // blocks until the value changes
@@ -82,9 +82,9 @@ void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream) {
        // HIPCHECK(hipStreamSynchronize(stream));

        // This is the null stream?
-        // hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d,
+        // hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d,
        // C_d, numElements);
-        hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const T*>(A_d), static_cast<const T*>(B_d), C_d, numElements);

        MemTraits<C>::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream);
@@ -33,7 +33,7 @@ THE SOFTWARE.


 template <typename T>
-__global__ void Inc(hipLaunchParm lp, T* Array) {
+__global__ void Inc(T* Array) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    Array[tx] = Array[tx] + T(1);
 }
@@ -53,7 +53,7 @@ void run1(size_t size, hipStream_t stream) {

    HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream));
    HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream));
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream, Cd);
    HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream));
    HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream));
    HIPCHECK(hipDeviceSynchronize());
@@ -80,8 +80,8 @@ void run(size_t size, hipStream_t stream1, hipStream_t stream2) {
    HIPCHECK(hipMemcpyAsync(Bhh, Ahh, size, hipMemcpyHostToHost, stream2));
    HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream1));
    HIPCHECK(hipMemcpyAsync(Cdd, Bhh, size, hipMemcpyHostToDevice, stream2));
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream1, Cd);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream2, Cdd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream1, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream2, Cdd);
    HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream1));
    HIPCHECK(hipMemcpyAsync(Ddd, Cdd, size, hipMemcpyDeviceToDevice, stream2));
    HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream1));
@@ -28,7 +28,7 @@ THE SOFTWARE.

 const int NN = 1 << 21;

-__global__ void kernel(hipLaunchParm lp, float* x, float* y, int n) {
+__global__ void kernel(float* x, float* y, int n) {
    int tid = threadIdx.x;
    if (tid < 1) {
        for (int i = 0; i < n; i++) {
@@ -38,7 +38,7 @@ __global__ void kernel(hipLaunchParm lp, float* x, float* y, int n) {
    }
 }

-__global__ void nKernel(hipLaunchParm lp, float* y) {
+__global__ void nKernel(float* y) {
    int tid = threadIdx.x;
    y[tid] = y[tid] + 1.0f;
 }
@@ -55,8 +55,8 @@ int main() {
    for (int i = 0; i < num_streams; i++) {
        HIPCHECK(hipStreamCreate(&streams[i]));
        HIPCHECK(hipMalloc(&data[i], NN * sizeof(float)));
-        hipLaunchKernel(HIP_KERNEL_NAME(kernel), dim3(1), dim3(1), 0, streams[i], data[i], xd, N);
-        hipLaunchKernel(HIP_KERNEL_NAME(nKernel), dim3(1), dim3(1), 0, 0, yd);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(1), dim3(1), 0, streams[i], data[i], xd, N);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(nKernel), dim3(1), dim3(1), 0, 0, yd);
    }

    HIPCHECK(hipMemcpy(&x, xd, sizeof(float), hipMemcpyDeviceToHost));
@@ -30,7 +30,7 @@ THE SOFTWARE.

 const int NN = 1 << 21;

-__global__ void kernel(hipLaunchParm lp, float* x, float* y, int n) {
+__global__ void kernel(float* x, float* y, int n) {
    int tid = threadIdx.x;
    if (tid < 1) {
        for (int i = 0; i < n; i++) {
@@ -40,7 +40,7 @@ __global__ void kernel(hipLaunchParm lp, float* x, float* y, int n) {
    }
 }

-__global__ void nKernel(hipLaunchParm lp, float* y) {
+__global__ void nKernel(float* y) {
    int tid = threadIdx.x;
    y[tid] = y[tid] + 1.0f;
 }
@@ -57,8 +57,8 @@ int main() {
    for (int i = 0; i < num_streams; i++) {
        HIPCHECK(hipStreamCreate(&streams[i]));
        HIPCHECK(hipMalloc(&data[i], NN * sizeof(float)));
-        hipLaunchKernel(HIP_KERNEL_NAME(kernel), dim3(1), dim3(1), 0, streams[i], data[i], xd, N);
-        hipLaunchKernel(HIP_KERNEL_NAME(nKernel), dim3(1), dim3(1), 0, 0, yd);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(1), dim3(1), 0, streams[i], data[i], xd, N);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(nKernel), dim3(1), dim3(1), 0, 0, yd);
    }

    HIPCHECK(hipMemcpy(&x, xd, sizeof(float), hipMemcpyDeviceToHost));
@@ -37,7 +37,7 @@ int p_db = 0;
 using namespace std;

 template <typename T>
-__global__ void vectorADDRepeat(hipLaunchParm lp, const T* A_d, const T* B_d, T* C_d, size_t NELEM,
+__global__ void vectorADDRepeat(const T* A_d, const T* B_d, T* C_d, size_t NELEM,
                                int repeat) {
    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
    size_t stride = blockDim.x * gridDim.x;
@@ -117,7 +117,7 @@ void Streamer<T>::enqueAsync() {
    printf("testing: %s  numElements=%zu size=%6.2fMB\n", __func__, _numElements,
           _numElements * sizeof(T) / 1024.0 / 1024.0);
    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
-    hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream,
+    hipLaunchKernelGGL(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream,
                    static_cast<const T*>(_A_d), static_cast<const T*>(_B_d), _C_d, _numElements,
                    p_repeat);
 }
@@ -210,7 +210,7 @@ int main(int argc, char* argv[]) {

            // Dispatch to NULL stream, should wait for prior async activity to complete before
            // beginning:
-            hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0,
+            hipLaunchKernelGGL(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0,
                            0 /*nullstream*/, static_cast<const int*>(lastStreamer->_C_d),
                            static_cast<const int*>(lastStreamer->_C_d), nullStreamer->_C_d,
                            numElements, 1 /*repeat*/);
@@ -246,7 +246,7 @@ int main(int argc, char* argv[]) {

            // Dispatch to NULL stream, should wait for prior async activity to complete before
            // beginning:
-            hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0,
+            hipLaunchKernelGGL(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0,
                            0 /*nullstream*/, static_cast<const int*>(lastStreamer->_C_d),
                            static_cast<const int*>(lastStreamer->_C_d), nullStreamer->_C_d,
                            numElements, 1 /*repeat*/);
@@ -72,7 +72,7 @@ void D2H(T* Dst, T* Src, size_t size) {
 }

 template <typename T>
-__global__ void Inc(hipLaunchParm lp, T* In) {
+__global__ void Inc(T* In) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    In[tx] = In[tx] + 1;
 }
--- a/Show More
+++ b/Show More