diff --git a/projects/hip/README.md b/projects/hip/README.md
index f61c3b106a..d04d63714f 100644
--- a/projects/hip/README.md
+++ b/projects/hip/README.md
@@ -32,6 +32,8 @@ HIP releases are typically of two types. The tag naming convention is different
 - [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP)
 - [HIP Porting Guide](docs/markdown/hip_porting_guide.md)
 - [HIP Porting Driver Guide](docs/markdown/hip_porting_driver_api.md)
+- [HIP Profiling ](docs/markdown/hip_profiling.md)
+- [HIP Debugging](docs/markdown/hip_debugging.md)
 - [HIP Terminology](docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL)
 - [hipify-clang](hipify-clang/README.md)
 - [Developer/CONTRIBUTING Info](CONTRIBUTING.md)
diff --git a/projects/hip/bin/findcode.sh b/projects/hip/bin/findcode.sh
index a2334b3e2d..d092d6bf8d 100755
--- a/projects/hip/bin/findcode.sh
+++ b/projects/hip/bin/findcode.sh
@@ -2,4 +2,4 @@
 
 SEARCH_DIRS=$@
 
-find $SEARCH_DIRS -name '*.cpp' -o -name '*.h' -o -name '*.cu' -o -name '*.cuh' -o -name '*.c' -o -name '*.hpp'
+find $SEARCH_DIRS -name '*.cpp' -o -name '*.h' -o -name '*.cu' -o -name '*.cuh' -o -name '*.c' -o -name '*.hpp' -o -name '*.inl'
diff --git a/projects/hip/bin/hipconvertinplace-perl.sh b/projects/hip/bin/hipconvertinplace-perl.sh
index a8c8d6d9e8..d500cc14c6 100755
--- a/projects/hip/bin/hipconvertinplace-perl.sh
+++ b/projects/hip/bin/hipconvertinplace-perl.sh
@@ -1,18 +1,18 @@
 #!/bin/bash
 
-#usage : hipconvertinplace.sh [DIRNAME] [HIPIFY_OPTIONS]
+#usage : hipconvertinplace-perl.sh DIRNAME [hipify-perl options]
 
-#hipify "inplace" all code files in specified directory.    
+#hipify "inplace" all code files in specified directory.
 # This can be quite handy when dealing with an existing CUDA code base since the script
 # preserves the existing directory structure.
 
 #  For each code file, this script will:
-#   - If ".prehip file does not exist, copy the original code to a new file with extension ".prehip".  Then Hipify the code file.
+#   - If ".prehip file does not exist, copy the original code to a new file with extension ".prehip". Then hipify the code file.
 #   - If ".prehip" file exists, this is used as input to hipify.
-# (this is useful for testing improvements to the hipify toolset).
+# (this is useful for testing improvements to the hipify-perl toolset).
 
 
 SCRIPT_DIR=`dirname $0`
 SEARCH_DIR=$1
 shift
-$SCRIPT_DIR/hipify -inplace -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR`
+$SCRIPT_DIR/hipify-perl -inplace -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR`
diff --git a/projects/hip/bin/hipexamine-perl.sh b/projects/hip/bin/hipexamine-perl.sh
index 40c1bf466d..4e3a261aa4 100755
--- a/projects/hip/bin/hipexamine-perl.sh
+++ b/projects/hip/bin/hipexamine-perl.sh
@@ -1,12 +1,12 @@
 #!/bin/bash
 
-#usage : hipexamine.sh DIRNAME [hipify.pl options]
+#usage : hipexamine-perl.sh DIRNAME [hipify-perl options]
 
-# Generate HIP stats (LOC, CUDA->API conversions, missing functionality) for all the code files 
+# Generate HIP stats (LOC, CUDA->API conversions, missing functionality) for all the code files
 # in the specified directory.
 
 
 SCRIPT_DIR=`dirname $0`
 SEARCH_DIR=$1
 shift
-$SCRIPT_DIR/hipify -no-output -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR`
+$SCRIPT_DIR/hipify-perl -no-output -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR`
diff --git a/projects/hip/bin/hipexamine.sh b/projects/hip/bin/hipexamine.sh
index 2a6fab7110..79e1b469f9 100755
--- a/projects/hip/bin/hipexamine.sh
+++ b/projects/hip/bin/hipexamine.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-#usage : hipexamine2.sh DIRNAME [hipify options] [--] [clang options]
+#usage : hipexamine.sh DIRNAME [hipify options] [--] [clang options]
 
 # Generate CUDA->HIP conversion statistics for all the code files in the specified directory.
 
diff --git a/projects/hip/bin/hipify b/projects/hip/bin/hipify-perl
similarity index 99%
rename from projects/hip/bin/hipify
rename to projects/hip/bin/hipify-perl
index 4d77fad3ed..27acc5bccc 100755
--- a/projects/hip/bin/hipify
+++ b/projects/hip/bin/hipify-perl
@@ -20,7 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 ##
-#usage hipify [OPTIONS] INPUT_FILE
+#usage hipify-perl [OPTIONS] INPUT_FILE
 use Getopt::Long;
 
 my $warn_whitelist ="";
@@ -201,7 +201,7 @@ while (@ARGV) {
     my %ft;
     clearStats(\%ft, \@statNames);
     my $countIncludes = 0;
-    my $countKeywords = 0; # keywords like __global__, __shared__ - not converted by hipify but counted here.
+    my $countKeywords = 0; # keywords like __global__, __shared__ - not converted by hipify-perl, but counted here.
     my $warnings = 0;
     my $warningsCublas = 0;
     my $warningsCurand = 0;
diff --git a/projects/hip/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/projects/hip/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md
index 3434d29a70..ad9d791a6d 100644
--- a/projects/hip/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md
+++ b/projects/hip/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md
@@ -323,6 +323,68 @@
 |          500 |*`CUDA_ERROR_NOT_FOUND`*                                       |*`hipErrorNotFound`*                                        | This indicates that a named symbol was not found. Examples of symbols are global/constant variable names, texture names, and surface names. |
 |          600 |*`CUDA_ERROR_NOT_READY`*                                       |*`hipErrorNotReady`*                                        | This indicates that asynchronous operations issued previously have not completed yet. This result is not actually an error, but must be indicated differently than CUDA_SUCCESS (which indicates completion). Calls that may return this value include cuEventQuery() and cuStreamQuery(). |
 |          700 |*`CUDA_ERROR_ILLEGAL_ADDRESS`*                                 |*`hipErrorIllegalAddress`*                                  | While executing a kernel, the device encountered a load or store instruction on an invalid memory address. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. |
+|          701 |*`CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES`*                         |*`hipErrorLaunchOutOfResources`*                            | This indicates that a launch did not occur because it did not have appropriate resources. This error usually indicates that the user has attempted to pass too many arguments to the device kernel, or the kernel launch specifies too many threads for the kernel's register count. Passing arguments of the wrong size (i.e. a 64-bit pointer when a 32-bit int is expected) is equivalent to passing too many arguments and can also result in this error. |
+|          702 |*`CUDA_ERROR_LAUNCH_TIMEOUT`*                                  |*`hipErrorLaunchTimeOut`*                                   | This indicates that the device kernel took too long to execute. This can only occur if timeouts are enabled - see the device attribute CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The context cannot be used (and must be destroyed similar to CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. |
+|          703 |*`CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING`*                   |                                                            | This error indicates a kernel launch that uses an incompatible texturing mode.                                                 |
+|          704 |*`CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED`*                     |*`hipErrorPeerAccessAlreadyEnabled`*                        | This error indicates that a call to cuCtxEnablePeerAccess() is trying to re-enable peer access to a context which has already had peer access to it enabled. |
+|          705 |*`CUDA_ERROR_PEER_ACCESS_NOT_ENABLED`*                         |*`hipErrorPeerAccessNotEnabled`*                            | This error indicates that cuCtxDisablePeerAccess() is trying to disable peer access which has not been enabled yet via cuCtxEnablePeerAccess(). |
+|          708 |*`CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE`*                          |                                                            | This error indicates that cuCtxDisablePeerAccess() is trying to disable peer access which has not been enabled yet via cuCtxEnablePeerAccess(). |
+|          709 |*`CUDA_ERROR_CONTEXT_IS_DESTROYED`*                            |                                                            | This error indicates that the context current to the calling thread has been destroyed using cuCtxDestroy, or is a primary context which has not yet been initialized. |
+|          710 |*`CUDA_ERROR_ASSERT`*                                          |                                                            | A device-side assert triggered during kernel execution. The context cannot be used anymore, and must be destroyed. All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. |
+|          711 |*`CUDA_ERROR_TOO_MANY_PEERS`*                                  |                                                            | This error indicates that the hardware resources required to enable peer access have been exhausted for one or more of the devices passed to cuCtxEnablePeerAccess(). |
+|          712 |*`CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED`*                  |*`hipErrorHostMemoryAlreadyRegistered`*                     | This error indicates that the memory range passed to cuMemHostRegister() has already been registered.                          |
+|          713 |*`CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED`*                      |*`hipErrorHostMemoryNotRegistered`*                         | This error indicates that the pointer passed to cuMemHostUnregister() does not correspond to any currently registered memory region. |
+|          714 |*`CUDA_ERROR_HARDWARE_STACK_ERROR`*                            |                                                            | While executing a kernel, the device encountered a stack error. This can be due to stack corruption or exceeding the stack size limit. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. |
+|          715 |*`CUDA_ERROR_ILLEGAL_INSTRUCTION`*                             |                                                            | While executing a kernel, the device encountered an illegal instruction. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. |
+|          716 |*`CUDA_ERROR_MISALIGNED_ADDRESS`*                              |                                                            | While executing a kernel, the device encountered a load or store instruction on a memory address which is not aligned. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. |
+|          717 |*`CUDA_ERROR_INVALID_ADDRESS_SPACE`*                           |                                                            | While executing a kernel, the device encountered an instruction which can only operate on memory locations in certain address spaces (global, shared, or local), but was supplied a memory address not belonging to an allowed address space. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. |
+|          718 |*`CUDA_ERROR_INVALID_PC`*                                      |                                                            | While executing a kernel, the device program counter wrapped its address space. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. |
+|          719 |*`CUDA_ERROR_LAUNCH_FAILED`*                                   |                                                            | An exception occurred on the device while executing a kernel. Common causes include dereferencing an invalid device pointer and accessing out of bounds shared memory. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. |
+|          800 |*`CUDA_ERROR_NOT_PERMITTED`*                                   |                                                            | This error indicates that the attempted operation is not permitted.                                                            |
+|          801 |*`CUDA_ERROR_NOT_SUPPORTED`*                                   |                                                            | This error indicates that the attempted operation is not supported on the current system or device.                            |
+|          999 |*`CUDA_ERROR_UNKNOWN`*                                         |                                                            | This indicates that an unknown internal error has occurred.                                                                    |
+| enum         |***`CUstream_flags`***                                         |***`hipStreamFlags`***                                      | Stream creation flags                                                                                                          |
+|          0x0 |*`CU_STREAM_DEFAULT`*                                          |*`hipStreamDefault`*                                        | Default stream flag                                                                                                            |
+|          0x1 |*`CU_STREAM_NON_BLOCKING`*                                     |*`hipStreamNonBlocking`*                                    | Stream does not synchronize with stream 0 (the NULL stream)                                                                    |
+| typedef      | `CUarray`                                                     | `hipArray *`                                               | CUDA array                                                                                                                     |
+| struct       | `CUarray_st`                                                  | `hipArray`                                                 | CUDA array                                                                                                                     |
+| typedef      | `CUcontext`                                                   | `hipCtx_t`                                                 | CUDA context                                                                                                                   |
+| typedef      | `CUdevice`                                                    | `hipDevice_t`                                              | CUDA device                                                                                                                    |
+| typedef      | `CUdeviceptr`                                                 | `hipDeviceptr_t`                                           | CUDA device pointer CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform. |
+| typedef      | `CUevent`                                                     | `hipEvent_t`                                               | CUDA event                                                                                                                     |
+| typedef      | `CUfunction`                                                  | `hipFunction_t`                                            | CUDA function                                                                                                                  |
+| typedef      | `CUgraphicsResource`                                          |                                                            | CUDA graphics interop resource                                                                                                 |
+| typedef      | `CUmipmappedArray`                                            |                                                            | CUDA mipmapped array                                                                                                           |
+| typedef      | `CUmodule`                                                    | `hipModule_t`                                              | CUDA module                                                                                                                    |
+| typedef      | `CUstream`                                                    | `hipStream_t`                                              | CUDA module                                                                                                                    |
+| typedef      | `CUstreamCallback`                                            | `hipStreamCallback_t`                                      | CUDA stream callback                                                                                                           |
+| typedef      | `CUsurfObject`                                                |                                                            | An opaque value that represents a CUDA surface object                                                                          |
+| typedef      | `CUsurfref`                                                   |                                                            | CUDA surface reference                                                                                                         |
+| typedef      | `CUtexObject`                                                 |                                                            | An opaque value that represents a CUDA texture object                                                                          |
+| typedef      | `CUtexref`                                                    |                                                            | CUDA texture reference                                                                                                         |
+| define       |`CU_IPC_HANDLE_SIZE`                                           |                                                            | CUDA IPC handle size.                                                                                                          |
+| define       |`CU_LAUNCH_PARAM_BUFFER_POINTER`                               | `HIP_LAUNCH_PARAM_BUFFER_POINTER`                          | Indicator that the next value in the extra parameter to cuLaunchKernel will be a pointer to a buffer containing all kernel parameters used for launching kernel f. This buffer needs to honor all alignment/padding requirements of the individual parameters. If CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the extra array, then CU_LAUNCH_PARAM_BUFFER_POINTER will have no effect. |
+| define       |`CU_LAUNCH_PARAM_BUFFER_SIZE`                                  | `HIP_LAUNCH_PARAM_BUFFER_SIZE`                             | Indicator that the next value in the extra parameter to cuLaunchKernel will be a pointer to a size_t which contains the size of the buffer specified with CU_LAUNCH_PARAM_BUFFER_POINTER. It is required that CU_LAUNCH_PARAM_BUFFER_POINTER also be specified in the extra array if the value associated with CU_LAUNCH_PARAM_BUFFER_SIZE is not zero. |
+| define       |`CU_LAUNCH_PARAM_END`                                          | `HIP_LAUNCH_PARAM_END`                                     | End of array terminator for the extra parameter to cuLaunchKernel.                                                             |
+| define       |`CU_MEMHOSTALLOC_DEVICEMAP`                                    |                                                            | If set, host memory is mapped into CUDA address space and cuMemHostGetDevicePointer() may be called on the host pointer. Flag for cuMemHostAlloc(). |
+| define       |`CU_MEMHOSTALLOC_PORTABLE`                                     |                                                            | If set, host memory is portable between CUDA contexts. Flag for cuMemHostAlloc().                                              |
+| define       |`CU_MEMHOSTALLOC_WRITECOMBINED`                                |                                                            | If set, host memory is allocated as write-combined - fast to write, faster to DMA, slow to read except via SSE4 streaming load instruction (MOVNTDQA). Flag for cuMemHostAlloc(). |
+| define       |`CU_MEMHOSTREGISTER_DEVICEMAP`                                 |                                                            | If set, host memory is mapped into CUDA address space and cuMemHostGetDevicePointer() may be called on the host pointer. Flag for cuMemHostRegister(). |
+| define       |`CU_MEMHOSTREGISTER_IOMEMORY`                                  |                                                            | If set, the passed memory pointer is treated as pointing to some memory-mapped I/O space, e.g. belonging to a third-party PCIe device. On Windows the flag is a no-op. On Linux that memory is marked as non cache-coherent for the GPU and is expected to be physically contiguous. It may return CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions. On all other platforms, it is not supported and CUDA_ERROR_NOT_SUPPORTED is returned. Flag for cuMemHostRegister(). |
+| define       |`CU_MEMHOSTREGISTER_PORTABLE`                                  |                                                            | If set, host memory is portable between CUDA contexts. Flag for cuMemHostRegister().                                           |
+| define       |`CU_PARAM_TR_DEFAULT`                                          |                                                            | For texture references loaded into the module, use default texunit from texture reference.                                     |
+| define       |`CU_STREAM_LEGACY`                                             |                                                            | Legacy stream handle. Stream handle that can be passed as a CUstream to use an implicit stream with legacy synchronization behavior. See details of the synchronization behavior. |
+| define       |`CU_STREAM_PER_THREAD`                                         |                                                            | Per-thread stream handle. Stream handle that can be passed as a CUstream to use an implicit stream with perthread synchronization behavior. See details of the synchronization behavior. |
+| define       |`CU_TRSA_OVERRIDE_FORMAT`                                      |                                                            | Override the texref format with a format inferred from the array. Flag for cuTexRefSetArray().                                 |
+| define       |`CU_TRSF_NORMALIZED_COORDINATES`                               |                                                            | Use normalized texture coordinates in the range [0,1) instead of [0,dim). Flag for cuTexRefSetFlags().                         |
+| define       |`CU_TRSF_SRGB`                                                 |                                                            | Perform sRGB->linear conversion during texture read. Flag for cuTexRefSetFlags().                                              |
+| define       |`CUDA_ARRAY3D_2DARRAY`                                         |                                                            | Deprecated, use CUDA_ARRAY3D_LAYERED.                                                                                          |
+| define       |`CUDA_ARRAY3D_CUBEMAP`                                         |                                                            | If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The width of such a CUDA array must be equal to its height, and Depth must be six. If CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps and Depth must be a multiple of six. |
+| define       |`CUDA_ARRAY3D_DEPTH_TEXTURE`                                   |                                                            | This flag if set indicates that the CUDA array is a DEPTH_TEXTURE.                                                             |
+| define       |`CUDA_ARRAY3D_LAYERED`                                         |                                                            | If set, the CUDA array is a collection of layers, where each layer is either a 1D or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number of layers, not the depth of a 3D array. |
+| define       |`CUDA_ARRAY3D_SURFACE_LDST`                                    |                                                            | This flag must be set in order to bind a surface reference to the CUDA array.                                                  |
+| define       |`CUDA_ARRAY3D_TEXTURE_GATHER`                                  |                                                            | This flag must be set in order to perform texture gather operations on a CUDA array.                                           |
+| define       |`CUDA_VERSION`                                                 |                                                            | CUDA API version number.                                                                                                       |
 
 
 ## **2. Error Handling**
diff --git a/projects/hip/docs/markdown/hip_bugs.md b/projects/hip/docs/markdown/hip_bugs.md
index abb31d80e8..91b2a5a019 100644
--- a/projects/hip/docs/markdown/hip_bugs.md
+++ b/projects/hip/docs/markdown/hip_bugs.md
@@ -45,6 +45,8 @@ To correct, add the following flag to hcc or hipcc:
 $ hipcc -Wl,-Bsymbolic ...
 ```
 
+Ensure there is no space in the "Wl,-Bsymbolic" option.
+
 
 ### What is the current limitation of HIP Generic Grid Launch method?
 1. __global__ functions cannot be marked as static or put in an unnamed namespace i.e. they cannot be given internal linkage (this would clash with __attribute__((weak)));
diff --git a/projects/hip/docs/markdown/hip_debugging.md b/projects/hip/docs/markdown/hip_debugging.md
new file mode 100644
index 0000000000..e7e058d17a
--- /dev/null
+++ b/projects/hip/docs/markdown/hip_debugging.md
@@ -0,0 +1,168 @@
+Table of Contents
+=================
+
+  * [Profiling HIP Code](#profiling-hip-code" aria-hidden="true"><span aria-hidden="true)
+      * [Using HIP_DB](#using-hip_db" aria-hidden="true"><span aria-hidden="true)
+      * [Using ltrace](#using-ltrace" aria-hidden="true"><span aria-hidden="true)
+      * [Chicken bits](#chicken-bits" aria-hidden="true"><span aria-hidden="true)
+      * [Debugging HIP Applications](#debugging-hip-applications" aria-hidden="true"><span aria-hidden="true)
+      * [General Debugging Tips](#general-debugging-tips" aria-hidden="true"><span aria-hidden="true)
+        * [Print env var state](#print-env-var-state" aria-hidden="true"><span aria-hidden="true)
+
+### Using HIP_DB
+
+This flag is primarily targeted to assist HIP development team in the development of the HIP runtime, but in some situations may be useful to HIP application developers as well.
+The HIP debug information is designed to print important information during the execution of a HIP API.  HIP provides
+different color-coded levels of debug information:
+  - api  : Print the beginning and end of each HIP API, including the arguments and return codes.  This is equivalent to setting HIP_TRACE_API=1.
+  - sync : Print multi-thread and other synchronization debug information.
+  - copy : Print which engine is doing the copy, which copy flavor is selected, information on source and destination memory.
+  - mem  : Print information about memory allocation - which pointers are allocated, where they are allocated, peer mappings, and more.
+
+HIP_DB format is flags separated by '+' sign, or a hex code for the bitmask.  Generally the + format is preferred.  
+For example:
+```
+$ HIP_DB=api+copy+mem  my-application
+$ HIP_DB=0xF  my-application
+```
+
+### Using ltrace
+ltrace is a standard linux tool which provides a message to stderr on every dynamic library call.  Since ROCr and the ROCt (the ROC thunk, which is the thin user-space interface to the ROC kernel driver) are both dynamic libraries, this provides an easy way to trace the activity in these libraries.  Tracing can be a powerful way to quickly observe the flow of the application before diving into the details with a command-line debugger.
+The trace can also show performance issues related to accidental calls to expensive API calls on the critical path.
+
+ltrace can be easily combined with the HIP_DB switches to visualize the runtime behavior of the entire ROCm software stack.  Here's a sample command-line and output:
+
+```
+$ HIP_DB=api ltrace -C -e 'hsa*'   <applicationName> <applicationArguments>
+
+...
+
+<<hip-api tid:1.17 hipMemcpy (0x7f7776d3e010, 0x503d1d000, 4194304, hipMemcpyDeviceToHost)
+libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1804000, 0, 0, 0x400000) = 0
+libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1816000, 0, 0x7f777f85f2a0, 0x400000) = 0
+libmcwamp_hsa.so->hsa_amd_memory_lock(0x7f7776d3e010, 0x400000, 0x1213b70, 1 <unfinished ...>
+libhsa-runtime64.so.1->hsaKmtRegisterMemoryToNodes(0x7f7776d3e010, 0x400000, 1, 0x1220c10) = 0
+libhsa-runtime64.so.1->hsaKmtMapMemoryToGPUNodes(0x7f7776d3e010, 0x400000, 0x7ffc32865400, 64) = 0
+<... hsa_amd_memory_lock resumed> )              = 0
+libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1804000, 1, 0x7f777e95a770, 0x12205b0) = 0
+libmcwamp_hsa.so->hsa_amd_memory_async_copy(0x50411d010, 0x11e70d0, 0x503d1d000, 0x11e70d0) = 0
+libmcwamp_hsa.so->hsa_signal_wait_acquire(0x1804000, 2, 1, -1) = 0
+libmcwamp_hsa.so->hsa_amd_memory_unlock(0x7f7776d3e010, 0x1213c6c, 0x12c3c600000000, 0x1804000 <unfinished ...>
+libhsa-runtime64.so.1->hsaKmtUnmapMemoryToGPU(0x7f7776d3e010, 0x7f7776d3e010, 0x12c3c600000000, 0x1804000) = 0
+libhsa-runtime64.so.1->hsaKmtDeregisterMemory(0x7f7776d3e010, 0x7f7776d3e010, 0x7f777f60f9e8, 0x1220580) = 0
+<... hsa_amd_memory_unlock resumed> )            = 0
+  hip-api tid:1.17 hipMemcpy                      ret= 0 (hipSuccess)>>
+```
+
+Some key information from the trace above.
+  - Thy trace snippet shows the execution of a hipMemcpy API, bracketed by the first and last message in the trace output.  The messages show the thread id and API sequence number (`1.17`).  ltrace output intermixes messages from all threads, so the HIP debug information can be useful to determine which threads are executing.
+  - The code flows through HIP APIs into ROCr (HSA) APIs (hsa*) and into the thunk (hsaKmt*) calls.
+  - The HCC runtime is "libmcwamp_hsa.so" and the HSA/ROCr runtime is "libhsa-runtime64.so".
+  - In this particular case, the memory copy is for unpinned memory, and the selected copy algorithm is to pin the host memory "in-place" before performing the copy.  The signaling APIs and calls to pin ("lock", "register") the memory are readily apparent in the trace output.
+
+
+### Chicken bits
+Chicken bits are environment variables which cause the HIP, HCC, or HSA driver to disable some feature or optimization.
+These are not intended for production but can be useful diagnose synchronization problems in the application (or driver).
+
+Some of the most useful chicken bits are described here. These bits are supported on the ROCm path:
+
+HIP provides 3 environment variables in the HIP_*_BLOCKING family.  These introduce additional synchronization and can be useful to isolate synchronization problems. Specifically, if the code works with this flag set, then it indicates the kernels are executing correctly, and any failures likely are causes by improper or missing synchronization.  These flags will have performance impact and are not intended for production use.
+
+- HIP_LAUNCH_BLOCKING=1 : Waits on the host after each kernel launch.  Equivalent to setting CUDA_LAUNCH_BLOCKING.
+- HIP_LAUNCH_BLOCKING_KERNELS: A comma-separated list of kernel names.  The HIP runtime will wait on the host after one of the named kernels executes.  This provides a more targeted version of HIP_LAUNCH_BLOCKING and may be useful to isolate exactly which kernel needs further analysis if HIP_LAUNCH_BLOCKING=1 improves functionality.  There is no indication if kernel names are spelled incorrectly.  One mechanism to verify that the blocking is working is to run with HIP_DB=api+sync and search for debug messages with "LAUNCH_BLOCKING".
+- HIP_API_BLOCKING : Forces hipMemcpyAsync and hipMemsetAsync to be host-synchronous, meaning they will wait for the requested operation to complete before returning to the caller.
+
+These options cause HCC to serialize.  Useful if you have libraries or code which is calling HCC kernels directly rather than using HIP.  
+- HCC_SERIALZIE_KERNELS : 0x1=pre-serialize before each kernel launch, 0x2=post-serialize after each kernel launch., 0x3= pre- and post- serialize.
+- HCC_SERIALIZE_COPY    : 0x1=pre-serialize before each async copy, 0x2=post-serialize after each async copy., 0x3= pre- and post- serialize.
+
+- HSA_ENABLE_SDMA=0     : Causes host-to-device and device-to-host copies to use compute shader blit kernels rather than the dedicated DMA copy engines.  Compute shader copies have low latency (typically < 5us) and can achieve approximately 80% of the bandwidth of the DMA copy engine.  This flag is useful to isolate issues with the hardware copy engines.
+- HSA_ENABLE_INTERRUPT=0 : Causes completion signals to be detected with memory-based polling rather than interrupts.  Can be useful to diagnose interrupt storm issues in the driver.
+- HSA_DISABLE_CACHE=1  : Disables the GPU L2 data cache.
+
+### Debugging HIP Applications
+
+- The variable "tls_tidInfo" contains the API sequence number (_apiSeqNum)- a monotonically increasing count of the HIP APIs called from this thread.  This can be useful for setting conditional breakpoints.  Also, each new HIP thread is mapped to monotically increasing shortTid ID.  Both of these fields are displayed in the HIP debug info. 
+```
+(gdb) p tls_tidInfo
+$32 = {_shortTid = 1, _apiSeqNum = 803}
+```
+
+- HCC tracks all of the application memory allocations, including those from HIP and HC's "am_alloc".
+If the HCC runtime is built with debug information (HCC_RUNTIME_DEBUG=ON when building HCC), then calling the function 'hc::am_memtracker_print()' will show all memory allocations. 
+An optional argument specifies a void * targetPointer - the print routine will mark the allocation which contains the specified pointer with "-->" in the printed output.
+This example shows a sample GDB session where we print the memory allocated by this process and mark a specified address by using the gdb "call" function..
+The gdb syntax also supports using the variable name (in this case 'dst'):
+```
+(gdb) p dst
+$33 = (void *) 0x5ec7e9000
+(gdb) call hc::am_memtracker_print(dst)
+TargetAddress:0x5ec7e9000
+   0x504cfc000-0x504cfc00f::  allocSeqNum:1 hostPointer:0x504cfc000 devicePointer:0x504cfc000 sizeBytes:16 isInDeviceMem:0 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil)
+...
+-->0x5ec7e9000-0x5f7e28fff::  allocSeqNum:488 hostPointer:(nil) devicePointer:0x5ec7e9000 sizeBytes:191102976 isInDeviceMem:1 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil)
+
+```
+
+To debug an explicit address, cast the address to (void*) :
+```
+(gdb) call hc::am_memtracker_print((void*)0x508c7f000)
+```
+- Debugging GPUVM fault.
+For example:
+```
+Memory access fault by GPU node-1 on address 0x5924000. Reason: Page not present or supervisor privilege.
+
+Program received signal SIGABRT, Aborted.
+[Switching to Thread 0x7fffdffb5700 (LWP 14893)]
+0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
+56      ../nptl/sysdeps/unix/sysv/linux/raise.c: No such file or directory.
+(gdb) bt
+#0  0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
+#1  0x00007ffff205b028 in __GI_abort () at abort.c:89
+#2  0x00007ffff6f960eb in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
+#3  0x00007ffff6f99ea5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
+#4  0x00007ffff6f78107 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
+#5  0x00007ffff744f184 in start_thread (arg=0x7fffdffb5700) at pthread_create.c:312
+#6  0x00007ffff211b37d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
+(gdb) info threads
+  Id   Target Id         Frame
+  4    Thread 0x7fffdd521700 (LWP 14895) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
+  3    Thread 0x7fffddd22700 (LWP 14894) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
+* 2    Thread 0x7fffdffb5700 (LWP 14893) "caffe" 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
+  1    Thread 0x7ffff7fa6ac0 (LWP 14892) "caffe" 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
+(gdb) thread 1
+[Switching to thread 1 (Thread 0x7ffff7fa6ac0 (LWP 14892))]
+#0  0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
+(gdb) bt
+#0  0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
+#1  0x00007ffff6f929ba in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
+#2  0x00007fffe080beca in HSADispatch::waitComplete() () from /opt/rocm/hcc/lib/libmcwamp_hsa.so
+#3  0x00007fffe080415f in HSADispatch::dispatchKernelAsync(Kalmar::HSAQueue*, void const*, int, bool) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so
+#4  0x00007fffe080238e in Kalmar::HSAQueue::dispatch_hsa_kernel(hsa_kernel_dispatch_packet_s const*, void const*, unsigned long, hc::completion_future*) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so
+#5  0x00007ffff7bb7559 in hipModuleLaunchKernel () from /opt/rocm/hip/lib/libhip_hcc.so
+#6  0x00007ffff2e6cd2c in mlopen::HIPOCKernel::run (this=0x7fffffffb5a8, args=0x7fffffffb2a8, size=80) at /root/MIOpen/src/hipoc/hipoc_kernel.cpp:15  
+...
+```
+
+### General Debugging Tips
+- The fault will be caught by the runtime but was actually generated by an asynchronous command running on the GPU.    So, the GDB backtrace will show a path in the runtime, ie inside "GI_Raise" as shown in the example above.
+- To determine the true location of the fault, force the kernels to execute synchronously by seeing the environment variables HCC_SERIALIZE_KERNEL=3 HCC_SERIALIZE_COPY=3.  This will force HCC to wait for the kernel to finish executing before retuning.  If the fault occurs during the execution of a kernel, you can see the code which launched the kernel inside the backtrace.  A bit of guesswork is required to determine which thread is actually causing the issue - typically it will the thread which is waiting inside the libhsa-runtime64.so.
+- VM faults inside kernels can be caused byi:
+   - incorrect code (ie a for loop which extends past array boundaries), i
+   - memory issues  - kernel arguments which are invalid (null pointers, unregistered host pointers, bad pointers).
+   - synchronization issues
+   - compiler issues (incorrect code generation from the compiler)
+   - runtime issues 
+
+-- General debug tips:
+- 'gdb --args' can be used to conviently pass the executable and arguments to gdb.
+- From inside GDB, you can set environment variables "set env".  Note the command does not use an '=' sign:
+```
+(gdb) set env HIP_DB 1
+```
+
+#### Print env var state
+Setting HIP_PRINT_ENV=1 and then running a HIP application will print the HIP environment variables, their current values, and usage info.
+Setting HCC_PRINT_ENV=1 and then running a HCC application will print the HCC environment variables, their current values, and usage info.
diff --git a/projects/hip/docs/markdown/hip_faq.md b/projects/hip/docs/markdown/hip_faq.md
index e316d449ef..07ec5f1d8b 100644
--- a/projects/hip/docs/markdown/hip_faq.md
+++ b/projects/hip/docs/markdown/hip_faq.md
@@ -53,7 +53,7 @@ At a high-level, the following features are not supported:
 - Dynamic parallelism (CUDA 5.0)
 - Managed memory (CUDA 6.5)
 - Graphics interoperability with OpenGL or Direct3D
-- CUDA Driver API (Under Development)
+- CUDA Driver API
 - CUDA IPC Functions (Under Development)
 - CUDA array, mipmappedArray and pitched memory
 - MemcpyToSymbol functions
@@ -102,7 +102,7 @@ However, we can provide a rough summary of the features included in each CUDA SD
     - Per-thread-streams (under development)
     - C++11 (HCC supports all of C++11, all of C++14 and some C++17 features)
 - CUDA 7.5
-    - float16 (under development)
+    - float16
 - CUDA 8.0
     - TBD.
 
diff --git a/projects/hip/docs/markdown/hip_porting_driver_api.md b/projects/hip/docs/markdown/hip_porting_driver_api.md
index dd3b9c3e86..0912e676cc 100644
--- a/projects/hip/docs/markdown/hip_porting_driver_api.md
+++ b/projects/hip/docs/markdown/hip_porting_driver_api.md
@@ -98,48 +98,57 @@ HIP/HCC will push primary context to context stack when it is empty. This can ha
 #### Interoperation between HIP and CUDA Driver
 CUDA applications may want to mix CUDA driver code with HIP code (see example below). This table shows the type equivalence to enable this interaction.
 
-|**HIP Type** |**CU Driver Type**|**CUDA Runtime Type**|
-| ----        | ----             | ----                |
-| hipModule   |  CUmodule        |                     |
-| hipFunction |  CUfunction      |                     |
-| hipCtx_t    |  CUcontext       |                     |
-| hipDevice_t |  CUdevice        |                     |
-| hipStream_t |  CUstream        | cudaStream_t        |
-| hipEvent_t  |  CUevent         | cudaEvent_t         |
-| hipArray    |  CUarray         | cudaArray           |
-
-#### Compilation Flags
-The hipModule interface does not support the `cuModuleLoadDataEx` function, which is used to control PTX compilation options.
-HCC does not use PTX and does not support the same compilation options.
-In fact, HCC code objects always contain fully compiled ISA and do not require additional compilation as part of the load step.
-Code which requires this functionally should use platform-specific coding, calling `cuModuleLoadDataEx` on the NVCC path and `hipModuleLoadData` on the hcc path.
-For example:
+|**HIP Type**   |**CU Driver Type**|**CUDA Runtime Type**|
+| ----          | ----             | ----                |
+| hipModule_t   |  CUmodule        |                     |
+| hipFunction_t |  CUfunction      |                     |
+| hipCtx_t      |  CUcontext       |                     |
+| hipDevice_t   |  CUdevice        |                     |
+| hipStream_t   |  CUstream        | cudaStream_t        |
+| hipEvent_t    |  CUevent         | cudaEvent_t         |
+| hipArray      |  CUarray         | cudaArray           |
 
+#### Compilation Options
+The `hipModule_t` interface does not support `cuModuleLoadDataEx` function, which is used to control PTX compilation options.
+HCC does not use PTX and does not support these compilation options.
+In fact, HCC code objects always contain fully compiled ISA and do not require additional compilation as a part of the load step.
+The corresponding HIP function `hipModuleLoadDataEx` behaves as `hipModuleLoadData` on HCC path (compilation options are not used) and as `cuModuleLoadDataEx` on NVCC path.
+For example (CUDA):
 ```
-hipModule module;
-void *imagePtr = ... ;  // Somehow populate data pointer with code object
+CUmodule module;
+void *imagePtr = ...;  // Somehow populate data pointer with code object
 
-#ifdef __HIP_PLATFORM_NVCC__
-// Use CUDA driver API but write to hipModule since they are same type:
 const int numOptions = 1;
 CUJit_option options[numOptions];
 void * optionValues[numOptions];
 
 options[0] = CU_JIT_MAX_REGISTERS;
-unsigned maxRegs=15;
-optionValues[0] = (void*) (&maxRegs);
+unsigned maxRegs = 15;
+optionValues[0] = (void*)(&maxRegs);
 
 cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues);
 
-#else // __HIP_PLATFORM_HCC__
+CUfunction k;
+cuModuleGetFunction(&k, module, "myKernel");
+```
+HIP:
+```
+hipModule_t module;
+void *imagePtr = ...;  // Somehow populate data pointer with code object
 
-// HCC path does not support or require JIT options, so just load the module.
-hipModuleLoadData(&module, imagePtr);
+const int numOptions = 1;
+hipJitOption options[numOptions];
+void * optionValues[numOptions];
 
-#endif
+options[0] = hipJitOptionMaxRegisters;
+unsigned maxRegs = 15;
+optionValues[0] = (void*)(&maxRegs);
 
-// Back to unified code - both paths above loaded the "module" variable.
-hipFunction k;
+// hipModuleLoadData(module, imagePtr) will be called on HCC path, JIT options will not be used, and
+// cupModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues) will be called on NVCC path
+hipModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues);
+
+hipFunction_t k;
 hipModuleGetFunction(&k, module, "myKernel");
 ```
 
diff --git a/projects/hip/docs/markdown/hip_porting_guide.md b/projects/hip/docs/markdown/hip_porting_guide.md
index 72f6384f6d..84887fd512 100644
--- a/projects/hip/docs/markdown/hip_porting_guide.md
+++ b/projects/hip/docs/markdown/hip_porting_guide.md
@@ -569,7 +569,6 @@ HIP_TRACE_API                  =  0 : Trace each HIP API call.  Print function n
 HIP_TRACE_API_COLOR            = green : Color to use for HIP_API.  None/Red/Green/Yellow/Blue/Magenta/Cyan/White
 HIP_PROFILE_API                 =  0 : Add HIP function begin/end to ATP file generated with CodeXL
 HIP_VISIBLE_DEVICES            =  0 : Only devices whose index is present in the secquence are visible to HIP applications and they are enumerated in the order of secquence
-HIP_NUM_KERNELS_INFLIGHT       = 128 : Number of kernels per stream
 
 ```
 
diff --git a/projects/hip/docs/markdown/hip_profiling.md b/projects/hip/docs/markdown/hip_profiling.md
index 6e5cde700d..ef349ef2a5 100644
--- a/projects/hip/docs/markdown/hip_profiling.md
+++ b/projects/hip/docs/markdown/hip_profiling.md
@@ -1,4 +1,4 @@
-# Profiling and Debugging HIP Code
+# Profiling HIP Code
 
 This section describes the profiling and debugging capabilities that HIP provides.  
 Profiling information can viewed in the CodeXL visualization tool or printed directly to stderr as the application runs.
@@ -267,6 +267,11 @@ info: check result
 PASSED!
 ```
 
+HIP_TRACE_API supports multiple levels of debug information:
+   - 0x1 = print all HIP APIs
+   - 0x2 = print HIP APIs which initiate GPU kernels, copies, or memsets.  Includes hipLaunchKernel, hipMemcpy*, hipMemset*.
+   - 0x4 = print HIP APIs which allocate or free memory.  Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree.
+
 
 #### Color
 Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors.
@@ -275,161 +280,3 @@ None will disable use of color control codes for both the opening and closing an
 
 
 
-### Using HIP_DB
-
-This flag is primarily targeted to assist HIP development team in the development of the HIP runtime, but in some situations may be useful to HIP application developers as well.
-The HIP debug information is designed to print important information during the execution of a HIP API.  HIP provides
-different color-coded levels of debug information:
-  - api  : Print the beginning and end of each HIP API, including the arguments and return codes.  This is equivalent to setting HIP_TRACE_API=1.
-  - sync : Print multi-thread and other synchronization debug information.
-  - copy : Print which engine is doing the copy, which copy flavor is selected, information on source and destination memory.
-  - mem  : Print information about memory allocation - which pointers are allocated, where they are allocated, peer mappings, and more.
-
-DB_MEM format is flags separated by '+' sign, or a hex code for the bitmask.  Generally the + format is preferred.  
-For example:
-```
-$ HIP_DB=api+copy+mem  my-application
-$ HIP_DB=0xF  my-application
-```
-
-### Using ltrace
-ltrace is a standard linux tool which provides a message to stderr on every dynamic library call.  Since ROCr and the ROCt (the ROC thunk, which is the thin user-space interface to the ROC kernel driver) are both dynamic libraries, this provides an easy way to trace the activity in these libraries.  Tracing can be a powerful way to quickly observe the flow of the application before diving into the details with a command-line debugger.
-The trace can also show performance issues related to accidental calls to expensive API calls on the critical path.
-
-ltrace can be easily combined with the HIP_DB switches to visualize the runtime behavior of the entire ROCm software stack.  Here's a sample command-line and output:
-
-```
-$ HIP_DB=api ltrace -C -e 'hsa*'   <applicationName> <applicationArguments>
-
-...
-
-<<hip-api tid:1.17 hipMemcpy (0x7f7776d3e010, 0x503d1d000, 4194304, hipMemcpyDeviceToHost)
-libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1804000, 0, 0, 0x400000) = 0
-libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1816000, 0, 0x7f777f85f2a0, 0x400000) = 0
-libmcwamp_hsa.so->hsa_amd_memory_lock(0x7f7776d3e010, 0x400000, 0x1213b70, 1 <unfinished ...>
-libhsa-runtime64.so.1->hsaKmtRegisterMemoryToNodes(0x7f7776d3e010, 0x400000, 1, 0x1220c10) = 0
-libhsa-runtime64.so.1->hsaKmtMapMemoryToGPUNodes(0x7f7776d3e010, 0x400000, 0x7ffc32865400, 64) = 0
-<... hsa_amd_memory_lock resumed> )              = 0
-libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1804000, 1, 0x7f777e95a770, 0x12205b0) = 0
-libmcwamp_hsa.so->hsa_amd_memory_async_copy(0x50411d010, 0x11e70d0, 0x503d1d000, 0x11e70d0) = 0
-libmcwamp_hsa.so->hsa_signal_wait_acquire(0x1804000, 2, 1, -1) = 0
-libmcwamp_hsa.so->hsa_amd_memory_unlock(0x7f7776d3e010, 0x1213c6c, 0x12c3c600000000, 0x1804000 <unfinished ...>
-libhsa-runtime64.so.1->hsaKmtUnmapMemoryToGPU(0x7f7776d3e010, 0x7f7776d3e010, 0x12c3c600000000, 0x1804000) = 0
-libhsa-runtime64.so.1->hsaKmtDeregisterMemory(0x7f7776d3e010, 0x7f7776d3e010, 0x7f777f60f9e8, 0x1220580) = 0
-<... hsa_amd_memory_unlock resumed> )            = 0
-  hip-api tid:1.17 hipMemcpy                      ret= 0 (hipSuccess)>>
-```
-
-Some key information from the trace above.
-  - Thy trace snippet shows the execution of a hipMemcpy API, bracketed by the first and last message in the trace output.  The messages show the thread id and API sequence number (`1.17`).  ltrace output intermixes messages from all threads, so the HIP debug information can be useful to determine which threads are executing.
-  - The code flows through HIP APIs into ROCr (HSA) APIs (hsa*) and into the thunk (hsaKmt*) calls.
-  - The HCC runtime is "libmcwamp_hsa.so" and the HSA/ROCr runtime is "libhsa-runtime64.so".
-  - In this particular case, the memory copy is for unpinned memory, and the selected copy algorithm is to pin the host memory "in-place" before performing the copy.  The signaling APIs and calls to pin ("lock", "register") the memory are readily apparent in the trace output.
-
-
-### Chicken bits
-Chicken bits are environment variables which cause the HIP, HCC, or HSA driver to disable some feature or optimization.
-These are not intended for production but can be useful diagnose synchronization problems in the application (or driver).
-
-Some of the most useful chicken bits are described here. These bits are supported on the ROCm path:
-
-HIP provides 3 environment variables in the HIP_*_BLOCKING family.  These introduce additional synchronization and can be useful to isolate synchronization problems. Specifically, if the code works with this flag set, then it indicates the kernels are executing correctly, and any failures likely are causes by improper or missing synchronization.  These flags will have performance impact and are not intended for production use.
-
-- HIP_LAUNCH_BLOCKING=1 : Waits on the host after each kernel launch.  Equivalent to setting CUDA_LAUNCH_BLOCKING.
-- HIP_LAUNCH_BLOCKING_KERNELS: A comma-separated list of kernel names.  The HIP runtime will wait on the host after one of the named kernels executes.  This provides a more targeted version of HIP_LAUNCH_BLOCKING and may be useful to isolate exactly which kernel needs further analysis if HIP_LAUNCH_BLOCKING=1 improves functionality.  There is no indication if kernel names are spelled incorrectly.  One mechanism to verify that the blocking is working is to run with HIP_DB=api+sync and search for debug messages with "LAUNCH_BLOCKING".
-- HIP_API_BLOCKING : Forces hipMemcpyAsync and hipMemsetAsync to be host-synchronous, meaning they will wait for the requested operation to complete before returning to the caller.
-
-These options cause HCC to serialize.  Useful if you have libraries or code which is calling HCC kernels directly rather than using HIP.  
-- HCC_SERIALZIE_KERNELS : 0x1=pre-serialize before each kernel launch, 0x2=post-serialize after each kernel launch., 0x3= pre- and post- serialize.
-- HCC_SERIALIZE_COPY    : 0x1=pre-serialize before each async copy, 0x2=post-serialize after each async copy., 0x3= pre- and post- serialize.
-
-- HSA_ENABLE_SDMA=0     : Causes host-to-device and device-to-host copies to use compute shader blit kernels rather than the dedicated DMA copy engines.  Compute shader copies have low latency (typically < 5us) and can achieve approximately 80% of the bandwidth of the DMA copy engine.  This flag is useful to isolate issues with the hardware copy engines.
-- HSA_ENABLE_INTERRUPT=0 : Causes completion signals to be detected with memory-based polling rather than interrupts.  Can be useful to diagnose interrupt storm issues in the driver.
-- HSA_DISABLE_CACHE=1  : Disables the GPU L2 data cache.
-
-### Debugging HIP Applications
-
-- The variable "tls_tidInfo" contains the API sequence number (_apiSeqNum)- a monotonically increasing count of the HIP APIs called from this thread.  This can be useful for setting conditional breakpoints.  Also, each new HIP thread is mapped to monotically increasing shortTid ID.  Both of these fields are displayed in the HIP debug info. 
-```
-(gdb) p tls_tidInfo
-$32 = {_shortTid = 1, _apiSeqNum = 803}
-```
-
-- HCC tracks all of the application memory allocations, including those from HIP and HC's "am_alloc".
-If the HCC runtime is built with debug information (HCC_RUNTIME_DEBUG=ON when building HCC), then calling the function 'hc::am_memtracker_print()' will show all memory allocations. 
-An optional argument specifies a void * targetPointer - the print routine will mark the allocation which contains the specified pointer with "-->" in the printed output.
-This example shows a sample GDB session where we print the memory allocated by this process and mark a specified address by using the gdb "call" function..
-The gdb syntax also supports using the variable name (in this case 'dst'):
-```
-(gdb) p dst
-$33 = (void *) 0x5ec7e9000
-(gdb) call hc::am_memtracker_print(dst)
-TargetAddress:0x5ec7e9000
-   0x504cfc000-0x504cfc00f::  allocSeqNum:1 hostPointer:0x504cfc000 devicePointer:0x504cfc000 sizeBytes:16 isInDeviceMem:0 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil)
-...
--->0x5ec7e9000-0x5f7e28fff::  allocSeqNum:488 hostPointer:(nil) devicePointer:0x5ec7e9000 sizeBytes:191102976 isInDeviceMem:1 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil)
-
-```
-
-To debug an explicit address, cast the address to (void*) :
-```
-(gdb) call hc::am_memtracker_print((void*)0x508c7f000)
-```
-- Debugging GPUVM fault.
-For example:
-```
-Memory access fault by GPU node-1 on address 0x5924000. Reason: Page not present or supervisor privilege.
-
-Program received signal SIGABRT, Aborted.
-[Switching to Thread 0x7fffdffb5700 (LWP 14893)]
-0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
-56      ../nptl/sysdeps/unix/sysv/linux/raise.c: No such file or directory.
-(gdb) bt
-#0  0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
-#1  0x00007ffff205b028 in __GI_abort () at abort.c:89
-#2  0x00007ffff6f960eb in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-#3  0x00007ffff6f99ea5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-#4  0x00007ffff6f78107 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-#5  0x00007ffff744f184 in start_thread (arg=0x7fffdffb5700) at pthread_create.c:312
-#6  0x00007ffff211b37d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
-(gdb) info threads
-  Id   Target Id         Frame
-  4    Thread 0x7fffdd521700 (LWP 14895) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
-  3    Thread 0x7fffddd22700 (LWP 14894) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
-* 2    Thread 0x7fffdffb5700 (LWP 14893) "caffe" 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
-  1    Thread 0x7ffff7fa6ac0 (LWP 14892) "caffe" 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-(gdb) thread 1
-[Switching to thread 1 (Thread 0x7ffff7fa6ac0 (LWP 14892))]
-#0  0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-(gdb) bt
-#0  0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-#1  0x00007ffff6f929ba in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-#2  0x00007fffe080beca in HSADispatch::waitComplete() () from /opt/rocm/hcc/lib/libmcwamp_hsa.so
-#3  0x00007fffe080415f in HSADispatch::dispatchKernelAsync(Kalmar::HSAQueue*, void const*, int, bool) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so
-#4  0x00007fffe080238e in Kalmar::HSAQueue::dispatch_hsa_kernel(hsa_kernel_dispatch_packet_s const*, void const*, unsigned long, hc::completion_future*) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so
-#5  0x00007ffff7bb7559 in hipModuleLaunchKernel () from /opt/rocm/hip/lib/libhip_hcc.so
-#6  0x00007ffff2e6cd2c in mlopen::HIPOCKernel::run (this=0x7fffffffb5a8, args=0x7fffffffb2a8, size=80) at /root/MIOpen/src/hipoc/hipoc_kernel.cpp:15  
-...
-```
-
-### General Debugging Tips
-- The fault will be caught by the runtime but was actually generated by an asynchronous command running on the GPU.    So, the GDB backtrace will show a path in the runtime, ie inside "GI_Raise" as shown in the example above.
-- To determine the true location of the fault, force the kernels to execute synchronously by seeing the environment variables HCC_SERIALIZE_KERNEL=3 HCC_SERIALIZE_COPY=3.  This will force HCC to wait for the kernel to finish executing before retuning.  If the fault occurs during the execution of a kernel, you can see the code which launched the kernel inside the backtrace.  A bit of guesswork is required to determine which thread is actually causing the issue - typically it will the thread which is waiting inside the libhsa-runtime64.so.
-- VM faults inside kernels can be caused byi:
-   - incorrect code (ie a for loop which extends past array boundaries), i
-   - memory issues  - kernel arguments which are invalid (null pointers, unregistered host pointers, bad pointers).
-   - synchronization issues
-   - compiler issues (incorrect code generation from the compiler)
-   - runtime issues 
-
--- General debug tips:
-- 'gdb --args' can be used to conviently pass the executable and arguments to gdb.
-- From inside GDB, you can set environment variables "set env".  Note the command does not use an '=' sign:
-```
-(gdb) set env HIP_DB 1
-```
-Setting HIP_PRINT_ENV=1 and then running a HIP application will print the HIP environment variables, their current values, and usage info.
-Setting HCC_PRINT_ENV=1 and then running a HCC application will print the HCC environment variables, their current values, and usage info.
-
-
-
diff --git a/projects/hip/hipify-clang/CMakeLists.txt b/projects/hip/hipify-clang/CMakeLists.txt
index a02b91407f..872db3defe 100644
--- a/projects/hip/hipify-clang/CMakeLists.txt
+++ b/projects/hip/hipify-clang/CMakeLists.txt
@@ -6,8 +6,12 @@ set(BUILD_HIPIFY_CLANG 0 PARENT_SCOPE)
 # Find LLVM package
 find_package(LLVM 3.8 QUIET PATHS ${HIPIFY_CLANG_LLVM_DIR} NO_DEFAULT_PATH)
 if (NOT ${LLVM_FOUND})
-    message(STATUS "hipify-clang will not be built. To build it please specify absolute path to LLVM (v3.8) package using -DHIPIFY_CLANG_LLVM_DIR")
-else()
+    find_package(LLVM 3.9 QUIET PATHS ${HIPIFY_CLANG_LLVM_DIR} NO_DEFAULT_PATH)
+    if (NOT ${LLVM_FOUND})
+        message(STATUS "hipify-clang will not be built. To build it please specify absolute path to LLVM 3.8 or LLVM 3.9 package using -DHIPIFY_CLANG_LLVM_DIR")
+    endif()
+endif()
+if (${LLVM_FOUND})
     list(APPEND CMAKE_MODULE_PATH ${LLVM_CMAKE_DIR})
     include(AddLLVM)
 
@@ -31,6 +35,7 @@ else()
         clangSerialization
         clangSema
         clangEdit
+        clangFormat
         clangLex
         clangAnalysis
         clangDriver
diff --git a/projects/hip/hipify-clang/src/Cuda2Hip.cpp b/projects/hip/hipify-clang/src/Cuda2Hip.cpp
index 5a2940322e..0825285b51 100644
--- a/projects/hip/hipify-clang/src/Cuda2Hip.cpp
+++ b/projects/hip/hipify-clang/src/Cuda2Hip.cpp
@@ -191,7 +191,7 @@ struct cuda2hipMap {
 
     // Error codes and return types
     cuda2hipRename["CUresult"]                                  = {"hipError_t", CONV_TYPE, API_DRIVER};
-    cuda2hipRename["cudaError_enum"]                            = {"hipError_t", CONV_TYPE, API_DRIVER};
+//  cuda2hipRename["cudaError_enum"]                            = {"hipError_t", CONV_TYPE, API_DRIVER};
     cuda2hipRename["cudaError_t"]                               = {"hipError_t", CONV_TYPE, API_RUNTIME};
     cuda2hipRename["cudaError"]                                 = {"hipError_t", CONV_TYPE, API_RUNTIME};
 
@@ -208,59 +208,55 @@ struct cuda2hipMap {
     cuda2hipRename["CUDA_ERROR_INVALID_SOURCE"]                 = {"hipErrorInvalidSource", CONV_ERR, API_DRIVER};                                  // 300
     cuda2hipRename["CUDA_ERROR_FILE_NOT_FOUND"]                 = {"hipErrorFileNotFound", CONV_ERR, API_DRIVER};                                   // 301
     cuda2hipRename["CUDA_ERROR_NOT_FOUND"]                      = {"hipErrorNotFound", CONV_ERR, API_DRIVER};                                       // 500
+    cuda2hipRename["CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"]  = {"hipErrorLaunchIncompatibleTexturing", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED};   // 703
+    cuda2hipRename["CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"]         = {"hipErrorPrimaryContextActive", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED};          // 708
+    cuda2hipRename["CUDA_ERROR_CONTEXT_IS_DESTROYED"]           = {"hipErrorContextIsDestroyed", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED};            // 709
+    cuda2hipRename["CUDA_ERROR_NOT_PERMITTED"]                  = {"hipErrorNotPermitted", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED};                  // 800
+    cuda2hipRename["CUDA_ERROR_NOT_SUPPORTED"]                  = {"hipErrorNotSupported", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED};                  // 801
 
     // CUDA RT API error code only
-    cuda2hipRename["cudaErrorMissingConfiguration"]             = {"hipErrorMissingConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 1
-    cuda2hipRename["cudaErrorPriorLaunchFailure"]               = {"hipErrorPriorLaunchFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 5
-    cuda2hipRename["cudaErrorInvalidDeviceFunction"]            = {"hipErrorInvalidDeviceFunction", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 8
-    cuda2hipRename["cudaErrorInvalidConfiguration"]             = {"hipErrorInvalidConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 9
-    cuda2hipRename["cudaErrorInvalidPitchValue"]                = {"hipErrorInvalidPitchValue", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 12
-    cuda2hipRename["cudaErrorInvalidSymbol"]                    = {"hipErrorInvalidSymbol", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 13
-    cuda2hipRename["cudaErrorInvalidHostPointer"]               = {"hipErrorInvalidHostPointer", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 16
-    cuda2hipRename["cudaErrorInvalidDevicePointer"]             = {"hipErrorInvalidDevicePointer", CONV_ERR, API_RUNTIME}; // 17
-    cuda2hipRename["cudaErrorInvalidTexture"]                   = {"hipErrorInvalidTexture", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 18
-    cuda2hipRename["cudaErrorInvalidTextureBinding"]            = {"hipErrorInvalidTextureBinding", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 19
-    cuda2hipRename["cudaErrorInvalidChannelDescriptor"]         = {"hipErrorInvalidChannelDescriptor", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 20
-    cuda2hipRename["cudaErrorInvalidMemcpyDirection"]           = {"hipErrorInvalidMemcpyDirection", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 21
-    cuda2hipRename["cudaErrorAddressOfConstant"]                = {"hipErrorAddressOfConstant", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 22
-    cuda2hipRename["cudaErrorTextureFetchFailed"]               = {"hipErrorTextureFetchFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 23
-    cuda2hipRename["cudaErrorTextureNotBound"]                  = {"hipErrorTextureNotBound", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 24
-    cuda2hipRename["cudaErrorSynchronizationError"]             = {"hipErrorSynchronizationError", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 25
-    cuda2hipRename["cudaErrorInvalidFilterSetting"]             = {"hipErrorInvalidFilterSetting", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 26
-    cuda2hipRename["cudaErrorInvalidNormSetting"]               = {"hipErrorInvalidNormSetting", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 27
-    cuda2hipRename["cudaErrorMixedDeviceExecution"]             = {"hipErrorMixedDeviceExecution", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 28
+    cuda2hipRename["cudaErrorMissingConfiguration"]             = {"hipErrorMissingConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};         // 1
+    cuda2hipRename["cudaErrorPriorLaunchFailure"]               = {"hipErrorPriorLaunchFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};           // 5
+    cuda2hipRename["cudaErrorInvalidDeviceFunction"]            = {"hipErrorInvalidDeviceFunction", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};        // 8
+    cuda2hipRename["cudaErrorInvalidConfiguration"]             = {"hipErrorInvalidConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};         // 9
+    cuda2hipRename["cudaErrorInvalidPitchValue"]                = {"hipErrorInvalidPitchValue", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};            // 12
+    cuda2hipRename["cudaErrorInvalidSymbol"]                    = {"hipErrorInvalidSymbol", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};                // 13
+    cuda2hipRename["cudaErrorInvalidHostPointer"]               = {"hipErrorInvalidHostPointer", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};           // 16
+    cuda2hipRename["cudaErrorInvalidDevicePointer"]             = {"hipErrorInvalidDevicePointer", CONV_ERR, API_RUNTIME};                          // 17
+    cuda2hipRename["cudaErrorInvalidTexture"]                   = {"hipErrorInvalidTexture", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};               // 18
+    cuda2hipRename["cudaErrorInvalidTextureBinding"]            = {"hipErrorInvalidTextureBinding", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};        // 19
+    cuda2hipRename["cudaErrorInvalidChannelDescriptor"]         = {"hipErrorInvalidChannelDescriptor", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};     // 20
+    cuda2hipRename["cudaErrorInvalidMemcpyDirection"]           = {"hipErrorInvalidMemcpyDirection", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};       // 21
+    cuda2hipRename["cudaErrorAddressOfConstant"]                = {"hipErrorAddressOfConstant", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};            // 22
+    cuda2hipRename["cudaErrorTextureFetchFailed"]               = {"hipErrorTextureFetchFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};           // 23
+    cuda2hipRename["cudaErrorTextureNotBound"]                  = {"hipErrorTextureNotBound", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};              // 24
+    cuda2hipRename["cudaErrorSynchronizationError"]             = {"hipErrorSynchronizationError", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};         // 25
+    cuda2hipRename["cudaErrorInvalidFilterSetting"]             = {"hipErrorInvalidFilterSetting", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};         // 26
+    cuda2hipRename["cudaErrorInvalidNormSetting"]               = {"hipErrorInvalidNormSetting", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};           // 27
+    cuda2hipRename["cudaErrorMixedDeviceExecution"]             = {"hipErrorMixedDeviceExecution", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};         // 28
     // Deprecated as of CUDA 4.1
-    cuda2hipRename["cudaErrorNotYetImplemented"]                = {"hipErrorNotYetImplemented", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 31
+    cuda2hipRename["cudaErrorNotYetImplemented"]                = {"hipErrorNotYetImplemented", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};            // 31
     // Deprecated as of CUDA 3.1
-    cuda2hipRename["cudaErrorMemoryValueTooLarge"]              = {"hipErrorMemoryValueTooLarge", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 32
-    cuda2hipRename["cudaErrorInsufficientDriver"]               = {"hipErrorInsufficientDriver", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 35
-    cuda2hipRename["cudaErrorSetOnActiveProcess"]               = {"hipErrorSetOnActiveProcess", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 36
-    cuda2hipRename["cudaErrorInvalidSurface"]                   = {"hipErrorInvalidSurface", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 37
-    cuda2hipRename["cudaErrorDuplicateVariableName"]            = {"hipErrorDuplicateVariableName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 43
-    cuda2hipRename["cudaErrorDuplicateTextureName"]             = {"hipErrorDuplicateTextureName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 44
-    cuda2hipRename["cudaErrorDuplicateSurfaceName"]             = {"hipErrorDuplicateSurfaceName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 45
-    cuda2hipRename["cudaErrorDevicesUnavailable"]               = {"hipErrorDevicesUnavailable", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 46
-    cuda2hipRename["cudaErrorIncompatibleDriverContext"]        = {"hipErrorIncompatibleDriverContext", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 49
-    cuda2hipRename["cudaErrorDeviceAlreadyInUse"]               = {"hipErrorDeviceAlreadyInUse", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 54
-    cuda2hipRename["cudaErrorAssert"]                           = {"hipErrorAssert", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 59
-    cuda2hipRename["cudaErrorTooManyPeers"]                     = {"hipErrorTooManyPeers", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 60
-    cuda2hipRename["cudaErrorLaunchMaxDepthExceeded"]           = {"hipErrorLaunchMaxDepthExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 65
-    cuda2hipRename["cudaErrorLaunchFileScopedTex"]              = {"hipErrorLaunchFileScopedTex", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 66
-    cuda2hipRename["cudaErrorLaunchFileScopedSurf"]             = {"hipErrorLaunchFileScopedSurf", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 67
-    cuda2hipRename["cudaErrorSyncDepthExceeded"]                = {"hipErrorSyncDepthExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 68
-    cuda2hipRename["cudaErrorLaunchPendingCountExceeded"]       = {"hipErrorLaunchPendingCountExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 69
-    cuda2hipRename["cudaErrorNotPermitted"]                     = {"hipErrorNotPermitted", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 70
-    cuda2hipRename["cudaErrorNotSupported"]                     = {"hipErrorNotSupported", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 71
-    cuda2hipRename["cudaErrorHardwareStackError"]               = {"hipErrorHardwareStackError", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 72
-    cuda2hipRename["cudaErrorIllegalInstruction"]               = {"hipErrorIllegalInstruction", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 73
-    cuda2hipRename["cudaErrorMisalignedAddress"]                = {"hipErrorMisalignedAddress", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 74
-    cuda2hipRename["cudaErrorInvalidAddressSpace"]              = {"hipErrorInvalidAddressSpace", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 75
-    cuda2hipRename["cudaErrorInvalidPc"]                        = {"hipErrorInvalidPc", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 76
-    cuda2hipRename["cudaErrorStartupFailure"]                   = {"hipErrorStartupFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 0x7f
+    cuda2hipRename["cudaErrorMemoryValueTooLarge"]              = {"hipErrorMemoryValueTooLarge", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};          // 32
+    cuda2hipRename["cudaErrorInsufficientDriver"]               = {"hipErrorInsufficientDriver", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};           // 35
+    cuda2hipRename["cudaErrorSetOnActiveProcess"]               = {"hipErrorSetOnActiveProcess", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};           // 36
+    cuda2hipRename["cudaErrorInvalidSurface"]                   = {"hipErrorInvalidSurface", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};               // 37
+    cuda2hipRename["cudaErrorDuplicateVariableName"]            = {"hipErrorDuplicateVariableName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};        // 43
+    cuda2hipRename["cudaErrorDuplicateTextureName"]             = {"hipErrorDuplicateTextureName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};         // 44
+    cuda2hipRename["cudaErrorDuplicateSurfaceName"]             = {"hipErrorDuplicateSurfaceName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};         // 45
+    cuda2hipRename["cudaErrorDevicesUnavailable"]               = {"hipErrorDevicesUnavailable", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};           // 46
+    cuda2hipRename["cudaErrorIncompatibleDriverContext"]        = {"hipErrorIncompatibleDriverContext", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};    // 49
+    cuda2hipRename["cudaErrorDeviceAlreadyInUse"]               = {"hipErrorDeviceAlreadyInUse", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};           // 54
+    cuda2hipRename["cudaErrorLaunchMaxDepthExceeded"]           = {"hipErrorLaunchMaxDepthExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};       // 65
+    cuda2hipRename["cudaErrorLaunchFileScopedTex"]              = {"hipErrorLaunchFileScopedTex", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};          // 66
+    cuda2hipRename["cudaErrorLaunchFileScopedSurf"]             = {"hipErrorLaunchFileScopedSurf", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};         // 67
+    cuda2hipRename["cudaErrorSyncDepthExceeded"]                = {"hipErrorSyncDepthExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};            // 68
+    cuda2hipRename["cudaErrorLaunchPendingCountExceeded"]       = {"hipErrorLaunchPendingCountExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};   // 69
+    cuda2hipRename["cudaErrorNotPermitted"]                     = {"hipErrorNotPermitted", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};                 // 70
+    cuda2hipRename["cudaErrorNotSupported"]                     = {"hipErrorNotSupported", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};                 // 71
+    cuda2hipRename["cudaErrorStartupFailure"]                   = {"hipErrorStartupFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};               // 0x7f
     // Deprecated as of CUDA 4.1
-    cuda2hipRename["cudaErrorApiFailureBase"]                   = {"hipErrorApiFailureBase", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 10000
-
-
+    cuda2hipRename["cudaErrorApiFailureBase"]                   = {"hipErrorApiFailureBase", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};               // 10000
 
     cuda2hipRename["CUDA_SUCCESS"]                              = {"hipSuccess", CONV_ERR, API_DRIVER};                                             // 0
     cuda2hipRename["cudaSuccess"]                               = {"hipSuccess", CONV_ERR, API_RUNTIME};                                            // 0
@@ -328,6 +324,9 @@ struct cuda2hipMap {
     cuda2hipRename["CUDA_ERROR_INVALID_GRAPHICS_CONTEXT"]       = {"hipErrorInvalidGraphicsContext", CONV_ERR, API_DRIVER};                         // 219
     cuda2hipRename["cudaErrorInvalidGraphicsContext"]           = {"hipErrorInvalidGraphicsContext", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};       // 79
 
+    cuda2hipRename["CUDA_ERROR_NVLINK_UNCORRECTABLE"]           = {"hipErrorNvlinkUncorrectable", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED};           // 220 [CUDA 8.0.44]
+    cuda2hipRename["cudaErrorNvlinkUncorrectable"]              = {"hipErrorNvlinkUncorrectable", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};          // 80  [CUDA 8.0.44]
+
     cuda2hipRename["CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"] = {"hipErrorSharedObjectSymbolNotFound", CONV_ERR, API_DRIVER};                     // 302
     cuda2hipRename["cudaErrorSharedObjectSymbolNotFound"]       = {"hipErrorSharedObjectSymbolNotFound", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};   // 40
 
@@ -346,34 +345,50 @@ struct cuda2hipMap {
     cuda2hipRename["CUDA_ERROR_ILLEGAL_ADDRESS"]                = {"hipErrorIllegalAddress", CONV_ERR, API_DRIVER};                                 // 700
     cuda2hipRename["cudaErrorIllegalAddress"]                   = {"hipErrorIllegalAddress", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};               // 77
 
-    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    cuda2hipRename["CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"]        = {"hipErrorLaunchOutOfResources", CONV_ERR, API_DRIVER};                           // 701
+    cuda2hipRename["cudaErrorLaunchOutOfResources"]             = {"hipErrorLaunchOutOfResources", CONV_ERR, API_RUNTIME};                          // 7
 
-    cuda2hipRename["CUDA_ERROR_LAUNCH_FAILED"]                  = {"hipErrorLaunchFailure", CONV_ERR, API_DRIVER};                                  // 719
+    cuda2hipRename["CUDA_ERROR_LAUNCH_TIMEOUT"]                 = {"hipErrorLaunchTimeOut", CONV_ERR, API_DRIVER};                                  // 702
+    cuda2hipRename["cudaErrorLaunchTimeout"]                    = {"hipErrorLaunchTimeOut", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};                // 6
+
+    cuda2hipRename["CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"]    = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_DRIVER};                       // 704
+    cuda2hipRename["cudaErrorPeerAccessAlreadyEnabled"]         = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_RUNTIME};                      // 50
+
+    cuda2hipRename["CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"]        = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_DRIVER};                           // 705
+    cuda2hipRename["cudaErrorPeerAccessNotEnabled"]             = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_RUNTIME};                          // 51
+
+    cuda2hipRename["CUDA_ERROR_ASSERT"]                         = {"hipErrorAssert", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED};                        // 710
+    cuda2hipRename["cudaErrorAssert"]                           = {"hipErrorAssert", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};                       // 59
+
+    cuda2hipRename["CUDA_ERROR_TOO_MANY_PEERS"]                 = {"hipErrorTooManyPeers", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED};                  // 711
+    cuda2hipRename["cudaErrorTooManyPeers"]                     = {"hipErrorTooManyPeers", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};                 // 60
+
+    cuda2hipRename["CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_DRIVER};                    // 712
+    cuda2hipRename["cudaErrorHostMemoryAlreadyRegistered"]      = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_RUNTIME};                   // 61
+
+    cuda2hipRename["CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"]     = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_DRIVER};                        // 713
+    cuda2hipRename["cudaErrorHostMemoryNotRegistered"]          = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_RUNTIME};                       // 62
+
+    cuda2hipRename["CUDA_ERROR_HARDWARE_STACK_ERROR"]           = {"hipErrorHardwareStackError", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED};            // 714
+    cuda2hipRename["cudaErrorHardwareStackError"]               = {"hipErrorHardwareStackError", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};           // 72
+
+    cuda2hipRename["CUDA_ERROR_ILLEGAL_INSTRUCTION"]            = {"hipErrorIllegalInstruction", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED};            // 715
+    cuda2hipRename["cudaErrorIllegalInstruction"]               = {"hipErrorIllegalInstruction", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};           // 73
+
+    cuda2hipRename["CUDA_ERROR_MISALIGNED_ADDRESS"]             = {"hipErrorMisalignedAddress", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED};             // 716
+    cuda2hipRename["cudaErrorMisalignedAddress"]                = {"hipErrorMisalignedAddress", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};            // 74
+
+    cuda2hipRename["CUDA_ERROR_INVALID_ADDRESS_SPACE"]          = {"hipErrorInvalidAddressSpace", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED};           // 717
+    cuda2hipRename["cudaErrorInvalidAddressSpace"]              = {"hipErrorInvalidAddressSpace", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};          // 75
+
+    cuda2hipRename["CUDA_ERROR_INVALID_PC"]                     = {"hipErrorInvalidPc", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED};                     // 718
+    cuda2hipRename["cudaErrorInvalidPc"]                        = {"hipErrorInvalidPc", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};                    // 76
+
+    cuda2hipRename["CUDA_ERROR_LAUNCH_FAILED"]                  = {"hipErrorLaunchFailure", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED};                 // 719
     cuda2hipRename["cudaErrorLaunchFailure"]                    = {"hipErrorLaunchFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED};                // 4
 
-    cuda2hipRename["CUDA_ERROR_LAUNCH_TIMEOUT"]                 = {"hipErrorLaunchTimeOut", CONV_ERR, API_DRIVER};
-    cuda2hipRename["cudaErrorLaunchTimeout"]                    = {"hipErrorLaunchTimeOut", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 6
-
-    cuda2hipRename["CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"]        = {"hipErrorLaunchOutOfResources", CONV_ERR, API_DRIVER};
-    cuda2hipRename["cudaErrorLaunchOutOfResources"]             = {"hipErrorLaunchOutOfResources", CONV_ERR, API_RUNTIME}; // 7
-
-    cuda2hipRename["CUDA_ERROR_UNKNOWN"]                        = {"hipErrorUnknown", CONV_ERR, API_DRIVER};
-    cuda2hipRename["cudaErrorUnknown"]                          = {"hipErrorUnknown", CONV_ERR, API_RUNTIME}; // 30
-
-//    cuda2hipRename["CUDA_ERROR_NOT_INITIALIZED"]                = {"hipErrorInitializationError", CONV_ERR, API_DRIVER};
-//    cuda2hipRename["cudaErrorInitializationError"]              = {"hipErrorInitializationError", CONV_ERR, API_RUNTIME};
-
-    cuda2hipRename["CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"]    = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_DRIVER};
-    cuda2hipRename["cudaErrorPeerAccessAlreadyEnabled"]         = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_RUNTIME}; // 50
-
-    cuda2hipRename["CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"]        = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_DRIVER};
-    cuda2hipRename["cudaErrorPeerAccessNotEnabled"]             = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_RUNTIME}; // 51
-
-    cuda2hipRename["CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_DRIVER};
-    cuda2hipRename["cudaErrorHostMemoryAlreadyRegistered"]      = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_RUNTIME}; // 61
-
-    cuda2hipRename["CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"]     = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_DRIVER};
-    cuda2hipRename["cudaErrorHostMemoryNotRegistered"]          = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_RUNTIME}; // 62
+    cuda2hipRename["CUDA_ERROR_UNKNOWN"]                        = {"hipErrorUnknown", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED};                       // 999
+    cuda2hipRename["cudaErrorUnknown"]                          = {"hipErrorUnknown", CONV_ERR, API_RUNTIME};                                       // 30
 
     ///////////////////////////// CUDA DRIVER API /////////////////////////////
     // enums
@@ -389,173 +404,219 @@ struct cuda2hipMap {
     cuda2hipRename["CUipcEventHandle"]                                            = {"hipIpcEventHandle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["CUipcMemHandle"]                                              = {"hipIpcMemHandle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
 
-
-
     cuda2hipRename["CUaddress_mode"]                                              = {"hipAddress_mode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_TR_ADDRESS_MODE_WRAP"]                                     = {"HIP_TR_ADDRESS_MODE_WRAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0
-    cuda2hipRename["CU_TR_ADDRESS_MODE_CLAMP"]                                    = {"HIP_TR_ADDRESS_MODE_CLAMP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1
-    cuda2hipRename["CU_TR_ADDRESS_MODE_MIRROR"]                                   = {"HIP_TR_ADDRESS_MODE_MIRROR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2
-    cuda2hipRename["CU_TR_ADDRESS_MODE_BORDER"]                                   = {"HIP_TR_ADDRESS_MODE_BORDER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3
+    cuda2hipRename["CU_TR_ADDRESS_MODE_WRAP"]                                     = {"HIP_TR_ADDRESS_MODE_WRAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};      // 0
+    cuda2hipRename["CU_TR_ADDRESS_MODE_CLAMP"]                                    = {"HIP_TR_ADDRESS_MODE_CLAMP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};     // 1
+    cuda2hipRename["CU_TR_ADDRESS_MODE_MIRROR"]                                   = {"HIP_TR_ADDRESS_MODE_MIRROR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};    // 2
+    cuda2hipRename["CU_TR_ADDRESS_MODE_BORDER"]                                   = {"HIP_TR_ADDRESS_MODE_BORDER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};    // 3
 
     cuda2hipRename["CUarray_cubemap_face"]                                        = {"hipArray_cubemap_face", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_X"]                                  = {"HIP_CUBEMAP_FACE_POSITIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00
-    cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_X"]                                  = {"HIP_CUBEMAP_FACE_NEGATIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01
-    cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Y"]                                  = {"HIP_CUBEMAP_FACE_POSITIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02
-    cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Y"]                                  = {"HIP_CUBEMAP_FACE_NEGATIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03
-    cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Z"]                                  = {"HIP_CUBEMAP_FACE_POSITIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04
-    cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Z"]                                  = {"HIP_CUBEMAP_FACE_NEGATIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x05
+    cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_X"]                                  = {"HIP_CUBEMAP_FACE_POSITIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};   // 0x00
+    cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_X"]                                  = {"HIP_CUBEMAP_FACE_NEGATIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};   // 0x01
+    cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Y"]                                  = {"HIP_CUBEMAP_FACE_POSITIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};   // 0x02
+    cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Y"]                                  = {"HIP_CUBEMAP_FACE_NEGATIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};   // 0x03
+    cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Z"]                                  = {"HIP_CUBEMAP_FACE_POSITIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};   // 0x04
+    cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Z"]                                  = {"HIP_CUBEMAP_FACE_NEGATIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};   // 0x05
 
     cuda2hipRename["CUarray_format"]                                              = {"hipArray_format", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT8"]                                  = {"HIP_AD_FORMAT_UNSIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01
-    cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT16"]                                 = {"HIP_AD_FORMAT_UNSIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02
-    cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT32"]                                 = {"HIP_AD_FORMAT_UNSIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03
-    cuda2hipRename["CU_AD_FORMAT_SIGNED_INT8"]                                    = {"HIP_AD_FORMAT_SIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08
-    cuda2hipRename["CU_AD_FORMAT_SIGNED_INT16"]                                   = {"HIP_AD_FORMAT_SIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x09
-    cuda2hipRename["CU_AD_FORMAT_SIGNED_INT32"]                                   = {"HIP_AD_FORMAT_SIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x0a
-    cuda2hipRename["CU_AD_FORMAT_HALF"]                                           = {"HIP_AD_FORMAT_HALF", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10
-    cuda2hipRename["CU_AD_FORMAT_FLOAT"]                                          = {"HIP_AD_FORMAT_FLOAT", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x20
+    cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT8"]                                  = {"HIP_AD_FORMAT_UNSIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};   // 0x01
+    cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT16"]                                 = {"HIP_AD_FORMAT_UNSIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};  // 0x02
+    cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT32"]                                 = {"HIP_AD_FORMAT_UNSIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};  // 0x03
+    cuda2hipRename["CU_AD_FORMAT_SIGNED_INT8"]                                    = {"HIP_AD_FORMAT_SIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};     // 0x08
+    cuda2hipRename["CU_AD_FORMAT_SIGNED_INT16"]                                   = {"HIP_AD_FORMAT_SIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};    // 0x09
+    cuda2hipRename["CU_AD_FORMAT_SIGNED_INT32"]                                   = {"HIP_AD_FORMAT_SIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};    // 0x0a
+    cuda2hipRename["CU_AD_FORMAT_HALF"]                                           = {"HIP_AD_FORMAT_HALF", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};            // 0x10
+    cuda2hipRename["CU_AD_FORMAT_FLOAT"]                                          = {"HIP_AD_FORMAT_FLOAT", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};           // 0x20
     // Compute mode
-    cuda2hipRename["CUcomputemode"]                                               = {"hipComputemode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_RUNTIME ANALOGUE (cudaComputeMode)
-    cuda2hipRename["CU_COMPUTEMODE_DEFAULT"]                                      = {"hipComputeModeDefault", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0
-    cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE"]                                    = {"hipComputeModeExclusive", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1
-    cuda2hipRename["CU_COMPUTEMODE_PROHIBITED"]                                   = {"hipComputeModeProhibited", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2
-    cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE_PROCESS"]                            = {"hipComputeModeExclusiveProcess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3
+    cuda2hipRename["CUcomputemode"]                                               = {"hipComputemode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};                     // API_RUNTIME ANALOGUE (cudaComputeMode)
+    cuda2hipRename["CU_COMPUTEMODE_DEFAULT"]                                      = {"hipComputeModeDefault", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};          // 0 // API_RUNTIME ANALOGUE (cudaComputeModeDefault = 0)
+    cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE"]                                    = {"hipComputeModeExclusive", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};        // 1 // API_RUNTIME ANALOGUE (cudaComputeModeExclusive = 1)
+    cuda2hipRename["CU_COMPUTEMODE_PROHIBITED"]                                   = {"hipComputeModeProhibited", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};       // 2 // API_RUNTIME ANALOGUE (cudaComputeModeProhibited = 2)
+    cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE_PROCESS"]                            = {"hipComputeModeExclusiveProcess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_RUNTIME ANALOGUE (cudaComputeModeExclusiveProcess = 3)
+
+    // unsupported yet by HIP [CUDA 8.0.44]
+    // Memory advise values
+    cuda2hipRename["CUmem_advise"]                                                = {"hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};                                  // API_RUNTIME ANALOGUE (cudaComputeMode)
+    // cuda2hipRename["CUmem_advise_enum"]                                        = {"hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_MEM_ADVISE_SET_READ_MOSTLY"]                               = {"hipMemAdviseSetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};                // 1 // API_RUNTIME ANALOGUE (cudaMemAdviseSetReadMostly = 1)
+    cuda2hipRename["CU_MEM_ADVISE_UNSET_READ_MOSTLY"]                             = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};              // 2 // API_RUNTIME ANALOGUE (cudaMemAdviseUnsetReadMostly = 2)
+    cuda2hipRename["CU_MEM_ADVISE_SET_PREFERRED_LOCATION"]                        = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};         // 3 // API_RUNTIME ANALOGUE (cudaMemAdviseSetPreferredLocation = 3)
+    cuda2hipRename["CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION"]                      = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};       // 4 // API_RUNTIME ANALOGUE (cudaMemAdviseUnsetPreferredLocation = 4)
+    cuda2hipRename["CU_MEM_ADVISE_SET_ACCESSED_BY"]                               = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};                // 5 // API_RUNTIME ANALOGUE (cudaMemAdviseSetAccessedBy = 5)
+    cuda2hipRename["CU_MEM_ADVISE_UNSET_ACCESSED_BY"]                             = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};              // 6 // API_RUNTIME ANALOGUE (cudaMemAdviseUnsetAccessedBy = 6)
+    // CUmem_range_attribute
+    cuda2hipRename["CUmem_range_attribute"]                                       = {"hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};                          // API_RUNTIME ANALOGUE (cudaMemRangeAttribute)
+    // cuda2hipRename["CUmem_range_attribute_enum"]                               = {"hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY"]                          = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};           // 1 // API_RUNTIME ANALOGUE (cudaMemRangeAttributeReadMostly = 1)
+    cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION"]                   = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};    // 2 // API_RUNTIME ANALOGUE (cudaMemRangeAttributePreferredLocation = 2)
+    cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY"]                          = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};           // 3 // API_RUNTIME ANALOGUE (cudaMemRangeAttributeAccessedBy = 3)
+    cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION"]               = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_RUNTIME ANALOGUE (cudaMemRangeAttributeLastPrefetchLocation = 4)
+
     // Context flags
-    cuda2hipRename["CUctx_flags"]                                                  = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_CTX_SCHED_AUTO"]                                            = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00
-    cuda2hipRename["CU_CTX_SCHED_SPIN"]                                            = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01
-    cuda2hipRename["CU_CTX_SCHED_YIELD"]                                           = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02
-    cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"]                                   = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04
-    cuda2hipRename["CU_CTX_BLOCKING_SYNC"]                                         = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04
-    cuda2hipRename["CU_CTX_SCHED_MASK"]                                            = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07
-    cuda2hipRename["CU_CTX_MAP_HOST"]                                              = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08
-    cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"]                                    = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10
-    cuda2hipRename["CU_CTX_FLAGS_MASK"]                                            = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f
+    cuda2hipRename["CUctx_flags"]                                                 = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_CTX_SCHED_AUTO"]                                           = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};            // 0x00
+    cuda2hipRename["CU_CTX_SCHED_SPIN"]                                           = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};            // 0x01
+    cuda2hipRename["CU_CTX_SCHED_YIELD"]                                          = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};           // 0x02
+    cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"]                                  = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};   // 0x04
+    cuda2hipRename["CU_CTX_BLOCKING_SYNC"]                                        = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};         // 0x04
+    cuda2hipRename["CU_CTX_SCHED_MASK"]                                           = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};            // 0x07
+    cuda2hipRename["CU_CTX_MAP_HOST"]                                             = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};              // 0x08
+    cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"]                                   = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};    // 0x10
+    cuda2hipRename["CU_CTX_FLAGS_MASK"]                                           = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};            // 0x1f
 
     // Defines
-    cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_POINTER"]                              = {"HIP_LAUNCH_PARAM_BUFFER_POINTER", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_SIZE"]                                 = {"HIP_LAUNCH_PARAM_BUFFER_SIZE", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_LAUNCH_PARAM_END"]                                         = {"HIP_LAUNCH_PARAM_END", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_POINTER"]                              = {"HIP_LAUNCH_PARAM_BUFFER_POINTER", CONV_DEV, API_DRIVER};                 // ((void*)0x01)
+    cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_SIZE"]                                 = {"HIP_LAUNCH_PARAM_BUFFER_SIZE", CONV_DEV, API_DRIVER};                    // ((void*)0x02)
+    cuda2hipRename["CU_LAUNCH_PARAM_END"]                                         = {"HIP_LAUNCH_PARAM_END", CONV_DEV, API_DRIVER};                            // ((void*)0x00)
+    cuda2hipRename["CU_IPC_HANDLE_SIZE"]                                          = {"HIP_LAUNCH_PARAM_END", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};           // 64
+    cuda2hipRename["CU_MEMHOSTALLOC_DEVICEMAP"]                                   = {"HIP_MEMHOSTALLOC_DEVICEMAP", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};     // 0x02
+    cuda2hipRename["CU_MEMHOSTALLOC_PORTABLE"]                                    = {"HIP_MEMHOSTALLOC_PORTABLE", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};      // 0x01
+    cuda2hipRename["CU_MEMHOSTALLOC_WRITECOMBINED"]                               = {"HIP_MEMHOSTALLOC_WRITECOMBINED", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x04
+    cuda2hipRename["CU_MEMHOSTREGISTER_DEVICEMAP"]                                = {"HIP_MEMHOSTREGISTER_DEVICEMAP", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};  // 0x02
+    cuda2hipRename["CU_MEMHOSTREGISTER_IOMEMORY"]                                 = {"HIP_MEMHOSTREGISTER_IOMEMORY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};   // 0x04
+    cuda2hipRename["CU_MEMHOSTREGISTER_PORTABLE"]                                 = {"HIP_MEMHOSTREGISTER_PORTABLE", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};   // 0x01
+    cuda2hipRename["CU_PARAM_TR_DEFAULT"]                                         = {"HIP_PARAM_TR_DEFAULT", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};           // -1
+    cuda2hipRename["CU_STREAM_LEGACY"]                                            = {"HIP_STREAM_LEGACY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};              // ((CUstream)0x1)
+    cuda2hipRename["CU_STREAM_PER_THREAD"]                                        = {"HIP_STREAM_PER_THREAD", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};          // ((CUstream)0x2)
+    cuda2hipRename["CU_TRSA_OVERRIDE_FORMAT"]                                     = {"HIP_TRSA_OVERRIDE_FORMAT", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};       // 0x01
+    cuda2hipRename["CU_TRSF_NORMALIZED_COORDINATES"]                              = {"HIP_TRSF_NORMALIZED_COORDINATES", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};// 0x02
+    cuda2hipRename["CU_TRSF_READ_AS_INTEGER"]                                     = {"HIP_TRSF_READ_AS_INTEGER", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};       // 0x01
+    cuda2hipRename["CU_TRSF_SRGB"]                                                = {"HIP_TRSF_SRGB", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                  // 0x10
+    // Deprecated, use CUDA_ARRAY3D_LAYERED
+    cuda2hipRename["CUDA_ARRAY3D_2DARRAY"]                                        = {"HIP_ARRAY3D_LAYERED", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};            // 0x01
+    cuda2hipRename["CUDA_ARRAY3D_CUBEMAP"]                                        = {"HIP_ARRAY3D_CUBEMAP", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};            // 0x04
+    cuda2hipRename["CUDA_ARRAY3D_DEPTH_TEXTURE"]                                  = {"HIP_ARRAY3D_DEPTH_TEXTURE", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};      // 0x10
+    cuda2hipRename["CUDA_ARRAY3D_LAYERED"]                                        = {"HIP_ARRAY3D_LAYERED", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};            // 0x01
+    cuda2hipRename["CUDA_ARRAY3D_SURFACE_LDST"]                                   = {"HIP_ARRAY3D_SURFACE_LDST", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};       // 0x02
+    cuda2hipRename["CUDA_ARRAY3D_TEXTURE_GATHER"]                                 = {"HIP_ARRAY3D_TEXTURE_GATHER", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};     // 0x08
+    cuda2hipRename["CUDA_VERSION"]                                                = {"HIP_VERSION", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                    // 7050
 
     // Types
     // NOTE: CUdevice might be changed to typedef int in the future.
     cuda2hipRename["CUdevice"]                                                    = {"hipDevice_t", CONV_TYPE, API_DRIVER};
-    cuda2hipRename["CUdevice_attribute_enum"]                                     = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaDeviceAttr)
-    cuda2hipRename["CUdevice_attribute"]                                          = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaDeviceAttr)
-    // unsupported yet by HIP
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"]                   = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};             //  1 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerBlock = 1)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"]                         = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                   //  2 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimX = 2)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y"]                         = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                   //  3 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimY = 3)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z"]                         = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                   //  4 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimZ = 4)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X"]                          = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                    //  5 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimX =5)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y"]                          = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                    //  6 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimY = 6)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z"]                          = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                    //  7 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimZ - 7)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK"]             = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};        //  8 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerBlock = 8)
-    // Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK"]                 = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};        //  8
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY"]                   = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};            //  9 // API_Runtime ANALOGUE (cudaDevAttrTotalConstantMemory = 9)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_WARP_SIZE"]                               = {"hipDeviceAttributeWarpSize", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                       // 10 // API_Runtime ANALOGUE (cudaDevAttrWarpSize = 10)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_PITCH"]                               = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                       // 11 // API_Runtime ANALOGUE (cudaDevAttrMaxPitch = 11)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK"]                 = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};           // 12 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 12)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK"]                     = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};           // 12
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CLOCK_RATE"]                              = {"hipDeviceAttributeClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                      // 13 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 13)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT"]                       = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};               // 14 // API_Runtime ANALOGUE (cudaDevAttrTextureAlignment = 14)
-    // Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_GPU_OVERLAP"]                             = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};               // 15 // API_Runtime ANALOGUE (cudaDevAttrGpuOverlap = 15)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT"]                    = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};            // 16 // API_Runtime ANALOGUE (cudaDevAttrMultiProcessorCount = 16)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT"]                     = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};              // 17 // API_Runtime ANALOGUE (cudaDevAttrKernelExecTimeout = 17)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_INTEGRATED"]                              = {"hipDeviceAttributeIntegrated", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                     // 18 // API_Runtime ANALOGUE (cudaDevAttrIntegrated = 18)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY"]                     = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};               // 19 // API_Runtime ANALOGUE (cudaDevAttrCanMapHostMemory = 19)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_MODE"]                            = {"hipDeviceAttributeComputeMode", CONV_DEV, API_DRIVER};                                     // 20 // API_Runtime ANALOGUE (cudaDevAttrComputeMode = 20)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH"]                 = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};              // 21 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DWidth = 21)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH"]                 = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};              // 22 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DWidth = 22)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT"]                = {"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};             // 23 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DHeight = 23)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH"]                 = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};              // 24 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidth = 24)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT"]                = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};             // 25 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeight = 25)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH"]                 = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};              // 26 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepth = 26)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH"]         = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};       // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT"]        = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};      // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS"]        = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};      // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29)
-    // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH"]           = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};       // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27)
-    // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT"]          = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};      // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28)
-    // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES"]       = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};      // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT"]                       = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};               // 30 // API_Runtime ANALOGUE (cudaDevAttrSurfaceAlignment = 30)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS"]                      = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_DRIVER};                               // 31 // API_Runtime ANALOGUE (cudaDevAttrConcurrentKernels = 31)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_ECC_ENABLED"]                             = {"hipDeviceAttributeEccEnabled", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                     // 32 // API_Runtime ANALOGUE (cudaDevAttrEccEnabled = 32)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_BUS_ID"]                              = {"hipDeviceAttributePciBusId", CONV_DEV, API_DRIVER};                                        // 33 // API_Runtime ANALOGUE (cudaDevAttrPciBusId = 33)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID"]                           = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_DRIVER};                                     // 34 // API_Runtime ANALOGUE (cudaDevAttrPciDeviceId = 34)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_TCC_DRIVER"]                              = {"hipDeviceAttributeTccDriver", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                      // 35 // API_Runtime ANALOGUE (cudaDevAttrTccDriver = 35)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE"]                       = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                // 36 // API_Runtime ANALOGUE (cudaDevAttrMemoryClockRate = 36)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH"]                 = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_DRIVER};                                  // 37 // API_Runtime ANALOGUE (cudaDevAttrGlobalMemoryBusWidth = 37)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE"]                           = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_DRIVER};                                     // 38 // API_Runtime ANALOGUE (cudaDevAttrL2CacheSize = 38)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR"]          = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_DRIVER};                     // 39 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerMultiProcessor = 39)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT"]                      = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};               // 40 // API_Runtime ANALOGUE (cudaDevAttrAsyncEngineCount = 40)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING"]                      = {"hipDeviceAttributeUnifiedAddressing", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};              // 41 // API_Runtime ANALOGUE (cudaDevAttrUnifiedAddressing = 41)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH"]         = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};       // 42 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredWidth = 42)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS"]        = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};      // 43 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredLayers = 43)
-    // deprecated, do not use
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER"]                        = {"hipDeviceAttributeCanTex2DGather", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                 // 44 // API_Runtime ANALOGUE (no)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH"]          = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};        // 45 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherWidth = 45)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT"]         = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};       // 46 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherHeight = 46)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE"]       = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};     // 47 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidthAlt = 47)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE"]      = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};    // 48 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeightAlt = 48)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE"]       = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};     // 49 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepthAlt = 49)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID"]                           = {"hipDeviceAttributePciDomainId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                    // 50 // API_Runtime ANALOGUE (cudaDevAttrPciDomainId = 50)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT"]                 = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};          // 51 // API_Runtime ANALOGUE (cudaDevAttrTexturePitchAlignment = 51)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH"]            = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};         // 52 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapWidth = 52)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH"]    = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};  // 53 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredWidth = 53)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS"]   = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 54 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredLayers = 54)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH"]                 = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};              // 55 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DWidth = 55)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH"]                 = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};              // 56 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DWidth = 56)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT"]                = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};             // 57 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DHeight = 57)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH"]                 = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};              // 58 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DWidth = 58)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT"]                = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};             // 59 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DHeight = 59)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH"]                 = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};              // 60 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DDepth = 60)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH"]         = {"hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};       // 61 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredWidth = 61)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS"]        = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};      // 62 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredLayers = 62)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH"]         = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};       // 63 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredWidth = 63)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT"]        = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};      // 64 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredHeight = 64)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS"]        = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};      // 65 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredLayers = 65)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH"]            = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};         // 66 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapWidth = 66)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH"]    = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};  // 67 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS"]   = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 68 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH"]          = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};        // 69 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLinearWidth = 69)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH"]          = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};        // 70 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearWidth = 70)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT"]         = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};       // 71 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearHeight = 71)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH"]          = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};        // 72 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearPitch = 72)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH"]       = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};     // 73 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedWidth = 73)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT"]      = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};    // 74 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedHeight = 74)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR"]                = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_DRIVER};                          // 75 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMajor = 75)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR"]                = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_DRIVER};                          // 76 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMinor = 76)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH"]       = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};     // 77 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DMipmappedWidth = 77)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED"]             = {"hipDeviceAttributeStreamPrioritiesSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};      // 78 // API_Runtime ANALOGUE (cudaDevAttrStreamPrioritiesSupported = 78)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED"]               = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};         // 79 // API_Runtime ANALOGUE (cudaDevAttrGlobalL1CacheSupported = 79)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED"]                = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};          // 80 // API_Runtime ANALOGUE (cudaDevAttrLocalL1CacheSupported = 80)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR"]    = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_DRIVER};                // 81 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR"]        = {"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};  // 82 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerMultiprocessor = 82)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY"]                          = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                  // 83 // API_Runtime ANALOGUE (cudaDevAttrManagedMemory = 83)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD"]                         = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_DRIVER};                                 // 84 // API_Runtime ANALOGUE (cudaDevAttrIsMultiGpuBoard = 84)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID"]                = {"hipDeviceAttributeMultiGpuBoardGroupId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};           // 85 // API_Runtime ANALOGUE (cudaDevAttrMultiGpuBoardGroupID = 85)
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX"]                                     = {"hipDeviceAttributeMax", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                            // 86 // API_Runtime ANALOGUE (no)
+    cuda2hipRename["CUdevice_attribute_enum"]                                     = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER};                           // API_Runtime ANALOGUE (cudaDeviceAttr)
+    cuda2hipRename["CUdevice_attribute"]                                          = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER};                           // API_Runtime ANALOGUE (cudaDeviceAttr)
+    cuda2hipRename["CUdeviceptr"]                                                 = {"hipDeviceptr_t", CONV_TYPE, API_DRIVER};
+    // CUDA: "The types::CUarray and struct ::cudaArray * represent the same data type and may be used interchangeably by casting the two types between each other."
+    //    typedef struct cudaArray  *cudaArray_t;
+    //    typedef struct CUarray_st *CUarray;
+    cuda2hipRename["CUarray_st"]                                                  = {"hipArray", CONV_MEM, API_RUNTIME};                                       // API_Runtime ANALOGUE (cudaArray)
+    cuda2hipRename["CUarray"]                                                     = {"hipArray *", CONV_TYPE, API_DRIVER};                                     // API_Runtime ANALOGUE (cudaArray_t)
 
+    // unsupported yet by HIP
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"]                   = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                //  1 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerBlock = 1)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"]                         = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                      //  2 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimX = 2)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y"]                         = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                      //  3 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimY = 3)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z"]                         = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                      //  4 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimZ = 4)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X"]                          = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                       //  5 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimX =5)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y"]                          = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                       //  6 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimY = 6)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z"]                          = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                       //  7 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimZ - 7)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK"]             = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};           //  8 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerBlock = 8)
+    // Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK"]                 = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};           //  8
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY"]                   = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};               //  9 // API_Runtime ANALOGUE (cudaDevAttrTotalConstantMemory = 9)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_WARP_SIZE"]                               = {"hipDeviceAttributeWarpSize", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                          // 10 // API_Runtime ANALOGUE (cudaDevAttrWarpSize = 10)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_PITCH"]                               = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                          // 11 // API_Runtime ANALOGUE (cudaDevAttrMaxPitch = 11)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK"]                 = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};              // 12 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 12)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK"]                     = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};              // 12
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CLOCK_RATE"]                              = {"hipDeviceAttributeClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                         // 13 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 13)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT"]                       = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                  // 14 // API_Runtime ANALOGUE (cudaDevAttrTextureAlignment = 14)
+    // Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_GPU_OVERLAP"]                             = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                  // 15 // API_Runtime ANALOGUE (cudaDevAttrGpuOverlap = 15)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT"]                    = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};               // 16 // API_Runtime ANALOGUE (cudaDevAttrMultiProcessorCount = 16)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT"]                     = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                 // 17 // API_Runtime ANALOGUE (cudaDevAttrKernelExecTimeout = 17)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_INTEGRATED"]                              = {"hipDeviceAttributeIntegrated", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                        // 18 // API_Runtime ANALOGUE (cudaDevAttrIntegrated = 18)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY"]                     = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                  // 19 // API_Runtime ANALOGUE (cudaDevAttrCanMapHostMemory = 19)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_MODE"]                            = {"hipDeviceAttributeComputeMode", CONV_DEV, API_DRIVER};                                        // 20 // API_Runtime ANALOGUE (cudaDevAttrComputeMode = 20)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH"]                 = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                 // 21 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DWidth = 21)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH"]                 = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                 // 22 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DWidth = 22)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT"]                = {"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                // 23 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DHeight = 23)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH"]                 = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                 // 24 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidth = 24)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT"]                = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                // 25 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeight = 25)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH"]                 = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                 // 26 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepth = 26)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH"]         = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};          // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT"]        = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};         // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS"]        = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};         // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29)
+    // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH"]           = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};          // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27)
+    // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT"]          = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};         // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28)
+    // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES"]       = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};         // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT"]                       = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                  // 30 // API_Runtime ANALOGUE (cudaDevAttrSurfaceAlignment = 30)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS"]                      = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_DRIVER};                                  // 31 // API_Runtime ANALOGUE (cudaDevAttrConcurrentKernels = 31)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_ECC_ENABLED"]                             = {"hipDeviceAttributeEccEnabled", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                        // 32 // API_Runtime ANALOGUE (cudaDevAttrEccEnabled = 32)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_BUS_ID"]                              = {"hipDeviceAttributePciBusId", CONV_DEV, API_DRIVER};                                           // 33 // API_Runtime ANALOGUE (cudaDevAttrPciBusId = 33)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID"]                           = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_DRIVER};                                        // 34 // API_Runtime ANALOGUE (cudaDevAttrPciDeviceId = 34)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_TCC_DRIVER"]                              = {"hipDeviceAttributeTccDriver", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                         // 35 // API_Runtime ANALOGUE (cudaDevAttrTccDriver = 35)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE"]                       = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                   // 36 // API_Runtime ANALOGUE (cudaDevAttrMemoryClockRate = 36)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH"]                 = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_DRIVER};                                     // 37 // API_Runtime ANALOGUE (cudaDevAttrGlobalMemoryBusWidth = 37)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE"]                           = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_DRIVER};                                        // 38 // API_Runtime ANALOGUE (cudaDevAttrL2CacheSize = 38)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR"]          = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_DRIVER};                        // 39 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerMultiProcessor = 39)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT"]                      = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                  // 40 // API_Runtime ANALOGUE (cudaDevAttrAsyncEngineCount = 40)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING"]                      = {"hipDeviceAttributeUnifiedAddressing", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                 // 41 // API_Runtime ANALOGUE (cudaDevAttrUnifiedAddressing = 41)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH"]         = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};          // 42 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredWidth = 42)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS"]        = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};         // 43 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredLayers = 43)
+    // deprecated, do not use
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER"]                        = {"hipDeviceAttributeCanTex2DGather", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                    // 44 // API_Runtime ANALOGUE (no)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH"]          = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};           // 45 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherWidth = 45)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT"]         = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};          // 46 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherHeight = 46)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE"]       = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};        // 47 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidthAlt = 47)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE"]      = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};       // 48 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeightAlt = 48)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE"]       = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};        // 49 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepthAlt = 49)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID"]                           = {"hipDeviceAttributePciDomainId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                       // 50 // API_Runtime ANALOGUE (cudaDevAttrPciDomainId = 50)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT"]                 = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};             // 51 // API_Runtime ANALOGUE (cudaDevAttrTexturePitchAlignment = 51)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH"]            = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};            // 52 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapWidth = 52)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH"]    = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};     // 53 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredWidth = 53)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS"]   = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};    // 54 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredLayers = 54)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH"]                 = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                 // 55 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DWidth = 55)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH"]                 = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                 // 56 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DWidth = 56)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT"]                = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                // 57 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DHeight = 57)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH"]                 = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                 // 58 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DWidth = 58)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT"]                = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                // 59 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DHeight = 59)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH"]                 = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                 // 60 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DDepth = 60)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH"]         = {"hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};          // 61 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredWidth = 61)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS"]        = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};         // 62 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredLayers = 62)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH"]         = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};          // 63 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredWidth = 63)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT"]        = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};         // 64 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredHeight = 64)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS"]        = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};         // 65 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredLayers = 65)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH"]            = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};            // 66 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapWidth = 66)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH"]    = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};     // 67 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS"]   = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};    // 68 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH"]          = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};           // 69 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLinearWidth = 69)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH"]          = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};           // 70 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearWidth = 70)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT"]         = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};          // 71 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearHeight = 71)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH"]          = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};           // 72 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearPitch = 72)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH"]       = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};        // 73 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedWidth = 73)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT"]      = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};       // 74 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedHeight = 74)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR"]                = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_DRIVER};                             // 75 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMajor = 75)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR"]                = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_DRIVER};                             // 76 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMinor = 76)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH"]       = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};        // 77 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DMipmappedWidth = 77)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED"]             = {"hipDeviceAttributeStreamPrioritiesSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};         // 78 // API_Runtime ANALOGUE (cudaDevAttrStreamPrioritiesSupported = 78)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED"]               = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};            // 79 // API_Runtime ANALOGUE (cudaDevAttrGlobalL1CacheSupported = 79)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED"]                = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};             // 80 // API_Runtime ANALOGUE (cudaDevAttrLocalL1CacheSupported = 80)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR"]    = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_DRIVER};                   // 81 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR"]        = {"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};     // 82 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerMultiprocessor = 82)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY"]                          = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                     // 83 // API_Runtime ANALOGUE (cudaDevAttrManagedMemory = 83)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD"]                         = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_DRIVER};                                    // 84 // API_Runtime ANALOGUE (cudaDevAttrIsMultiGpuBoard = 84)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID"]                = {"hipDeviceAttributeMultiGpuBoardGroupId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};              // 85 // API_Runtime ANALOGUE (cudaDevAttrMultiGpuBoardGroupID = 85)
     // unsupported yet by HIP [CUDA 8.0.44]
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED"]            = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO"]   = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS"]                  = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS"]               = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED"]            = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED"]            = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};         // 86 // API_Runtime ANALOGUE (cudaDevAttrHostNativeAtomicSupported = 86)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO"]   = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};  // 87 // API_Runtime ANALOGUE (cudaDevAttrSingleToDoublePrecisionPerfRatio = 87)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS"]                  = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};              // 88 // API_Runtime ANALOGUE (cudaDevAttrPageableMemoryAccess = 88)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS"]               = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};           // 89 // API_Runtime ANALOGUE (cudaDevAttrConcurrentManagedAccess = 89)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED"]            = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};        // 90 // API_Runtime ANALOGUE (cudaDevAttrComputePreemptionSupported = 90)
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 91 // API_Runtime ANALOGUE (cudaDevAttrCanUseHostPointerForRegisteredMem = 91)
+
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX"]                                     = {"hipDeviceAttributeMax", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                               // 92 // API_Runtime ANALOGUE (no)
 
     cuda2hipRename["CUdevprop_st"]                                                = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER};
     cuda2hipRename["CUdevprop"]                                                   = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER};
 
     // TODO: Analogues enum is needed in HIP. Couldn't map enum to struct hipPointerAttribute_t.
     // TODO: Do for Pointer Attributes the same as for Device Attributes.
-    // cuda2hipRename["CUpointer_attribute_enum"]                                 = {"hipPointerAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};                                   // API_Runtime ANALOGUE (no)
-    // cuda2hipRename["CUpointer_attribute"]                                      = {"hipPointerAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};                                   // API_Runtime ANALOGUE (no)
+    // cuda2hipRename["CUpointer_attribute_enum"]                                 = {"hipPointerAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};                                 // API_Runtime ANALOGUE (no)
+    // cuda2hipRename["CUpointer_attribute"]                                      = {"hipPointerAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};                                 // API_Runtime ANALOGUE (no)
     cuda2hipRename["CU_POINTER_ATTRIBUTE_CONTEXT"]                                = {"hipPointerAttributeContext", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                        // 1 // API_Runtime ANALOGUE (no)
     cuda2hipRename["CU_POINTER_ATTRIBUTE_MEMORY_TYPE"]                            = {"hipPointerAttributeMemoryType", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                     // 2 // API_Runtime ANALOGUE (no)
     cuda2hipRename["CU_POINTER_ATTRIBUTE_DEVICE_POINTER"]                         = {"hipPointerAttributeDevicePointer", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                  // 3 // API_Runtime ANALOGUE (no)
@@ -565,13 +626,17 @@ struct cuda2hipMap {
     cuda2hipRename["CU_POINTER_ATTRIBUTE_BUFFER_ID"]                              = {"hipPointerAttributeBufferId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                       // 7 // API_Runtime ANALOGUE (no)
     cuda2hipRename["CU_POINTER_ATTRIBUTE_IS_MANAGED"]                             = {"hipPointerAttributeIsManaged", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                      // 8 // API_Runtime ANALOGUE (no)
 
-
     // pointer to CUfunc_st
     cuda2hipRename["CUfunction"]                                                  = {"hipFunction_t", CONV_TYPE, API_DRIVER};
-    // TODO: in HIP ihipModuleSymbol_t should be declared in hip_runtime_api.h, not in hcc_detail/hip_runtime_api.h, as it's analogue CUfunc_st is declared also in cuda.h
-    // ToDO: examples are needed with CUfunc_st
+    // TODO: move "typedef struct ihipModuleSymbol_t *hipFunction_t;" from hcc_details to HIP
+    //             typedef struct CUfunc_st          *CUfunction;
     // cuda2hipRename["CUfunc_st"]                                                = {"ihipModuleSymbol_t", CONV_TYPE, API_DRIVER};
 
+    // typedef struct CUgraphicsResource_st *CUgraphicsResource;
+    cuda2hipRename["CUgraphicsResource"]                                          = {"hipGraphicsResource_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
+    // typedef struct CUmipmappedArray_st *CUmipmappedArray;
+    cuda2hipRename["CUmipmappedArray"]                                            = {"hipMipmappedArray_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
+
     // unsupported yet by HIP
     cuda2hipRename["CUfunction_attribute"]                         = {"hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["CUfunction_attribute_enum"]                    = {"hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
@@ -607,8 +672,6 @@ struct cuda2hipMap {
     cuda2hipRename["CU_OCCUPANCY_DEFAULT"]                         = {"hipOccupancyDefault", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};                      // 0x00 // API_Runtime ANALOGUE (cudaOccupancyDefault = 0x0)
     cuda2hipRename["CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE"]        = {"hipOccupancyDisableCachingOverride", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};       // 0x01 // API_Runtime ANALOGUE (cudaOccupancyDisableCachingOverride = 0x1)
 
-
-
     cuda2hipRename["CUfunc_cache_enum"]                            = {"hipFuncCache", CONV_TYPE, API_DRIVER};                                                      // API_Runtime ANALOGUE (cudaFuncCache)
     cuda2hipRename["CUfunc_cache"]                                 = {"hipFuncCache", CONV_TYPE, API_DRIVER};                                                      // API_Runtime ANALOGUE (cudaFuncCache)
     cuda2hipRename["CU_FUNC_CACHE_PREFER_NONE"]                    = {"hipFuncCachePreferNone", CONV_CACHE, API_DRIVER};                                   // 0x00 // API_Runtime ANALOGUE (cudaFilterModePoint = 0)
@@ -637,24 +700,28 @@ struct cuda2hipMap {
     cuda2hipRename["CU_PREFER_PTX"]                                = {"hipJitFallbackPreferPtx", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["CU_PREFER_BINARY"]                             = {"hipJitFallbackPreferBinary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
     // enum CUjit_option/CUjit_option_enum
-    cuda2hipRename["CUjit_option"]                                 = {"hipJitOption", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};  // API_Runtime ANALOGUE (no)
-    cuda2hipRename["CUjit_option_enum"]                            = {"hipJitOption", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_JIT_MAX_REGISTERS"]                         = {"hipJitOptionMaxRegisters", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_JIT_THREADS_PER_BLOCK"]                     = {"hipJitOptionThreadsPerBlock", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_JIT_WALL_TIME"]                             = {"hipJitOptionWallTime", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_JIT_INFO_LOG_BUFFER"]                       = {"hipJitOptionInfoLogBuffer", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES"]            = {"hipJitOptionInfoLogBufferSizeBytes", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER"]                      = {"hipJitOptionErrorLogBuffer", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES"]           = {"hipJitOptionErrorLogBufferSizeBytes", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_JIT_OPTIMIZATION_LEVEL"]                    = {"hipJitOptionOptimizationLevel", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_JIT_TARGET_FROM_CUCONTEXT"]                 = {"hipJitOptionTargetFromContext", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_JIT_TARGET"]                                = {"hipJitOptionTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_JIT_FALLBACK_STRATEGY"]                     = {"hipJitOptionFallbackStrategy", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_JIT_GENERATE_DEBUG_INFO"]                   = {"hipJitOptionGenerateDebugInfo", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_JIT_LOG_VERBOSE"]                           = {"hipJitOptionLogVerbose", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_JIT_GENERATE_LINE_INFO"]                    = {"hipJitOptionLogVerbose", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_JIT_CACHE_MODE"]                            = {"hipJitOptionCacheMode", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_JIT_NUM_OPTIONS"]                           = {"hipJitOptionNumOptions", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CUjit_option"]                                 = {"hipJitOption", CONV_JIT, API_DRIVER};  // API_Runtime ANALOGUE (no)
+    cuda2hipRename["CUjit_option_enum"]                            = {"hipJitOption", CONV_JIT, API_DRIVER};
+    cuda2hipRename["CU_JIT_MAX_REGISTERS"]                         = {"hipJitOptionMaxRegisters", CONV_JIT, API_DRIVER};
+    cuda2hipRename["CU_JIT_THREADS_PER_BLOCK"]                     = {"hipJitOptionThreadsPerBlock", CONV_JIT, API_DRIVER};
+    cuda2hipRename["CU_JIT_WALL_TIME"]                             = {"hipJitOptionWallTime", CONV_JIT, API_DRIVER};
+    cuda2hipRename["CU_JIT_INFO_LOG_BUFFER"]                       = {"hipJitOptionInfoLogBuffer", CONV_JIT, API_DRIVER};
+    cuda2hipRename["CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES"]            = {"hipJitOptionInfoLogBufferSizeBytes", CONV_JIT, API_DRIVER};
+    cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER"]                      = {"hipJitOptionErrorLogBuffer", CONV_JIT, API_DRIVER};
+    cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES"]           = {"hipJitOptionErrorLogBufferSizeBytes", CONV_JIT, API_DRIVER};
+    cuda2hipRename["CU_JIT_OPTIMIZATION_LEVEL"]                    = {"hipJitOptionOptimizationLevel", CONV_JIT, API_DRIVER};
+    cuda2hipRename["CU_JIT_TARGET_FROM_CUCONTEXT"]                 = {"hipJitOptionTargetFromContext", CONV_JIT, API_DRIVER};
+    cuda2hipRename["CU_JIT_TARGET"]                                = {"hipJitOptionTarget", CONV_JIT, API_DRIVER};
+    cuda2hipRename["CU_JIT_FALLBACK_STRATEGY"]                     = {"hipJitOptionFallbackStrategy", CONV_JIT, API_DRIVER};
+    cuda2hipRename["CU_JIT_GENERATE_DEBUG_INFO"]                   = {"hipJitOptionGenerateDebugInfo", CONV_JIT, API_DRIVER};
+    cuda2hipRename["CU_JIT_LOG_VERBOSE"]                           = {"hipJitOptionLogVerbose", CONV_JIT, API_DRIVER};
+    cuda2hipRename["CU_JIT_GENERATE_LINE_INFO"]                    = {"hipJitOptionGenerateLineInfo", CONV_JIT, API_DRIVER};
+    cuda2hipRename["CU_JIT_CACHE_MODE"]                            = {"hipJitOptionCacheMode", CONV_JIT, API_DRIVER};
+    // unsupported yet by HIP [CUDA 8.0.44]
+    cuda2hipRename["CU_JIT_NEW_SM3X_OPT"]                          = {"hipJitOptionSm3xOpt", CONV_JIT, API_DRIVER};
+    cuda2hipRename["CU_JIT_FAST_COMPILE"]                          = {"hipJitOptionFastCompile", CONV_JIT, API_DRIVER};
+
+    cuda2hipRename["CU_JIT_NUM_OPTIONS"]                           = {"hipJitOptionNumOptions", CONV_JIT, API_DRIVER};
     // enum CUjit_target/CUjit_target_enum
     cuda2hipRename["CUjit_target"]                                 = {"hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};  // API_Runtime ANALOGUE (no)
     cuda2hipRename["CUjit_target_enum"]                            = {"hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
@@ -670,6 +737,11 @@ struct cuda2hipMap {
     cuda2hipRename["CU_TARGET_COMPUTE_37"]                         = {"hipJitTargetCompute37", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["CU_TARGET_COMPUTE_50"]                         = {"hipJitTargetCompute50", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["CU_TARGET_COMPUTE_52"]                         = {"hipJitTargetCompute52", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
+    // unsupported yet by HIP [CUDA 8.0.44]
+    cuda2hipRename["CU_TARGET_COMPUTE_53"]                         = {"hipJitTargetCompute53", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_TARGET_COMPUTE_60"]                         = {"hipJitTargetCompute60", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_TARGET_COMPUTE_61"]                         = {"hipJitTargetCompute61", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_TARGET_COMPUTE_62"]                         = {"hipJitTargetCompute62", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
     // enum CUjitInputType/CUjitInputType_enum
     cuda2hipRename["CUjitInputType"]                               = {"hipJitInputType", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};  // API_Runtime ANALOGUE (no)
     cuda2hipRename["CUjitInputType_enum"]                          = {"hipJitInputType", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED};
@@ -752,27 +824,64 @@ struct cuda2hipMap {
     cuda2hipRename["CU_RES_VIEW_FORMAT_SIGNED_BC6H"]               = {"hipResViewFormatSignedBlockCompressed6H", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED};     // 0x21 // API_Runtime ANALOGUE (cudaResViewFormatSignedBlockCompressed6H = 0x21)
     cuda2hipRename["CU_RES_VIEW_FORMAT_UNSIGNED_BC7"]              = {"hipResViewFormatUnsignedBlockCompressed7", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED};    // 0x22 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedBlockCompressed7 = 0x22)
 
-
-
-    cuda2hipRename["CUsharedconfig_enum"]                          = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER};
     cuda2hipRename["CUsharedconfig"]                               = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER};
+    cuda2hipRename["CUsharedconfig_enum"]                          = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER};
     cuda2hipRename["CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE"]       = {"hipSharedMemBankSizeDefault", CONV_DEV, API_DRIVER};
     cuda2hipRename["CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE"]     = {"hipSharedMemBankSizeFourByte", CONV_DEV, API_DRIVER};
     cuda2hipRename["CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE"]    = {"hipSharedMemBankSizeEightByte", CONV_DEV, API_DRIVER};
 
     cuda2hipRename["CUcontext"]                                    = {"hipCtx_t", CONV_TYPE, API_DRIVER};
-    // TODO:
-    // cuda2hipRename["CUctx_st"]                                  = {"XXXX", CONV_TYPE, API_DRIVER};
+    // TODO: move "typedef struct ihipCtx_t *hipCtx_t;" from hcc_details to HIP
+    //             typedef struct CUctx_st  *CUcontext;
+    // cuda2hipRename["CUctx_st"]                                  = {"ihipCtx_t", CONV_TYPE, API_DRIVER};
     cuda2hipRename["CUmodule"]                                     = {"hipModule_t", CONV_TYPE, API_DRIVER};
-    // TODO:
-    // cuda2hipRename["CUmod_st"]                                  = {"XXXX", CONV_TYPE, API_DRIVER};
+    // TODO: move "typedef struct ihipModule_t *hipModule_t;" from hcc_details to HIP
+    //             typedef struct CUmod_st     *CUmodule;
+    // cuda2hipRename["CUmod_st"]                                  = {"ihipModule_t", CONV_TYPE, API_DRIVER};
     cuda2hipRename["CUstream"]                                     = {"hipStream_t", CONV_TYPE, API_DRIVER};
-    // TODO:
-    // cuda2hipRename["CUstream_st"]                               = {"XXXX", CONV_TYPE, API_DRIVER};
-    // Stream Flags
+    // TODO: move "typedef struct ihipStream_t *hipStream_t;" from hcc_details to HIP
+    //             typedef struct CUstream_st *CUstream;
+    // cuda2hipRename["CUstream_st"]                               = {"ihipStream_t", CONV_TYPE, API_DRIVER};
+
+    // typedef void (*hipStreamCallback_t)      (hipStream_t stream, hipError_t status, void* userData);
+    // typedef void (CUDA_CB *CUstreamCallback) (CUstream hStream, CUresult status, void* userData)
+    cuda2hipRename["CUstreamCallback"]                             = {"hipStreamCallback_t", CONV_TYPE, API_DRIVER};
+
+    cuda2hipRename["CUsurfObject"]                                 = {"hipSurfaceObject", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
+    // typedef struct CUsurfref_st *CUsurfref;
+    cuda2hipRename["CUsurfref"]                                    = {"hipSurfaceReference_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
+    // cuda2hipRename["CUsurfref_st"]                              = {"ihipSurfaceReference_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CUtexObject"]                                  = {"hipTextureObject", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
+    // typedef struct CUtexref_st *CUtexref;
+    cuda2hipRename["CUtexref"]                                     = {"hipTextureReference_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
+    // cuda2hipRename["CUtexref_st"]                               = {"ihipTextureReference_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED};
+
+    // Stream Flags enum
+    cuda2hipRename["CUstream_flags"]                               = {"hipStreamFlags", CONV_STREAM, API_DRIVER};
+    // cuda2hipRename["CUstream_flags_enum"]                       = {"hipStreamFlags", CONV_STREAM, API_DRIVER};
     cuda2hipRename["CU_STREAM_DEFAULT"]                            = {"hipStreamDefault", CONV_STREAM, API_DRIVER};
     cuda2hipRename["CU_STREAM_NON_BLOCKING"]                       = {"hipStreamNonBlocking", CONV_STREAM, API_DRIVER};
 
+    // unsupported yet by HIP [CUDA 8.0.44]
+    // Flags for ::cuStreamWaitValue32
+    cuda2hipRename["CUstreamWaitValue_flags"]                      = {"hipStreamWaitValueFlags", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
+    // cuda2hipRename["CUstreamWaitValue_flags_enum"]              = {"hipStreamWaitValueFlags", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_STREAM_WAIT_VALUE_GEQ"]                     = {"hipStreamWaitValueGeq", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};                // 0x0
+    cuda2hipRename["CU_STREAM_WAIT_VALUE_EQ"]                      = {"hipStreamWaitValueEq", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};                 // 0x1
+    cuda2hipRename["CU_STREAM_WAIT_VALUE_AND"]                     = {"hipStreamWaitValueAnd", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};                // 0x2
+    cuda2hipRename["CU_STREAM_WAIT_VALUE_FLUSH"]                   = {"hipStreamWaitValueFlush", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};              // 1<<30
+    // Flags for ::cuStreamWriteValue32
+    cuda2hipRename["CUstreamWriteValue_flags"]                     = {"hipStreamWriteValueFlags", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
+    // cuda2hipRename["CUstreamWriteValue_flags"]                  = {"hipStreamWriteValueFlags", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_STREAM_WRITE_VALUE_DEFAULT"]                = {"hipStreamWriteValueDefault", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};           // 0x0
+    cuda2hipRename["CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER"]      = {"hipStreamWriteValueNoMemoryBarrier", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};   // 0x1
+    // Flags for ::cuStreamBatchMemOp
+    cuda2hipRename["CUstreamBatchMemOpType"]                       = {"hipStreamBatchMemOpType", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
+    // cuda2hipRename["CUstreamBatchMemOpType_enum"]               = {"hipStreamBatchMemOpType", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_STREAM_MEM_OP_WAIT_VALUE_32"]               = {"hipStreamBatchMemOpWaitValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};       // 1
+    cuda2hipRename["CU_STREAM_MEM_OP_WRITE_VALUE_32"]              = {"hipStreamBatchMemOpWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};      // 2
+    cuda2hipRename["CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES"]         = {"hipStreamBatchMemOpFlushRemoteWrites", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 3
+
     // Init
     cuda2hipRename["cuInit"]                                       = {"hipInit", CONV_DRIVER, API_DRIVER};
 
@@ -800,6 +909,13 @@ struct cuda2hipMap {
     cuda2hipRename["cuCtxSetLimit"]                             = {"hipCtxSetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["cuCtxGetLimit"]                             = {"hipCtxGetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED};
 
+    // Primary Context Management
+    cuda2hipRename["cuDevicePrimaryCtxGetState"]                = {"hipDevicePrimaryCtxGetState", CONV_CONTEXT, API_DRIVER};
+    cuda2hipRename["cuDevicePrimaryCtxRelease"]                 = {"hipDevicePrimaryCtxRelease", CONV_CONTEXT, API_DRIVER};
+    cuda2hipRename["cuDevicePrimaryCtxRetain"]                  = {"hipDevicePrimaryCtxRetain", CONV_CONTEXT, API_DRIVER};
+    cuda2hipRename["cuDevicePrimaryCtxReset"]                   = {"hipDevicePrimaryCtxReset", CONV_CONTEXT, API_DRIVER};
+    cuda2hipRename["cuDevicePrimaryCtxSetFlags"]                = {"hipDevicePrimaryCtxSetFlags", CONV_CONTEXT, API_DRIVER};
+
     // Device
     cuda2hipRename["cuDeviceGet"]                               = {"hipGetDevice", CONV_DEV, API_DRIVER};
     cuda2hipRename["cuDeviceGetName"]                           = {"hipDeviceGetName", CONV_DEV, API_DRIVER};
@@ -813,6 +929,16 @@ struct cuda2hipMap {
     cuda2hipRename["cuDeviceComputeCapability"]                 = {"hipDeviceComputeCapability", CONV_DEV, API_DRIVER};
     cuda2hipRename["cuDeviceCanAccessPeer"]                     = {"hipDeviceCanAccessPeer", CONV_DEV, API_DRIVER};
 
+    // unsupported yet by HIP [CUDA 8.0.44]
+    // P2P Attributes
+    cuda2hipRename["CUdevice_P2PAttribute"]                            = {"hipDeviceP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                              // API_Runtime ANALOGUE (cudaDeviceP2PAttr)
+    // cuda2hipRename["CUdevice_P2PAttribute_enum"]                    = {"hipDeviceP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK"]         = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};       // 0x01 // API_Runtime ANALOGUE (cudaDevP2PAttrPerformanceRank = 0x01)
+    cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED"]         = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};       // 0x02 // API_Runtime ANALOGUE (cudaDevP2PAttrAccessSupported = 0x02)
+    cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED"]  = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (cudaDevP2PAttrNativeAtomicSupported = 0x03)
+
+    cuda2hipRename["cuDeviceGetP2PAttribute"]                          = {"hipDeviceGetP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};                           // API_Runtime ANALOGUE (cudaDeviceGetP2PAttribute)
+
     // Events
     // pointer to CUevent_st
     cuda2hipRename["CUevent"]                                   = {"hipEvent_t", CONV_TYPE, API_DRIVER};
@@ -840,7 +966,7 @@ struct cuda2hipMap {
     cuda2hipRename["cuModuleLoad"]                              = {"hipModuleLoad", CONV_MODULE, API_DRIVER};
     cuda2hipRename["cuModuleLoadData"]                          = {"hipModuleLoadData", CONV_MODULE, API_DRIVER};
     // unsupported yet by HIP
-    cuda2hipRename["cuModuleLoadDataEx"]                        = {"hipModuleLoadDataEx", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["cuModuleLoadDataEx"]                        = {"hipModuleLoadDataEx", CONV_MODULE, API_DRIVER};
     cuda2hipRename["cuModuleLoadFatBinary"]                     = {"hipModuleLoadFatBinary", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED};
 
     cuda2hipRename["cuModuleUnload"]                            = {"hipModuleUnload", CONV_MODULE, API_DRIVER};
@@ -849,6 +975,9 @@ struct cuda2hipMap {
     // Streams
     // unsupported yet by HIP
     cuda2hipRename["cuStreamAddCallback"]                       = {"hipStreamAddCallback", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["cuStreamWaitValue32"]                       = {"hipStreamWaitValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE
+    cuda2hipRename["cuStreamWriteValue32"]                      = {"hipStreamWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE
+    cuda2hipRename["cuStreamBatchMemOp"]                        = {"hipStreamBatchMemOp", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE
 
     cuda2hipRename["cuStreamCreate"]                            = {"hipStreamCreate", CONV_STREAM, API_DRIVER};
     cuda2hipRename["cuStreamDestroy_v2"]                        = {"hipStreamDestroy", CONV_STREAM, API_DRIVER};
@@ -890,6 +1019,11 @@ struct cuda2hipMap {
     cuda2hipRename["cuMemHostRegister_v2"]                      = {"hipHostRegister", CONV_MEM, API_DRIVER};
     cuda2hipRename["cuMemHostUnregister"]                       = {"hipHostUnregister", CONV_MEM, API_DRIVER};
 
+    cuda2hipRename["cuMemPrefetchAsync"]                        = {"hipMemPrefetchAsync___", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};   // [CUDA 8.0.44] // no API_Runtime ANALOGUE (cudaMemPrefetchAsync has different signature)
+    cuda2hipRename["cuMemAdvise"]                               = {"hipMemAdvise", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};             // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemAdvise)
+    cuda2hipRename["cuMemRangeGetAttribute"]                    = {"hipMemRangeGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};  // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemRangeGetAttribute)
+    cuda2hipRename["cuMemRangeGetAttributes"]                   = {"hipMemRangeGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemRangeGetAttributes)
+
     // Texture Reference Mngmnt
     // Texture reference filtering modes
     cuda2hipRename["CUfilter_mode"]                             = {"hipTextureFilterMode", CONV_TEX, API_DRIVER};                         // API_Runtime ANALOGUE (cudaTextureFilterMode)
@@ -898,6 +1032,9 @@ struct cuda2hipMap {
     cuda2hipRename["CU_TR_FILTER_MODE_POINT"]                   = {"hipFilterModePoint", CONV_TEX, API_DRIVER};                      // 0 // API_Runtime ANALOGUE (cudaFilterModePoint = 0)
     cuda2hipRename["CU_TR_FILTER_MODE_LINEAR"]                  = {"hipFilterModeLinear", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED};    // 1 // API_Runtime ANALOGUE (cudaFilterModeLinear = 1)
 
+    cuda2hipRename["cuTexRefSetBorderColor"]                    = {"hipTexRefSetBorderColor", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE
+    cuda2hipRename["cuTexRefGetBorderColor"]                    = {"hipTexRefGetBorderColor", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE
+
     // Profiler
     // unsupported yet by HIP
     cuda2hipRename["cuProfilerInitialize"]                      = {"hipProfilerInitialize", CONV_OTHER, API_DRIVER, HIP_UNSUPPORTED};
@@ -972,10 +1109,10 @@ struct cuda2hipMap {
     cuda2hipRename["cudaMemcpyToSymbolAsync"]     = {"hipMemcpyToSymbolAsync", CONV_MEM, API_RUNTIME};
     cuda2hipRename["cudaMemcpyAsync"]             = {"hipMemcpyAsync", CONV_MEM, API_RUNTIME};
     cuda2hipRename["cudaMemcpy2D"]                = {"hipMemcpy2D", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemcpy2DAsync"]           = {"hipMemcpy2DAsync", CONV_MEM, API_RUNTIME};
     cuda2hipRename["cudaMemcpy2DToArray"]         = {"hipMemcpy2DToArray", CONV_MEM, API_RUNTIME};
     // unsupported yet by HIP
     cuda2hipRename["cudaMemcpy2DArrayToArray"]    = {"hipMemcpy2DArrayToArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
-    cuda2hipRename["cudaMemcpy2DAsync"]           = {"hipMemcpy2DAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
     cuda2hipRename["cudaMemcpy2DFromArray"]       = {"hipMemcpy2DFromArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
     cuda2hipRename["cudaMemcpy2DFromArrayAsync"]  = {"hipMemcpy2DFromArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
     cuda2hipRename["cudaMemcpy2DToArrayAsync"]    = {"hipMemcpy2DToArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
@@ -985,7 +1122,27 @@ struct cuda2hipMap {
     cuda2hipRename["cudaMemcpy3DPeerAsync"]       = {"hipMemcpy3DPeerAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
     cuda2hipRename["cudaMemcpyArrayToArray"]      = {"hipMemcpyArrayToArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
     cuda2hipRename["cudaMemcpyFromArrayAsync"]    = {"hipMemcpyFromArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
-    cuda2hipRename["cudaMemcpyFromSymbolAsync"]   = {"hipMemcpyFromSymbolAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaMemcpyFromSymbol"]        = {"hipMemcpyFromSymbol", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemcpyFromSymbolAsync"]   = {"hipMemcpyFromSymbolAsync", CONV_MEM, API_RUNTIME};
+    cuda2hipRename["cudaMemAdvise"]               = {"hipMemAdvise", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44]
+    cuda2hipRename["cudaMemRangeGetAttribute"]    = {"hipMemRangeGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44]
+    cuda2hipRename["cudaMemRangeGetAttributes"]   = {"hipMemRangeGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44]
+
+    // unsupported yet by HIP [CUDA 8.0.44]
+    // Memory advise values
+    cuda2hipRename["cudaMemoryAdvise"]                                            = {"hipMemAdvise", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};                                  // API_Driver ANALOGUE (CUmem_advise)
+    cuda2hipRename["cudaMemAdviseSetReadMostly"]                                  = {"hipMemAdviseSetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};                // 1 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_READ_MOSTLY = 1)
+    cuda2hipRename["cudaMemAdviseUnsetReadMostly"]                                = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};              // 2 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2)
+    cuda2hipRename["cudaMemAdviseSetPreferredLocation"]                           = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};         // 3 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3)
+    cuda2hipRename["cudaMemAdviseUnsetPreferredLocation"]                         = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};       // 4 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4)
+    cuda2hipRename["cudaMemAdviseSetAccessedBy"]                                  = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};                // 5 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_ACCESSED_BY = 5)
+    cuda2hipRename["cudaMemAdviseUnsetAccessedBy"]                                = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};              // 6 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6)
+    // CUmem_range_attribute
+    cuda2hipRename["cudaMemRangeAttribute"]                                       = {"hipMemRangeAttribute", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};                          // API_Driver ANALOGUE (CUmem_range_attribute)
+    cuda2hipRename["cudaMemRangeAttributeReadMostly"]                             = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};           // 1 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1)
+    cuda2hipRename["cudaMemRangeAttributePreferredLocation"]                      = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};    // 2 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2)
+    cuda2hipRename["cudaMemRangeAttributeAccessedBy"]                             = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};           // 3 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3)
+    cuda2hipRename["cudaMemRangeAttributeLastPrefetchLocation"]                   = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4)
 
     // memcpy kind
     cuda2hipRename["cudaMemcpyKind"]              = {"hipMemcpyKind", CONV_MEM, API_RUNTIME};
@@ -1012,6 +1169,7 @@ struct cuda2hipMap {
     cuda2hipRename["cudaGetMipmappedArrayLevel"]  = {"hipGetMipmappedArrayLevel", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
     cuda2hipRename["cudaGetSymbolAddress"]        = {"hipGetSymbolAddress", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
     cuda2hipRename["cudaGetSymbolSize"]           = {"hipGetSymbolSize", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaMemPrefetchAsync"]        = {"hipMemPrefetchAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Driver ANALOGUE (cuMemPrefetchAsync)
 
     // malloc
     cuda2hipRename["cudaMalloc"]                  = {"hipMalloc", CONV_MEM, API_RUNTIME};
@@ -1022,7 +1180,7 @@ struct cuda2hipMap {
     cuda2hipRename["cudaMalloc3DArray"]           = {"hipMalloc3DArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
     cuda2hipRename["cudaMallocManaged"]           = {"hipMallocManaged", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
     cuda2hipRename["cudaMallocMipmappedArray"]    = {"hipMallocMipmappedArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
-    cuda2hipRename["cudaMallocPitch"]             = {"hipMallocPitch", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaMallocPitch"]             = {"hipMallocPitch", CONV_MEM, API_RUNTIME};
 
     cuda2hipRename["cudaFree"]                    = {"hipFree", CONV_MEM, API_RUNTIME};
     cuda2hipRename["cudaFreeHost"]                = {"hipHostFree", CONV_MEM, API_RUNTIME};
@@ -1229,12 +1387,12 @@ struct cuda2hipMap {
     cuda2hipRename["cudaDevAttrMultiGpuBoardGroupID"]              = {"hipDeviceAttributeMultiGpuBoardGroupID", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};              // 85 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85)
 
     // unsupported yet by HIP [CUDA 8.0.44]
-    cuda2hipRename["cudaDevAttrHostNativeAtomicSupported"]         = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
-    cuda2hipRename["cudaDevAttrSingleToDoublePrecisionPerfRatio"]  = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
-    cuda2hipRename["cudaDevAttrPageableMemoryAccess"]              = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
-    cuda2hipRename["cudaDevAttrConcurrentManagedAccess"]           = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
-    cuda2hipRename["cudaDevAttrComputePreemptionSupported"]        = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
-    cuda2hipRename["cudaDevAttrCanUseHostPointerForRegisteredMem"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrHostNativeAtomicSupported"]         = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};         // 86 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86)
+    cuda2hipRename["cudaDevAttrSingleToDoublePrecisionPerfRatio"]  = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};  // 87 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87)
+    cuda2hipRename["cudaDevAttrPageableMemoryAccess"]              = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};              // 88 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88)
+    cuda2hipRename["cudaDevAttrConcurrentManagedAccess"]           = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};           // 89 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89)
+    cuda2hipRename["cudaDevAttrComputePreemptionSupported"]        = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};        // 90 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90)
+    cuda2hipRename["cudaDevAttrCanUseHostPointerForRegisteredMem"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 91 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91)
 
     // Pointer Attributes
     // struct cudaPointerAttributes
@@ -1252,12 +1410,21 @@ struct cuda2hipMap {
     cuda2hipRename["cudaDeviceGetStreamPriorityRange"] = {"hipDeviceGetStreamPriorityRange", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
     cuda2hipRename["cudaSetValidDevices"]              = {"hipSetValidDevices", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
 
+    // unsupported yet by HIP [CUDA 8.0.44]
+    // P2P Attributes
+    cuda2hipRename["cudaDeviceP2PAttr"]                    = {"hipDeviceP2PAttribute", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};                              // API_DRIVER ANALOGUE (CUdevice_P2PAttribute)
+    cuda2hipRename["cudaDevP2PAttrPerformanceRank"]        = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};       // 0x01 // API_DRIVER ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01)
+    cuda2hipRename["cudaDevP2PAttrAccessSupported"]        = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};       // 0x02 // API_DRIVER ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02)
+    cuda2hipRename["cudaDevP2PAttrNativeAtomicSupported"]  = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x03 // API_DRIVER ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03)
+    // [CUDA 8.0.44]
+    cuda2hipRename["cudaDeviceGetP2PAttribute"]            = {"hipDeviceGetP2PAttribute", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};                              // API_DRIVER ANALOGUE (cuDeviceGetP2PAttribute)
+
     // Compute mode
     cuda2hipRename["cudaComputeMode"]                  = {"hipComputeMode", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};                      // API_DRIVER ANALOGUE (CUcomputemode)
-    cuda2hipRename["cudaComputeModeDefault"]           = {"hipComputeModeDefault", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};          // 0
-    cuda2hipRename["cudaComputeModeExclusive"]         = {"hipComputeModeExclusive", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};        // 1
-    cuda2hipRename["cudaComputeModeProhibited"]        = {"hipComputeModeProhibited", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};       // 2
-    cuda2hipRename["cudaComputeModeExclusiveProcess"]  = {"hipComputeModeExclusiveProcess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 3
+    cuda2hipRename["cudaComputeModeDefault"]           = {"hipComputeModeDefault", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};          // 0 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_DEFAULT = 0)
+    cuda2hipRename["cudaComputeModeExclusive"]         = {"hipComputeModeExclusive", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};        // 1 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_EXCLUSIVE = 1)
+    cuda2hipRename["cudaComputeModeProhibited"]        = {"hipComputeModeProhibited", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};       // 2 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_PROHIBITED = 2)
+    cuda2hipRename["cudaComputeModeExclusiveProcess"]  = {"hipComputeModeExclusiveProcess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3)
 
     // Device Flags
     // unsupported yet by HIP
@@ -1415,7 +1582,7 @@ struct cuda2hipMap {
     cuda2hipRename["cudaResourceTypeLinear"]                      = {"hipResourceTypeLinear", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED};                 // 0x02 // API_Driver ANALOGUE (CU_RESOURCE_TYPE_LINEAR = 0x02)
     cuda2hipRename["cudaResourceTypePitch2D"]                     = {"hipResourceTypePitch2D", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED};                // 0x03 // API_Driver ANALOGUE (CU_RESOURCE_TYPE_PITCH2D = 0x03)
 
-
+    // enum cudaResourceViewFormat
     cuda2hipRename["cudaResourceViewFormat"]                      = {"hipResourceViewFormat", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED};                              // API_Driver ANALOGUE (CUresourceViewFormat)
     cuda2hipRename["cudaResViewFormatNone"]                       = {"hipResViewFormatNone", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED};                       // 0x00 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_NONE = 0x00)
     cuda2hipRename["cudaResViewFormatUnsignedChar1"]              = {"hipResViewFormatUnsignedChar1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED};              // 0x01 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01)
@@ -1543,7 +1710,7 @@ struct cuda2hipMap {
     // Blas types
     cuda2hipRename["cublasHandle_t"]                 = {"hipblasHandle_t", CONV_TYPE, API_BLAS};
     // TODO: dereferencing: typedef struct cublasContext *cublasHandle_t;
-    cuda2hipRename["cublasContext"]                  = {"hipblasHandle_t", CONV_TYPE, API_BLAS};
+    // cuda2hipRename["cublasContext"]                  = {"hipblasHandle_t", CONV_TYPE, API_BLAS};
     // Blas management functions
     // unsupported yet by hipblas/hcblas
     cuda2hipRename["cublasInit"]                     = {"hipblasInit", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
@@ -1651,10 +1818,9 @@ struct cuda2hipMap {
 
     // AXPY
     cuda2hipRename["cublasSaxpy"]                    = {"hipblasSaxpy", CONV_MATH_FUNC, API_BLAS};
-    // there is no such a function in CUDA
     cuda2hipRename["cublasSaxpyBatched"]             = {"hipblasSaxpyBatched", CONV_MATH_FUNC, API_BLAS};
     // unsupported yet by hipblas/hcblas
-    cuda2hipRename["cublasDaxpy"]                    = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
+    cuda2hipRename["cublasDaxpy"]                    = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS};
     cuda2hipRename["cublasCaxpy"]                    = {"hipblasCaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
     cuda2hipRename["cublasZaxpy"]                    = {"hipblasZaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
 
@@ -1731,8 +1897,8 @@ struct cuda2hipMap {
     cuda2hipRename["cublasSgemv"]                    = {"hipblasSgemv", CONV_MATH_FUNC, API_BLAS};
     // there is no such a function in CUDA
     cuda2hipRename["cublasSgemvBatched"]             = {"hipblasSgemvBatched", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDgemv"]                    = {"hipblasDgemv", CONV_MATH_FUNC, API_BLAS};
     // unsupported yet by hipblas/hcblas
-    cuda2hipRename["cublasDgemv"]                    = {"hipblasDgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
     cuda2hipRename["cublasCgemv"]                    = {"hipblasCgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
     cuda2hipRename["cublasZgemv"]                    = {"hipblasZgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
 
@@ -1810,8 +1976,8 @@ struct cuda2hipMap {
 
     // GER
     cuda2hipRename["cublasSger"]                     = {"hipblasSger", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasDger"]                     = {"hipblasDger", CONV_MATH_FUNC, API_BLAS};
     // unsupported yet by hipblas/hcblas
-    cuda2hipRename["cublasDger"]                     = {"hipblasDger", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
     cuda2hipRename["cublasCgeru"]                    = {"hipblasCgeru", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
     cuda2hipRename["cublasCgerc"]                    = {"hipblasCgerc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
     cuda2hipRename["cublasZgeru"]                    = {"hipblasZgeru", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
@@ -1848,8 +2014,7 @@ struct cuda2hipMap {
     // Blas3 (v1) Routines
     // GEMM
     cuda2hipRename["cublasSgemm"]                    = {"hipblasSgemm", CONV_MATH_FUNC, API_BLAS};
-    // unsupported yet by hipblas/hcblas
-    cuda2hipRename["cublasDgemm"]                    = {"hipblasDgemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
+    cuda2hipRename["cublasDgemm"]                    = {"hipblasDgemm", CONV_MATH_FUNC, API_BLAS};
 
     cuda2hipRename["cublasCgemm"]                    = {"hipblasCgemm", CONV_MATH_FUNC, API_BLAS};
     // unsupported yet by hipblas/hcblas
@@ -1857,8 +2022,7 @@ struct cuda2hipMap {
 
     // BATCH GEMM
     cuda2hipRename["cublasSgemmBatched"]             = {"hipblasSgemmBatched", CONV_MATH_FUNC, API_BLAS};
-    // unsupported yet by hipblas/hcblas
-    cuda2hipRename["cublasDgemmBatched"]             = {"hipblasDgemmBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
+    cuda2hipRename["cublasDgemmBatched"]             = {"hipblasDgemmBatched", CONV_MATH_FUNC, API_BLAS};
 
     cuda2hipRename["cublasCgemmBatched"]             = {"hipblasCgemmBatched", CONV_MATH_FUNC, API_BLAS};
     // unsupported yet by hipblas/hcblas
@@ -2016,10 +2180,9 @@ struct cuda2hipMap {
     cuda2hipRename["cublasCreate_v2"]                = {"hipblasCreate", CONV_MATH_FUNC, API_BLAS};
     cuda2hipRename["cublasDestroy_v2"]               = {"hipblasDestroy", CONV_MATH_FUNC, API_BLAS};
 
-    // unsupported yet by hipblas/hcblas
     cuda2hipRename["cublasGetVersion_v2"]            = {"hipblasGetVersion", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
-    cuda2hipRename["cublasSetStream_v2"]             = {"hipblasSetStream", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
-    cuda2hipRename["cublasGetStream_v2"]             = {"hipblasGetStream", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
+    cuda2hipRename["cublasSetStream_v2"]             = {"hipblasSetStream", CONV_MATH_FUNC, API_BLAS};
+    cuda2hipRename["cublasGetStream_v2"]             = {"hipblasGetStream", CONV_MATH_FUNC, API_BLAS};
     cuda2hipRename["cublasGetPointerMode_v2"]        = {"hipblasGetPointerMode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
     cuda2hipRename["cublasSetPointerMode_v2"]        = {"hipblasSetPointerMode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
 
@@ -2236,7 +2399,7 @@ struct cuda2hipMap {
     // AXPY
     cuda2hipRename["cublasSaxpy_v2"]                 = {"hipblasSaxpy", CONV_MATH_FUNC, API_BLAS};
     // unsupported yet by hipblas/hcblas
-    cuda2hipRename["cublasDaxpy_v2"]                 = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
+    cuda2hipRename["cublasDaxpy_v2"]                 = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS};
     cuda2hipRename["cublasCaxpy_v2"]                 = {"hipblasCaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
     cuda2hipRename["cublasZaxpy_v2"]                 = {"hipblasZaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED};
 
@@ -2635,7 +2798,7 @@ private:
   bool cudaCall(const MatchFinder::MatchResult &Result) {
     if (const CallExpr *call = Result.Nodes.getNodeAs<CallExpr>("cudaCall")) {
       const FunctionDecl *funcDcl = call->getDirectCallee();
-      StringRef name = funcDcl->getDeclName().getAsString();
+      std::string name = funcDcl->getDeclName().getAsString();
       SourceManager *SM = Result.SourceManager;
       SourceLocation sl = call->getLocStart();
       const auto found = N.cuda2hipRename.find(name);
@@ -2659,16 +2822,16 @@ private:
             }
           }
           if (bReplace) {
-            updateCounters(found->second, name.str());
+            updateCounters(found->second, name);
             Replacement Rep(*SM, sl, length, repName);
             FullSourceLoc fullSL(sl, *SM);
             insertReplacement(Rep, fullSL);
           }
         } else {
-          updateCounters(found->second, name.str());
+          updateCounters(found->second, name);
         }
       } else {
-        std::string msg = "the following reference is not handled: '" + name.str() + "' [function call].";
+        std::string msg = "the following reference is not handled: '" + name + "' [function call].";
         printHipifyMessage(*SM, sl, msg);
       }
       return true;
@@ -2783,7 +2946,7 @@ private:
 
   bool cudaEnumConstantRef(const MatchFinder::MatchResult &Result) {
     if (const DeclRefExpr *enumConstantRef = Result.Nodes.getNodeAs<DeclRefExpr>("cudaEnumConstantRef")) {
-      StringRef name = enumConstantRef->getDecl()->getNameAsString();
+      StringRef name = enumConstantRef->getDecl()->getName();
       SourceLocation sl = enumConstantRef->getLocStart();
       SourceManager *SM = Result.SourceManager;
       const auto found = N.cuda2hipRename.find(name);
@@ -2804,20 +2967,30 @@ private:
     return false;
   }
 
-  bool cudaEnumConstantDecl(const MatchFinder::MatchResult &Result) {
-    if (const VarDecl *enumConstantDecl = Result.Nodes.getNodeAs<VarDecl>("cudaEnumConstantDecl")) {
-      StringRef name =
-        enumConstantDecl->getType()->getAsTagDecl()->getNameAsString();
-      // anonymous typedef enum
-      if (name.empty()) {
-        QualType QT = enumConstantDecl->getType().getUnqualifiedType();
-        name = QT.getAsString();
+  bool cudaEnumDecl(const MatchFinder::MatchResult &Result) {
+    if (const VarDecl *enumDecl = Result.Nodes.getNodeAs<VarDecl>("cudaEnumDecl")) {
+      std::string name = enumDecl->getType()->getAsTagDecl()->getNameAsString();
+      QualType QT = enumDecl->getType().getUnqualifiedType();
+      std::string name_unqualified = QT.getAsString();
+      if ((name_unqualified.find(' ') == std::string::npos && name.find(' ') == std::string::npos) || name.empty()) {
+        name = name_unqualified;
       }
-      SourceLocation sl = enumConstantDecl->getLocStart();
+      // Workaround for enum VarDecl as param decl, declared with enum type specifier
+      // Example: void func(enum cudaMemcpyKind kind);
+      //-------------------------------------------------
       SourceManager *SM = Result.SourceManager;
+      SourceLocation sl(enumDecl->getLocStart());
+      SourceLocation end(enumDecl->getLocEnd());
+      size_t repLength = SM->getCharacterData(end) - SM->getCharacterData(sl);
+      StringRef sfull = StringRef(SM->getCharacterData(sl), repLength);
+      size_t offset = sfull.find(name);
+      if (offset > 0) {
+        sl = sl.getLocWithOffset(offset);
+      }
+      //-------------------------------------------------
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
-        updateCounters(found->second, name.str());
+        updateCounters(found->second, name);
         if (!found->second.unsupported) {
           StringRef repName = found->second.hipName;
           Replacement Rep(*SM, sl, name.size(), repName);
@@ -2825,7 +2998,7 @@ private:
           insertReplacement(Rep, fullSL);
         }
       } else {
-        std::string msg = "the following reference is not handled: '" + name.str() + "' [enum constant decl].";
+        std::string msg = "the following reference is not handled: '" + name + "' [enum constant decl].";
         printHipifyMessage(*SM, sl, msg);
       }
       return true;
@@ -2833,6 +3006,51 @@ private:
     return false;
   }
 
+  bool cudaEnumVarPtr(const MatchFinder::MatchResult &Result) {
+    if (const VarDecl *enumVarPtr = Result.Nodes.getNodeAs<VarDecl>("cudaEnumVarPtr")) {
+      const Type *t = enumVarPtr->getType().getTypePtrOrNull();
+      if (t) {
+        QualType QT = t->getPointeeType();
+        std::string name = QT.getAsString();
+        QT = enumVarPtr->getType().getUnqualifiedType();
+        std::string name_unqualified = QT.getAsString();
+        if ((name_unqualified.find(' ') == std::string::npos && name.find(' ') == std::string::npos) || name.empty()) {
+          name = name_unqualified;
+        }
+        // Workaround for enum VarDecl as param decl, declared with enum type specifier
+        // Example: void func(enum cudaMemcpyKind kind);
+        //-------------------------------------------------
+        SourceManager *SM = Result.SourceManager;
+        TypeLoc TL = enumVarPtr->getTypeSourceInfo()->getTypeLoc();
+        SourceLocation sl(TL.getUnqualifiedLoc().getLocStart());
+        SourceLocation end(TL.getUnqualifiedLoc().getLocEnd());
+        size_t repLength = SM->getCharacterData(end) - SM->getCharacterData(sl);
+        StringRef sfull = StringRef(SM->getCharacterData(sl), repLength);
+        size_t offset = sfull.find(name);
+        if (offset > 0) {
+          sl = sl.getLocWithOffset(offset);
+        }
+        //-------------------------------------------------
+        const auto found = N.cuda2hipRename.find(name);
+        if (found != N.cuda2hipRename.end()) {
+          updateCounters(found->second, name);
+          if (!found->second.unsupported) {
+            StringRef repName = found->second.hipName;
+            Replacement Rep(*SM, sl, name.size(), repName);
+            FullSourceLoc fullSL(sl, *SM);
+            insertReplacement(Rep, fullSL);
+          }
+        }
+        else {
+          std::string msg = "the following reference is not handled: '" + name + "' [enum var ptr].";
+          printHipifyMessage(*SM, sl, msg);
+        }
+      }
+      return true;
+    }
+    return false;
+  }
+
   bool cudaTypedefVar(const MatchFinder::MatchResult &Result) {
     if (const VarDecl *typedefVar = Result.Nodes.getNodeAs<VarDecl>("cudaTypedefVar")) {
       QualType QT = typedefVar->getType();
@@ -2840,12 +3058,12 @@ private:
         QT = QT.getTypePtr()->getAsArrayTypeUnsafe()->getElementType();
       }
       QT = QT.getUnqualifiedType();
-      StringRef name = QT.getAsString();
+      std::string name = QT.getAsString();
       SourceLocation sl = typedefVar->getLocStart();
       SourceManager *SM = Result.SourceManager;
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
-        updateCounters(found->second, name.str());
+        updateCounters(found->second, name);
         if (!found->second.unsupported) {
           StringRef repName = found->second.hipName;
           Replacement Rep(*SM, sl, name.size(), repName);
@@ -2853,7 +3071,7 @@ private:
           insertReplacement(Rep, fullSL);
         }
       } else {
-        std::string msg = "the following reference is not handled: '" + name.str() + "' [typedef var].";
+        std::string msg = "the following reference is not handled: '" + name + "' [typedef var].";
         printHipifyMessage(*SM, sl, msg);
       }
       return true;
@@ -2870,10 +3088,10 @@ private:
         SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
         QualType QT = t->getPointeeType();
         QT = QT.getUnqualifiedType();
-        StringRef name = QT.getAsString();
+        std::string name = QT.getAsString();
         const auto found = N.cuda2hipRename.find(name);
         if (found != N.cuda2hipRename.end()) {
-          updateCounters(found->second, name.str());
+          updateCounters(found->second, name);
           if (!found->second.unsupported) {
             StringRef repName = found->second.hipName;
             Replacement Rep(*SM, sl, name.size(), repName);
@@ -2882,7 +3100,7 @@ private:
           }
         }
         else {
-          std::string msg = "the following reference is not handled: '" + name.str() + "' [typedef var ptr].";
+          std::string msg = "the following reference is not handled: '" + name + "' [typedef var ptr].";
           printHipifyMessage(*SM, sl, msg);
         }
       }
@@ -2896,13 +3114,13 @@ private:
       QualType QT = structVar->getType();
       // ToDo: find case-studies with types other than Struct.
       if (QT->isStructureType()) {
-        StringRef name = QT.getTypePtr()->getAsStructureType()->getDecl()->getNameAsString();
+        std::string name = QT.getTypePtr()->getAsStructureType()->getDecl()->getNameAsString();
         TypeLoc TL = structVar->getTypeSourceInfo()->getTypeLoc();
         SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
         SourceManager *SM = Result.SourceManager;
         const auto found = N.cuda2hipRename.find(name);
         if (found != N.cuda2hipRename.end()) {
-          updateCounters(found->second, name.str());
+          updateCounters(found->second, name);
           if (!found->second.unsupported) {
             StringRef repName = found->second.hipName;
             Replacement Rep(*SM, sl, name.size(), repName);
@@ -2911,7 +3129,7 @@ private:
           }
         }
         else {
-          std::string msg = "the following reference is not handled: '" + name.str() + "' [struct var].";
+          std::string msg = "the following reference is not handled: '" + name + "' [struct var].";
           printHipifyMessage(*SM, sl, msg);
         }
       }
@@ -2978,13 +3196,69 @@ private:
     return false;
   }
 
+  bool cudaNewOperatorDecl(const MatchFinder::MatchResult &Result) {
+    if (const auto *newOperator = Result.Nodes.getNodeAs<CXXNewExpr>("cudaNewOperatorDecl")) {
+      const Type *t = newOperator->getType().getTypePtrOrNull();
+      if (t) {
+        SourceManager *SM = Result.SourceManager;
+        TypeLoc TL = newOperator->getAllocatedTypeSourceInfo()->getTypeLoc();
+        SourceLocation sl = TL.getUnqualifiedLoc().getLocStart();
+        QualType QT = t->getPointeeType();
+        std::string name = QT.getAsString();
+        const auto found = N.cuda2hipRename.find(name);
+        if (found != N.cuda2hipRename.end()) {
+          updateCounters(found->second, name);
+          if (!found->second.unsupported) {
+            StringRef repName = found->second.hipName;
+            Replacement Rep(*SM, sl, name.size(), repName);
+            FullSourceLoc fullSL(sl, *SM);
+            insertReplacement(Rep, fullSL);
+          }
+        }
+        else {
+          std::string msg = "the following reference is not handled: '" + name + "' [new operator].";
+          printHipifyMessage(*SM, sl, msg);
+        }
+      }
+    }
+    return false;
+  }
+
+  bool cudaFunctionReturn(const MatchFinder::MatchResult &Result) {
+    if (const auto *ret = Result.Nodes.getNodeAs<FunctionDecl>("cudaFunctionReturn")) {
+      QualType QT = ret->getReturnType();
+      SourceManager *SM = Result.SourceManager;
+      SourceRange sr = ret->getReturnTypeSourceRange();
+      SourceLocation sl = sr.getBegin();
+      std::string name = QT.getAsString();
+      if (QT.getTypePtr()->isEnumeralType()) {
+        name = QT.getTypePtr()->getAs<EnumType>()->getDecl()->getNameAsString();
+      }
+      const auto found = N.cuda2hipRename.find(name);
+      if (found != N.cuda2hipRename.end()) {
+        updateCounters(found->second, name);
+        if (!found->second.unsupported) {
+          StringRef repName = found->second.hipName;
+          Replacement Rep(*SM, sl, name.size(), repName);
+          FullSourceLoc fullSL(sl, *SM);
+          insertReplacement(Rep, fullSL);
+        }
+      }
+      else {
+        std::string msg = "the following reference is not handled: '" + name + "' [function return].";
+        printHipifyMessage(*SM, sl, msg);
+      }
+    }
+    return false;
+  }
+
   bool cudaSharedIncompleteArrayVar(const MatchFinder::MatchResult &Result) {
     StringRef refName = "cudaSharedIncompleteArrayVar";
     if (const VarDecl *sharedVar = Result.Nodes.getNodeAs<VarDecl>(refName)) {
       // Example: extern __shared__ uint sRadix1[];
       if (sharedVar->hasExternalFormalLinkage()) {
         QualType QT = sharedVar->getType();
-        StringRef typeName;
+        std::string typeName;
         if (QT->isIncompleteArrayType()) {
           const ArrayType *AT = QT.getTypePtr()->getAsArrayTypeUnsafe();
           QT = AT->getElementType();
@@ -3006,9 +3280,8 @@ private:
           SourceLocation slEnd = sharedVar->getLocEnd();
           SourceManager *SM = Result.SourceManager;
           size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1;
-          SmallString<128> tmpData;
-          StringRef varName = sharedVar->getNameAsString();
-          StringRef repName = Twine("HIP_DYNAMIC_SHARED(" + typeName + ", " + varName + ")").toStringRef(tmpData);
+          std::string varName = sharedVar->getNameAsString();
+          std::string repName = "HIP_DYNAMIC_SHARED(" + typeName + ", " + varName + ")";
           Replacement Rep(*SM, slStart, repLength, repName);
           FullSourceLoc fullSL(slStart, *SM);
           insertReplacement(Rep, fullSL);
@@ -3024,7 +3297,7 @@ private:
   bool cudaParamDecl(const MatchFinder::MatchResult &Result) {
     if (const ParmVarDecl *paramDecl = Result.Nodes.getNodeAs<ParmVarDecl>("cudaParamDecl")) {
       QualType QT = paramDecl->getOriginalType().getUnqualifiedType();
-      StringRef name = QT.getAsString();
+      std::string name = QT.getAsString();
       const Type *t = QT.getTypePtr();
       if (t->isStructureOrClassType()) {
         name = t->getAsCXXRecordDecl()->getName();
@@ -3034,7 +3307,7 @@ private:
       SourceManager *SM = Result.SourceManager;
       const auto found = N.cuda2hipRename.find(name);
       if (found != N.cuda2hipRename.end()) {
-        updateCounters(found->second, name.str());
+        updateCounters(found->second, name);
         if (!found->second.unsupported) {
           StringRef repName = found->second.hipName;
           Replacement Rep(*SM, sl, name.size(), repName);
@@ -3042,7 +3315,7 @@ private:
           insertReplacement(Rep, fullSL);
         }
       } else {
-        std::string msg = "the following reference is not handled: '" + name.str() + "' [param decl].";
+        std::string msg = "the following reference is not handled: '" + name + "' [param decl].";
         printHipifyMessage(*SM, sl, msg);
       }
       return true;
@@ -3113,7 +3386,8 @@ public:
       if (cudaCall(Result)) break;
       if (cudaBuiltin(Result)) break;
       if (cudaEnumConstantRef(Result)) break;
-      if (cudaEnumConstantDecl(Result)) break;
+      if (cudaEnumDecl(Result)) break;
+      if (cudaEnumVarPtr(Result)) break;
       if (cudaTypedefVar(Result)) break;
       if (cudaTypedefVarPtr(Result)) break;
       if (cudaStructVar(Result)) break;
@@ -3122,6 +3396,8 @@ public:
       if (cudaParamDecl(Result)) break;
       if (cudaParamDeclPtr(Result)) break;
       if (cudaLaunchKernel(Result)) break;
+      if (cudaNewOperatorDecl(Result)) break;
+      if (cudaFunctionReturn(Result)) break;
       if (cudaSharedIncompleteArrayVar(Result)) break;
       if (stringLiteral(Result)) break;
       if (unresolvedTemplateName(Result)) break;
@@ -3159,7 +3435,12 @@ void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callbac
                                 Callback);
   Finder.addMatcher(varDecl(isExpansionInMainFile(),
                             hasType(enumDecl()))
-                            .bind("cudaEnumConstantDecl"),
+                            .bind("cudaEnumDecl"),
+                            Callback);
+  Finder.addMatcher(varDecl(isExpansionInMainFile(),
+                            hasType(pointsTo(enumDecl(
+                            matchesName("cu.*|CU.*")))))
+                            .bind("cudaEnumVarPtr"),
                             Callback);
   Finder.addMatcher(varDecl(isExpansionInMainFile(),
                             hasType(typedefDecl(matchesName("cu.*|CU.*"))))
@@ -3214,6 +3495,23 @@ void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callbac
                             hasType(incompleteArrayType())))
                            .bind("cudaSharedIncompleteArrayVar"),
                             Callback);
+  // Example:
+  // CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
+  // hipJitOption *jitOptions = new hipJitOption[jitNumOptions];
+  Finder.addMatcher(cxxNewExpr(isExpansionInMainFile(),
+                               hasType(pointsTo(namedDecl(matchesName("cu.*|CU.*")))))
+                              .bind("cudaNewOperatorDecl"),
+                               Callback);
+  // Examples:
+  // 1.
+  // cudaStream_t cuda_memcpy_stream(...)
+  // 2.
+  // template<typename System1, typename System2> cudaMemcpyKind cuda_memcpy_kind(...)
+  Finder.addMatcher(functionDecl(isExpansionInMainFile(),
+                                 returns(hasDeclaration(namedDecl(matchesName("cu.*|CU.*")))))
+                                .bind("cudaFunctionReturn"),
+                                 Callback);
+
 }
 
 int64_t printStats(const std::string &csvFile, const std::string &srcFile,
@@ -3468,7 +3766,11 @@ void printAllStats(const std::string &csvFile, int64_t totalFiles, int64_t conve
 int main(int argc, const char **argv) {
   auto start = std::chrono::steady_clock::now();
   auto begin = start;
+#if (LLVM_VERSION_MAJOR >= 3) && (LLVM_VERSION_MINOR >= 9)
+  llvm::sys::PrintStackTraceOnErrorSignal(StringRef());
+#else
   llvm::sys::PrintStackTraceOnErrorSignal();
+#endif
   CommonOptionsParser OptionsParser(argc, argv, ToolTemplateCategory, llvm::cl::OneOrMore);
   std::vector<std::string> fileSources = OptionsParser.getSourcePathList();
   std::string dst = OutputFilename;
diff --git a/projects/hip/include/hip/hcc_detail/grid_launch_GGL.hpp b/projects/hip/include/hip/hcc_detail/grid_launch_GGL.hpp
index 2dd9a95bc6..8e3dab8482 100644
--- a/projects/hip/include/hip/hcc_detail/grid_launch_GGL.hpp
+++ b/projects/hip/include/hip/hcc_detail/grid_launch_GGL.hpp
@@ -89,8 +89,7 @@ namespace hip_impl
         dim3 dim_blocks,
         int group_mem_bytes,
         const hc::accelerator_view& acc_v,
-        K k,
-        Ts&&... args)
+        K k)
     {
         const auto d = hc::extent<3>{
             num_blocks.z * dim_blocks.z,
@@ -102,16 +101,11 @@ namespace hip_impl
             group_mem_bytes);
 
         try {
-            hc::parallel_for_each(
-                acc_v,
-                d,
-                [=](const hc::tiled_index<3>& idx) [[hc]] {
-                    k(args...);
-                });
+            hc::parallel_for_each(acc_v, d, k);
         }
         catch (std::exception& ex) {
-            std::cerr << "Failed in " << __FUNCTION__ << ", with exception: "
-                << ex.what() << std::endl;
+            std::cerr << "Failed in " << __func__ << ", with exception: "
+                      << ex.what() << std::endl;
             throw;
         }
     }
@@ -133,8 +127,7 @@ namespace hip_impl
         int group_mem_bytes,
         hipStream_t stream,
         const char* kernel_name,
-        K k,
-        Ts&&... args)
+        K k)
     {
         void* lck_stream = nullptr;
         auto acc_v = lock_stream_hip_(stream, lck_stream);
@@ -156,12 +149,11 @@ namespace hip_impl
                 std::move(dim_blocks),
                 group_mem_bytes,
                 acc_v,
-                std::move(k),
-                std::forward<Ts>(args)...);
+                std::move(k));
         }
         catch (std::exception& ex) {
-            std::cerr << "Failed in " << __FUNCTION__ << ", with exception: "
-                << ex.what() << std::endl;
+            std::cerr << "Failed in " << __func__ << ", with exception: "
+                      << ex.what() << std::endl;
             throw;
         }
     }
@@ -175,8 +167,7 @@ namespace hip_impl
         dim3 dim_blocks,
         int group_mem_bytes,
         hipStream_t stream,
-        K k,
-        Ts&&... args)
+        K k)
     {
         grid_launch_hip_impl_(
             New_grid_launch_tag{},
@@ -184,9 +175,7 @@ namespace hip_impl
             std::move(dim_blocks),
             group_mem_bytes,
             std::move(stream),
-            std::move(k),
-            hipLaunchParm{},
-            std::forward<Ts>(args)...);
+            std::move(k));
     }
 
     template<FunctionalProcedure K, typename... Ts>
@@ -199,8 +188,7 @@ namespace hip_impl
         int group_mem_bytes,
         hipStream_t stream,
         const char* kernel_name,
-        K k,
-        Ts&&... args)
+        K k)
     {
         grid_launch_hip_impl_(
             New_grid_launch_tag{},
@@ -209,9 +197,7 @@ namespace hip_impl
             group_mem_bytes,
             std::move(stream),
             kernel_name,
-            std::move(k),
-            hipLaunchParm{},
-            std::forward<Ts>(args)...);
+            std::move(k));
     }
 
     template<FunctionalProcedure K, typename... Ts>
@@ -223,8 +209,7 @@ namespace hip_impl
         int group_mem_bytes,
         hipStream_t stream,
         const char* kernel_name,
-        K k,
-        Ts&& ... args)
+        K k)
     {
         grid_launch_hip_impl_(
             is_new_grid_launch_t<K, Ts...>{},
@@ -233,8 +218,7 @@ namespace hip_impl
             group_mem_bytes,
             std::move(stream),
             kernel_name,
-            std::move(k),
-            std::forward<Ts>(args)...);
+            std::move(k));
     }
 
     template<FunctionalProcedure K, typename... Ts>
@@ -245,8 +229,7 @@ namespace hip_impl
         dim3 dim_blocks,
         int group_mem_bytes,
         hipStream_t stream,
-        K k,
-        Ts&& ... args)
+        K k)
     {
         grid_launch_hip_impl_(
             is_new_grid_launch_t<K, Ts...>{},
@@ -254,610 +237,649 @@ namespace hip_impl
             std::move(dim_blocks),
             group_mem_bytes,
             std::move(stream),
-            std::move(k),
-            std::forward<Ts>(args)...);
+            std::move(k));
     }
 
-    namespace
-    {
-        template<typename T>
-        constexpr
-        inline
-        T&& forward_(std::remove_reference_t<T>& x) [[hc]]
-        {
-            return static_cast<T&&>(x);
-        }
+    // TODO: these are temporary and purposefully noisy and disruptive.
+    #define make_kernel_name_hip(k, n)\
+        HIP_kernel_functor_name_begin ## _ ## k ## _ ## \
+        HIP_kernel_functor_name_end ## _ ## n
 
-        template<FunctionalProcedure K, K* k>
-        struct Forwarder {
-            template<typename... Ts>
-            void operator()(Ts&&...args) const [[hc]]
-            {
-                k(forward_<Ts>(args)...);
-            }
-        };
-    }
+    #define make_kernel_functor_hip_27(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
+        p24)\
+        struct make_kernel_name_hip(function_name, 25) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            std::decay_t<decltype(p13)> _p13_;\
+            std::decay_t<decltype(p14)> _p14_;\
+            std::decay_t<decltype(p15)> _p15_;\
+            std::decay_t<decltype(p16)> _p16_;\
+            std::decay_t<decltype(p17)> _p17_;\
+            std::decay_t<decltype(p18)> _p18_;\
+            std::decay_t<decltype(p19)> _p19_;\
+            std::decay_t<decltype(p20)> _p20_;\
+            std::decay_t<decltype(p21)> _p21_;\
+            std::decay_t<decltype(p22)> _p22_;\
+            std::decay_t<decltype(p23)> _p23_;\
+            std::decay_t<decltype(p24)> _p24_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
+                    _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\
+            }\
+        }
+    #define make_kernel_functor_hip_26(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23)\
+        struct make_kernel_name_hip(function_name, 24) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            std::decay_t<decltype(p13)> _p13_;\
+            std::decay_t<decltype(p14)> _p14_;\
+            std::decay_t<decltype(p15)> _p15_;\
+            std::decay_t<decltype(p16)> _p16_;\
+            std::decay_t<decltype(p17)> _p17_;\
+            std::decay_t<decltype(p18)> _p18_;\
+            std::decay_t<decltype(p19)> _p19_;\
+            std::decay_t<decltype(p20)> _p20_;\
+            std::decay_t<decltype(p21)> _p21_;\
+            std::decay_t<decltype(p22)> _p22_;\
+            std::decay_t<decltype(p23)> _p23_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
+                    _p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\
+            }\
+        }
+    #define make_kernel_functor_hip_25(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22)\
+        struct make_kernel_name_hip(function_name, 23) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            std::decay_t<decltype(p13)> _p13_;\
+            std::decay_t<decltype(p14)> _p14_;\
+            std::decay_t<decltype(p15)> _p15_;\
+            std::decay_t<decltype(p16)> _p16_;\
+            std::decay_t<decltype(p17)> _p17_;\
+            std::decay_t<decltype(p18)> _p18_;\
+            std::decay_t<decltype(p19)> _p19_;\
+            std::decay_t<decltype(p20)> _p20_;\
+            std::decay_t<decltype(p21)> _p21_;\
+            std::decay_t<decltype(p22)> _p22_;\
+            __attribute__((used, flatten))\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
+                    _p18_, _p19_, _p20_, _p21_, _p22_);\
+            }\
+        }
+    #define make_kernel_functor_hip_24(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21)\
+        struct make_kernel_name_hip(function_name, 22) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            std::decay_t<decltype(p13)> _p13_;\
+            std::decay_t<decltype(p14)> _p14_;\
+            std::decay_t<decltype(p15)> _p15_;\
+            std::decay_t<decltype(p16)> _p16_;\
+            std::decay_t<decltype(p17)> _p17_;\
+            std::decay_t<decltype(p18)> _p18_;\
+            std::decay_t<decltype(p19)> _p19_;\
+            std::decay_t<decltype(p20)> _p20_;\
+            std::decay_t<decltype(p21)> _p21_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
+                    _p18_, _p19_, _p20_, _p21_);\
+            }\
+        }
+    #define make_kernel_functor_hip_23(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20)\
+        struct make_kernel_name_hip(function_name, 21) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            std::decay_t<decltype(p13)> _p13_;\
+            std::decay_t<decltype(p14)> _p14_;\
+            std::decay_t<decltype(p15)> _p15_;\
+            std::decay_t<decltype(p16)> _p16_;\
+            std::decay_t<decltype(p17)> _p17_;\
+            std::decay_t<decltype(p18)> _p18_;\
+            std::decay_t<decltype(p19)> _p19_;\
+            std::decay_t<decltype(p20)> _p20_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
+                    _p18_, _p19_, _p20_);\
+            }\
+        }
+    #define make_kernel_functor_hip_22(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12, p13, p14, p15, p16, p17, p18, p19)\
+        struct make_kernel_name_hip(function_name, 20) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            std::decay_t<decltype(p13)> _p13_;\
+            std::decay_t<decltype(p14)> _p14_;\
+            std::decay_t<decltype(p15)> _p15_;\
+            std::decay_t<decltype(p16)> _p16_;\
+            std::decay_t<decltype(p17)> _p17_;\
+            std::decay_t<decltype(p18)> _p18_;\
+            std::decay_t<decltype(p19)> _p19_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
+                    _p18_, _p19_);\
+            }\
+        }
+    #define make_kernel_functor_hip_21(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12, p13, p14, p15, p16, p17, p18)\
+        struct make_kernel_name_hip(function_name, 19) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            std::decay_t<decltype(p13)> _p13_;\
+            std::decay_t<decltype(p14)> _p14_;\
+            std::decay_t<decltype(p15)> _p15_;\
+            std::decay_t<decltype(p16)> _p16_;\
+            std::decay_t<decltype(p17)> _p17_;\
+            std::decay_t<decltype(p18)> _p18_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
+                    _p18_);\
+            }\
+        }
+    #define make_kernel_functor_hip_20(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12, p13, p14, p15, p16, p17)\
+        struct make_kernel_name_hip(function_name, 18) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            std::decay_t<decltype(p13)> _p13_;\
+            std::decay_t<decltype(p14)> _p14_;\
+            std::decay_t<decltype(p15)> _p15_;\
+            std::decay_t<decltype(p16)> _p16_;\
+            std::decay_t<decltype(p17)> _p17_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\
+            }\
+        }
+    #define make_kernel_functor_hip_19(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12, p13, p14, p15, p16)\
+        struct make_kernel_name_hip(function_name, 17) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            std::decay_t<decltype(p13)> _p13_;\
+            std::decay_t<decltype(p14)> _p14_;\
+            std::decay_t<decltype(p15)> _p15_;\
+            std::decay_t<decltype(p16)> _p16_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\
+            }\
+        }
+    #define make_kernel_functor_hip_18(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12, p13, p14, p15)\
+        struct make_kernel_name_hip(function_name, 16) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            std::decay_t<decltype(p13)> _p13_;\
+            std::decay_t<decltype(p14)> _p14_;\
+            std::decay_t<decltype(p15)> _p15_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\
+            }\
+        }
+    #define make_kernel_functor_hip_17(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12, p13, p14)\
+        struct make_kernel_name_hip(function_name, 15) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            std::decay_t<decltype(p13)> _p13_;\
+            std::decay_t<decltype(p14)> _p14_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_, _p13_, _p14_);\
+            }\
+        }
+    #define make_kernel_functor_hip_16(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12, p13)\
+        struct make_kernel_name_hip(function_name, 14) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            std::decay_t<decltype(p13)> _p13_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_, _p13_);\
+            }\
+        }
+    #define make_kernel_functor_hip_15(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12)\
+        struct make_kernel_name_hip(function_name, 13) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_);\
+            }\
+        }
+    #define make_kernel_functor_hip_14(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11)\
+        struct make_kernel_name_hip(function_name, 12) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_);\
+            }\
+        }
+    #define make_kernel_functor_hip_13(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\
+        struct make_kernel_name_hip(function_name, 11) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_);\
+            }\
+        }
+    #define make_kernel_functor_hip_12(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\
+        struct make_kernel_name_hip(function_name, 10) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\
+                    _p9_);\
+            }\
+        }
+    #define make_kernel_functor_hip_11(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\
+        struct make_kernel_name_hip(function_name, 9) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);\
+            }\
+        }
+    #define make_kernel_functor_hip_10(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)\
+        struct make_kernel_name_hip(function_name, 8) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\
+            }\
+        }
+    #define make_kernel_functor_hip_9(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6)\
+        struct make_kernel_name_hip(function_name, 7) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\
+            }\
+        }
+    #define make_kernel_functor_hip_8(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5)\
+        struct make_kernel_name_hip(function_name, 6) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\
+            }\
+        }
+    #define make_kernel_functor_hip_7(\
+        function_name, kernel_name, p0, p1, p2, p3, p4)\
+        struct make_kernel_name_hip(function_name, 5) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\
+            }\
+        }
+    #define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3)\
+        struct make_kernel_name_hip(function_name, 4) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(_p0_, _p1_, _p2_, _p3_);\
+            }\
+        }
+    #define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2)\
+        struct make_kernel_name_hip(function_name, 3) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(_p0_, _p1_, _p2_);\
+            }\
+        }
+    #define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1)\
+        struct make_kernel_name_hip(function_name, 2) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(_p0_, _p1_);\
+            }\
+        }
+    #define fofo(f, n) kernel_prefix_hip ## f ## kernel_suffix_hip ## n
+    #define make_kernel_functor_hip_3(function_name, kernel_name, p0)\
+        struct make_kernel_name_hip(function_name, 1) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(_p0_);\
+            }\
+        }
+    #define make_kernel_functor_hip_2(function_name, kernel_name)\
+        struct make_kernel_name_hip(function_name, 0) {\
+            void operator()(const hc::tiled_index<3>&) [[hc]]\
+            {\
+                return kernel_name(hipLaunchParm{});\
+            }\
+        }
+    #define make_kernel_functor_hip_1(...)
+    #define make_kernel_functor_hip_0(...)
+    #define make_kernel_functor_hip_(...)\
+        overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__)
 
-    template<FunctionalProcedure K, K* k, typename... Ts>
-        requires(Domain<K> == {Ts...})
-    inline
-    void grid_launch(
-        New_grid_launch_tag,
-        dim3 num_blocks,
-        dim3 dim_blocks,
-        int group_mem_bytes,
-        hipStream_t stream,
-        Ts&&... args)
-    {
-        grid_launch_hip_impl_(
-            New_grid_launch_tag{},
-            std::move(num_blocks),
-            std::move(dim_blocks),
-            group_mem_bytes,
-            std::move(stream),
-            Forwarder<K, k>{},
-            std::forward<Ts>(args)...);
-    }
 
-    template<FunctionalProcedure K, K* k, typename... Ts>
-        requires(Domain<K> == {Ts...})
-    inline
-    void grid_launch(
-        Old_grid_launch_tag,
-        dim3 num_blocks,
-        dim3 dim_blocks,
-        int group_mem_bytes,
-        hipStream_t stream,
-        Ts&&... args)
-    {
-        grid_launch_hip_<K, k>(
-            New_grid_launch_tag{},
-            std::move(num_blocks),
-            std::move(dim_blocks),
-            group_mem_bytes,
-            std::move(stream),
-            hipLaunchParm{},
-            std::forward<Ts>(args)...);
-    }
-
-    template<FunctionalProcedure K, K* k, typename... Ts>
-        requires(Domain<K> == {Ts...})
-    inline
-    std::enable_if_t<std::is_function<K>::value> grid_launch_hip_(
-        dim3 num_blocks,
-        dim3 dim_blocks,
-        int group_mem_bytes,
-        hipStream_t stream,
-        Ts&&... args)
-    {
-        grid_launch_hip_<K, k>(
-            is_new_grid_launch_t<K*, Ts...>{},
-            std::move(num_blocks),
-            std::move(dim_blocks),
-            group_mem_bytes,
-            std::move(stream),
-            std::forward<Ts>(args)...);
-    }
-
-    // TODO: these are temporary, they need to be completely removed once we
-    //       enable C++14 support and can have proper generic, variadic lambdas.
-    #define make_kernel_lambda_hip_26(\
-        kernel_name,\
-        p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\
-        p16, p17, p18, p19, p20, p21, p22, p23, p24)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_,\
-           const std::decay_t<decltype(p7)>& _p7_,\
-           const std::decay_t<decltype(p8)>& _p8_,\
-           const std::decay_t<decltype(p9)>& _p9_,\
-           const std::decay_t<decltype(p10)>& _p10_,\
-           const std::decay_t<decltype(p11)>& _p11_,\
-           const std::decay_t<decltype(p12)>& _p12_,\
-           const std::decay_t<decltype(p13)>& _p13_,\
-           const std::decay_t<decltype(p14)>& _p14_,\
-           const std::decay_t<decltype(p15)>& _p15_,\
-           const std::decay_t<decltype(p16)>& _p16_,\
-           const std::decay_t<decltype(p17)>& _p17_,\
-           const std::decay_t<decltype(p18)>& _p18_,\
-           const std::decay_t<decltype(p19)>& _p19_,\
-           const std::decay_t<decltype(p20)>& _p20_,\
-           const std::decay_t<decltype(p21)>& _p21_,\
-           const std::decay_t<decltype(p22)>& _p22_,\
-           const std::decay_t<decltype(p23)>& _p23_,\
-           const std::decay_t<decltype(p24)>& _p24_) [[hc]] {\
-            kernel_name(\
-                _p0_,  _p1_,  _p2_,  _p3_,  _p4_,  _p5_,  _p6_,  _p7_,  _p8_,\
-                _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
-                _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\
-        }
-    #define make_kernel_lambda_hip_25(\
-        kernel_name,\
-        p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\
-        p16, p17, p18, p19, p20, p21, p22, p23)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_,\
-           const std::decay_t<decltype(p7)>& _p7_,\
-           const std::decay_t<decltype(p8)>& _p8_,\
-           const std::decay_t<decltype(p9)>& _p9_,\
-           const std::decay_t<decltype(p10)>& _p10_,\
-           const std::decay_t<decltype(p11)>& _p11_,\
-           const std::decay_t<decltype(p12)>& _p12_,\
-           const std::decay_t<decltype(p13)>& _p13_,\
-           const std::decay_t<decltype(p14)>& _p14_,\
-           const std::decay_t<decltype(p15)>& _p15_,\
-           const std::decay_t<decltype(p16)>& _p16_,\
-           const std::decay_t<decltype(p17)>& _p17_,\
-           const std::decay_t<decltype(p18)>& _p18_,\
-           const std::decay_t<decltype(p19)>& _p19_,\
-           const std::decay_t<decltype(p20)>& _p20_,\
-           const std::decay_t<decltype(p21)>& _p21_,\
-           const std::decay_t<decltype(p22)>& _p22_,\
-           const std::decay_t<decltype(p23)>& _p23_) [[hc]] {\
-            kernel_name(\
-                _p0_,  _p1_,  _p2_,  _p3_,  _p4_,  _p5_,  _p6_,  _p7_,  _p8_,\
-                _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
-                _p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\
-        }
-    #define make_kernel_lambda_hip_24(\
-        kernel_name,\
-        p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\
-        p16, p17, p18, p19, p20, p21, p22)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_,\
-           const std::decay_t<decltype(p7)>& _p7_,\
-           const std::decay_t<decltype(p8)>& _p8_,\
-           const std::decay_t<decltype(p9)>& _p9_,\
-           const std::decay_t<decltype(p10)>& _p10_,\
-           const std::decay_t<decltype(p11)>& _p11_,\
-           const std::decay_t<decltype(p12)>& _p12_,\
-           const std::decay_t<decltype(p13)>& _p13_,\
-           const std::decay_t<decltype(p14)>& _p14_,\
-           const std::decay_t<decltype(p15)>& _p15_,\
-           const std::decay_t<decltype(p16)>& _p16_,\
-           const std::decay_t<decltype(p17)>& _p17_,\
-           const std::decay_t<decltype(p18)>& _p18_,\
-           const std::decay_t<decltype(p19)>& _p19_,\
-           const std::decay_t<decltype(p20)>& _p20_,\
-           const std::decay_t<decltype(p21)>& _p21_,\
-           const std::decay_t<decltype(p22)>& _p22_) [[hc]] {\
-            kernel_name(\
-                _p0_,  _p1_,  _p2_,  _p3_,  _p4_,  _p5_,  _p6_,  _p7_,  _p8_,\
-                _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
-                _p18_, _p19_, _p20_, _p21_, _p22_);\
-        }
-    #define make_kernel_lambda_hip_23(\
-        kernel_name,\
-        p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\
-        p16, p17, p18, p19, p20, p21)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_,\
-           const std::decay_t<decltype(p7)>& _p7_,\
-           const std::decay_t<decltype(p8)>& _p8_,\
-           const std::decay_t<decltype(p9)>& _p9_,\
-           const std::decay_t<decltype(p10)>& _p10_,\
-           const std::decay_t<decltype(p11)>& _p11_,\
-           const std::decay_t<decltype(p12)>& _p12_,\
-           const std::decay_t<decltype(p13)>& _p13_,\
-           const std::decay_t<decltype(p14)>& _p14_,\
-           const std::decay_t<decltype(p15)>& _p15_,\
-           const std::decay_t<decltype(p16)>& _p16_,\
-           const std::decay_t<decltype(p17)>& _p17_,\
-           const std::decay_t<decltype(p18)>& _p18_,\
-           const std::decay_t<decltype(p19)>& _p19_,\
-           const std::decay_t<decltype(p20)>& _p20_,\
-           const std::decay_t<decltype(p21)>& _p21_) [[hc]] {\
-            kernel_name(\
-                _p0_,  _p1_,  _p2_,  _p3_,  _p4_,  _p5_,  _p6_,  _p7_,  _p8_,\
-                _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
-                _p18_, _p19_, _p20_, _p21_);\
-        }
-    #define make_kernel_lambda_hip_22(\
-        kernel_name,\
-        p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\
-        p16, p17, p18, p19, p20)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_,\
-           const std::decay_t<decltype(p7)>& _p7_,\
-           const std::decay_t<decltype(p8)>& _p8_,\
-           const std::decay_t<decltype(p9)>& _p9_,\
-           const std::decay_t<decltype(p10)>& _p10_,\
-           const std::decay_t<decltype(p11)>& _p11_,\
-           const std::decay_t<decltype(p12)>& _p12_,\
-           const std::decay_t<decltype(p13)>& _p13_,\
-           const std::decay_t<decltype(p14)>& _p14_,\
-           const std::decay_t<decltype(p15)>& _p15_,\
-           const std::decay_t<decltype(p16)>& _p16_,\
-           const std::decay_t<decltype(p17)>& _p17_,\
-           const std::decay_t<decltype(p18)>& _p18_,\
-           const std::decay_t<decltype(p19)>& _p19_,\
-           const std::decay_t<decltype(p20)>& _p20_) [[hc]] {\
-            kernel_name(\
-                _p0_,  _p1_,  _p2_,  _p3_,  _p4_,  _p5_,  _p6_,  _p7_,  _p8_,\
-                _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
-                _p18_, _p19_, _p20_);\
-        }
-    #define make_kernel_lambda_hip_21(\
-        kernel_name,\
-        p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\
-        p16, p17, p18, p19)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_,\
-           const std::decay_t<decltype(p7)>& _p7_,\
-           const std::decay_t<decltype(p8)>& _p8_,\
-           const std::decay_t<decltype(p9)>& _p9_,\
-           const std::decay_t<decltype(p10)>& _p10_,\
-           const std::decay_t<decltype(p11)>& _p11_,\
-           const std::decay_t<decltype(p12)>& _p12_,\
-           const std::decay_t<decltype(p13)>& _p13_,\
-           const std::decay_t<decltype(p14)>& _p14_,\
-           const std::decay_t<decltype(p15)>& _p15_,\
-           const std::decay_t<decltype(p16)>& _p16_,\
-           const std::decay_t<decltype(p17)>& _p17_,\
-           const std::decay_t<decltype(p18)>& _p18_,\
-           const std::decay_t<decltype(p19)>& _p19_) [[hc]] {\
-            kernel_name(\
-                _p0_,  _p1_,  _p2_,  _p3_,  _p4_,  _p5_,  _p6_,  _p7_,  _p8_,\
-                _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
-                _p18_, _p19_);\
-        }
-    #define make_kernel_lambda_hip_20(\
-        kernel_name,\
-        p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\
-        p16, p17, p18)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_,\
-           const std::decay_t<decltype(p7)>& _p7_,\
-           const std::decay_t<decltype(p8)>& _p8_,\
-           const std::decay_t<decltype(p9)>& _p9_,\
-           const std::decay_t<decltype(p10)>& _p10_,\
-           const std::decay_t<decltype(p11)>& _p11_,\
-           const std::decay_t<decltype(p12)>& _p12_,\
-           const std::decay_t<decltype(p13)>& _p13_,\
-           const std::decay_t<decltype(p14)>& _p14_,\
-           const std::decay_t<decltype(p15)>& _p15_,\
-           const std::decay_t<decltype(p16)>& _p16_,\
-           const std::decay_t<decltype(p17)>& _p17_,\
-           const std::decay_t<decltype(p18)>& _p18_) [[hc]] {\
-            kernel_name(\
-                _p0_,  _p1_,  _p2_,  _p3_,  _p4_,  _p5_,  _p6_,  _p7_,  _p8_,\
-                _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
-                _p18_);\
-        }
-    #define make_kernel_lambda_hip_19(\
-        kernel_name,\
-        p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\
-        p16, p17)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_,\
-           const std::decay_t<decltype(p7)>& _p7_,\
-           const std::decay_t<decltype(p8)>& _p8_,\
-           const std::decay_t<decltype(p9)>& _p9_,\
-           const std::decay_t<decltype(p10)>& _p10_,\
-           const std::decay_t<decltype(p11)>& _p11_,\
-           const std::decay_t<decltype(p12)>& _p12_,\
-           const std::decay_t<decltype(p13)>& _p13_,\
-           const std::decay_t<decltype(p14)>& _p14_,\
-           const std::decay_t<decltype(p15)>& _p15_,\
-           const std::decay_t<decltype(p16)>& _p16_,\
-           const std::decay_t<decltype(p17)>& _p17_) [[hc]] {\
-            kernel_name(\
-                _p0_,  _p1_,  _p2_,  _p3_,  _p4_,  _p5_,  _p6_,  _p7_,  _p8_,\
-                _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\
-        }
-    #define make_kernel_lambda_hip_18(\
-        kernel_name,\
-        p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\
-        p16)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_,\
-           const std::decay_t<decltype(p7)>& _p7_,\
-           const std::decay_t<decltype(p8)>& _p8_,\
-           const std::decay_t<decltype(p9)>& _p9_,\
-           const std::decay_t<decltype(p10)>& _p10_,\
-           const std::decay_t<decltype(p11)>& _p11_,\
-           const std::decay_t<decltype(p12)>& _p12_,\
-           const std::decay_t<decltype(p13)>& _p13_,\
-           const std::decay_t<decltype(p14)>& _p14_,\
-           const std::decay_t<decltype(p15)>& _p15_,\
-           const std::decay_t<decltype(p16)>& _p16_) [[hc]] {\
-            kernel_name(\
-                _p0_,  _p1_,  _p2_,  _p3_,  _p4_,  _p5_,  _p6_,  _p7_,  _p8_,\
-                _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\
-        }
-    #define make_kernel_lambda_hip_17(\
-        kernel_name,\
-        p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_,\
-           const std::decay_t<decltype(p7)>& _p7_,\
-           const std::decay_t<decltype(p8)>& _p8_,\
-           const std::decay_t<decltype(p9)>& _p9_,\
-           const std::decay_t<decltype(p10)>& _p10_,\
-           const std::decay_t<decltype(p11)>& _p11_,\
-           const std::decay_t<decltype(p12)>& _p12_,\
-           const std::decay_t<decltype(p13)>& _p13_,\
-           const std::decay_t<decltype(p14)>& _p14_,\
-           const std::decay_t<decltype(p15)>& _p15_) [[hc]] {\
-            kernel_name(\
-                _p0_,  _p1_,  _p2_,  _p3_,  _p4_,  _p5_,  _p6_,  _p7_,  _p8_,\
-                _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\
-        }
-    #define make_kernel_lambda_hip_16(\
-        kernel_name,\
-        p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_,\
-           const std::decay_t<decltype(p7)>& _p7_,\
-           const std::decay_t<decltype(p8)>& _p8_,\
-           const std::decay_t<decltype(p9)>& _p9_,\
-           const std::decay_t<decltype(p10)>& _p10_,\
-           const std::decay_t<decltype(p11)>& _p11_,\
-           const std::decay_t<decltype(p12)>& _p12_,\
-           const std::decay_t<decltype(p13)>& _p13_,\
-           const std::decay_t<decltype(p14)>& _p14_) [[hc]] {\
-            kernel_name(\
-                _p0_,  _p1_,  _p2_,  _p3_,  _p4_,  _p5_,  _p6_,  _p7_,  _p8_,\
-                _p9_, _p10_, _p11_, _p12_, _p13_, _p14_);\
-        }
-    #define make_kernel_lambda_hip_15(\
-        kernel_name,\
-        p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_,\
-           const std::decay_t<decltype(p7)>& _p7_,\
-           const std::decay_t<decltype(p8)>& _p8_,\
-           const std::decay_t<decltype(p9)>& _p9_,\
-           const std::decay_t<decltype(p10)>& _p10_,\
-           const std::decay_t<decltype(p11)>& _p11_,\
-           const std::decay_t<decltype(p12)>& _p12_,\
-           const std::decay_t<decltype(p13)>& _p13_) [[hc]] {\
-            kernel_name(\
-                _p0_,  _p1_,  _p2_,  _p3_,  _p4_,  _p5_,  _p6_,  _p7_,  _p8_,\
-                _p9_, _p10_, _p11_, _p12_, _p13_);\
-        }
-    #define make_kernel_lambda_hip_14(\
-        kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_,\
-           const std::decay_t<decltype(p7)>& _p7_,\
-           const std::decay_t<decltype(p8)>& _p8_,\
-           const std::decay_t<decltype(p9)>& _p9_,\
-           const std::decay_t<decltype(p10)>& _p10_,\
-           const std::decay_t<decltype(p11)>& _p11_,\
-           const std::decay_t<decltype(p12)>& _p12_) [[hc]] {\
-            kernel_name(\
-                _p0_,  _p1_,  _p2_,  _p3_,  _p4_,  _p5_,  _p6_,  _p7_,  _p8_,\
-                _p9_, _p10_, _p11_, _p12_);\
-        }
-    #define make_kernel_lambda_hip_13(\
-        kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_,\
-           const std::decay_t<decltype(p7)>& _p7_,\
-           const std::decay_t<decltype(p8)>& _p8_,\
-           const std::decay_t<decltype(p9)>& _p9_,\
-           const std::decay_t<decltype(p10)>& _p10_,\
-           const std::decay_t<decltype(p11)>& _p11_) [[hc]] {\
-            kernel_name(\
-                _p0_,  _p1_,  _p2_,  _p3_,  _p4_,  _p5_,  _p6_,  _p7_,  _p8_,\
-                _p9_, _p10_, _p11_);\
-        }
-    #define make_kernel_lambda_hip_12(\
-        kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_,\
-           const std::decay_t<decltype(p7)>& _p7_,\
-           const std::decay_t<decltype(p8)>& _p8_,\
-           const std::decay_t<decltype(p9)>& _p9_,\
-           const std::decay_t<decltype(p10)>& _p10_) [[hc]] {\
-            kernel_name(\
-                 _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
-                _p10_);\
-        }
-    #define make_kernel_lambda_hip_11(\
-        kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_,\
-           const std::decay_t<decltype(p7)>& _p7_,\
-           const std::decay_t<decltype(p8)>& _p8_,\
-           const std::decay_t<decltype(p9)>& _p9_) [[hc]] {\
-            kernel_name(\
-                _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_);\
-        }
-    #define make_kernel_lambda_hip_10(\
-        kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_,\
-           const std::decay_t<decltype(p7)>& _p7_,\
-           const std::decay_t<decltype(p8)>& _p8_) [[hc]] {\
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);\
-        }
-    #define make_kernel_lambda_hip_9(\
-        kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_,\
-           const std::decay_t<decltype(p7)>& _p7_) [[hc]] {\
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\
-        }
-    #define make_kernel_lambda_hip_8(kernel_name, p0, p1, p2, p3, p4, p5, p6)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_,\
-           const std::decay_t<decltype(p6)>& _p6_) [[hc]] {\
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\
-        }
-    #define make_kernel_lambda_hip_7(kernel_name, p0, p1, p2, p3, p4, p5)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_,\
-           const std::decay_t<decltype(p5)>& _p5_) [[hc]] {\
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\
-        }
-    #define make_kernel_lambda_hip_6(kernel_name, p0, p1, p2, p3, p4)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_,\
-           const std::decay_t<decltype(p4)>& _p4_) [[hc]] {\
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\
-        }
-    #define make_kernel_lambda_hip_5(kernel_name, p0, p1, p2, p3)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_,\
-           const std::decay_t<decltype(p3)>& _p3_) [[hc]] {\
-            kernel_name(_p0_, _p1_, _p2_, _p3_);\
-        }
-    #define make_kernel_lambda_hip_4(kernel_name, p0, p1, p2)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_,\
-           const std::decay_t<decltype(p2)>& _p2_) [[hc]] {\
-            kernel_name(_p0_, _p1_, _p2_);\
-        }
-    #define make_kernel_lambda_hip_3(kernel_name, p0, p1)\
-        [](const std::decay_t<decltype(p0)>& _p0_,\
-           const std::decay_t<decltype(p1)>& _p1_) [[hc]] {\
-            kernel_name(_p0_, _p1_);\
-        }
-    #define make_kernel_lambda_hip_2(kernel_name, p0)\
-        [](const std::decay_t<decltype(p0)>& _p0_) [[hc]] {\
-            kernel_name(_p0_);\
-        }
-    #define make_kernel_lambda_hip_1(kernel_name)\
-        []() [[hc]] { return kernel_name(hipLaunchParm{}); }
-
-    #define make_kernel_lambda_hip_(...)\
-        overload_macro_hip_(make_kernel_lambda_hip_, __VA_ARGS__)
+    #define hipLaunchNamedKernelGGL(\
+    function_name,\
+    kernel_name,\
+    num_blocks,\
+    dim_blocks,\
+    group_mem_bytes,\
+    stream,\
+    ...)\
+	do {\
+        make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__)\
+            hip_kernel_functor_impl_{__VA_ARGS__};\
+	    hip_impl::grid_launch_hip_(\
+		    num_blocks,\
+		    dim_blocks,\
+		    group_mem_bytes,\
+		    stream,\
+		    #kernel_name,\
+			hip_kernel_functor_impl_);\
+    } while(0)
 
     #define hipLaunchKernelGGL(\
-        kernel_name,\
-        num_blocks,\
-        dim_blocks,\
-        group_mem_bytes,\
-        stream,\
-        ...)\
-    do {\
-        hip_impl::grid_launch_hip_(\
-            num_blocks,\
-            dim_blocks,\
-            group_mem_bytes,\
-            stream,\
-            #kernel_name,\
-            make_kernel_lambda_hip_(kernel_name, __VA_ARGS__),\
-            ##__VA_ARGS__);\
-    } while(0)
+        kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\
+	do {\
+	    hipLaunchNamedKernelGGL(\
+		    unnamed,\
+		    kernel_name,\
+		    num_blocks,\
+		    dim_blocks,\
+		    group_mem_bytes,\
+		    stream,\
+		    ##__VA_ARGS__);\
+	} while (0)
 
     #define hipLaunchKernel(\
-        kernel_name,\
-        num_blocks,\
-        dim_blocks,\
-        group_mem_bytes,\
-        stream,\
-        ...)\
-    do {\
-        hipLaunchKernelGGL(\
-            kernel_name,\
-            num_blocks,\
-            dim_blocks,\
-            group_mem_bytes,\
-            stream,\
-            hipLaunchParm{},\
-            ##__VA_ARGS__);\
-    } while(0)
+        kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\
+	do {\
+	    hipLaunchKernelGGL(\
+		    kernel_name,\
+		    num_blocks,\
+		    dim_blocks,\
+		    group_mem_bytes,\
+		    stream,\
+		    hipLaunchParm{},\
+		    ##__VA_ARGS__);\
+	} while(0)
 }
 #endif //GENERIC_GRID_LAUNCH
diff --git a/projects/hip/include/hip/hcc_detail/helpers.hpp b/projects/hip/include/hip/hcc_detail/helpers.hpp
index 611929766b..b5502c1efb 100644
--- a/projects/hip/include/hip/hcc_detail/helpers.hpp
+++ b/projects/hip/include/hip/hcc_detail/helpers.hpp
@@ -102,9 +102,6 @@ namespace hip_impl
         // Not callable.
         template<FunctionalProcedure F>
         struct is_callable_impl<F, 5u> : std::false_type {};
-
-        template<typename Call>
-        struct is_callable : is_callable_impl<Call> {};
     #else
         template<typename, typename = void>
         struct is_callable_impl : std::false_type {};
@@ -114,6 +111,8 @@ namespace hip_impl
             F(Ts...),
             void_t_<std::result_of_t<F(Ts...)>>> : std::true_type {};
     #endif
+        template<typename Call>
+        struct is_callable : is_callable_impl<Call> {};
 
     #define count_macro_args_impl_hip_(\
          _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15,\
diff --git a/projects/hip/include/hip/hcc_detail/hip_fp16.h b/projects/hip/include/hip/hcc_detail/hip_fp16.h
index 0a861b64af..b1ecc61cb0 100644
--- a/projects/hip/include/hip/hcc_detail/hip_fp16.h
+++ b/projects/hip/include/hip/hcc_detail/hip_fp16.h
@@ -25,19 +25,10 @@ THE SOFTWARE.
 
 #include "hip/hcc_detail/hip_vector_types.h"
 
-#if __clang_major__ > 3
-
 typedef __fp16 __half;
-
-typedef struct __attribute__((aligned(4))){
-  union {
-    __half p[2];
-    unsigned int q;
-  };
-} __half2;
-
-typedef __half half;
-typedef __half2 half2;
+typedef __fp16 __half1 __attribute__((ext_vector_type(1)));
+typedef __fp16 __half2 __attribute__((ext_vector_type(2)));
+typedef __fp16 half;
 
 /*
 Half Arithmetic Functions
@@ -214,10 +205,10 @@ __device__  __half __ushort2half_ru(unsigned short int i);
 __device__  __half __ushort2half_rz(unsigned short int i);
 __device__  __half __ushort_as_half(const unsigned short int i);
 
-extern "C" int __hip_hc_ir_hadd2_int(int, int);
-extern "C" int __hip_hc_ir_hfma2_int(int, int, int);
-extern "C" int __hip_hc_ir_hmul2_int(int, int);
-extern "C" int __hip_hc_ir_hsub2_int(int, int);
+extern "C" __half2 __hip_hc_ir_hadd2_int(__half2, __half2);
+extern "C" __half2 __hip_hc_ir_hfma2_int(__half2, __half2, __half2);
+extern "C" __half2 __hip_hc_ir_hmul2_int(__half2, __half2);
+extern "C" __half2 __hip_hc_ir_hsub2_int(__half2, __half2);
 
 extern "C" __half __hip_hc_ir_hceil_half(__half) __asm("llvm.ceil.f16");
 extern "C" __half __hip_hc_ir_hcos_half(__half) __asm("llvm.cos.f16");
@@ -231,16 +222,16 @@ extern "C" __half __hip_hc_ir_hsin_half(__half) __asm("llvm.sin.f16");
 extern "C" __half __hip_hc_ir_hsqrt_half(__half) __asm("llvm.sqrt.f16");
 extern "C" __half __hip_hc_ir_htrunc_half(__half) __asm("llvm.trunc.f16");
 
-extern "C" int __hip_hc_ir_h2ceil_int(int);
-extern "C" int __hip_hc_ir_h2cos_int(int);
-extern "C" int __hip_hc_ir_h2exp2_int(int);
-extern "C" int __hip_hc_ir_h2floor_int(int);
-extern "C" int __hip_hc_ir_h2log2_int(int);
-extern "C" int __hip_hc_ir_h2rcp_int(int);
-extern "C" int __hip_hc_ir_h2rsqrt_int(int);
-extern "C" int __hip_hc_ir_h2sin_int(int);
-extern "C" int __hip_hc_ir_h2sqrt_int(int);
-extern "C" int __hip_hc_ir_h2trunc_int(int);
+extern "C" __half2 __hip_hc_ir_h2ceil_int(__half2);
+extern "C" __half2 __hip_hc_ir_h2cos_int(__half2);
+extern "C" __half2 __hip_hc_ir_h2exp2_int(__half2);
+extern "C" __half2 __hip_hc_ir_h2floor_int(__half2);
+extern "C" __half2 __hip_hc_ir_h2log2_int(__half2);
+extern "C" __half2 __hip_hc_ir_h2rcp_int(__half2);
+extern "C" __half2 __hip_hc_ir_h2rsqrt_int(__half2);
+extern "C" __half2 __hip_hc_ir_h2sin_int(__half2);
+extern "C" __half2 __hip_hc_ir_h2sqrt_int(__half2);
+extern "C" __half2 __hip_hc_ir_h2trunc_int(__half2);
 
 /*
   Half2 Arithmetic Functions
@@ -248,63 +239,63 @@ extern "C" int __hip_hc_ir_h2trunc_int(int);
 
 __device__ static inline __half2 __hadd2(__half2 a, __half2 b) {
   __half2 c;
-  c.q = __hip_hc_ir_hadd2_int(a.q, b.q);
+  c.xy = __hip_hc_ir_hadd2_int(a.xy, b.xy);
   return c;
 }
 
 __device__ static inline __half2 __hadd2_sat(__half2 a, __half2 b) {
   __half2 c;
-  c.q = __hip_hc_ir_hadd2_int(a.q, b.q);
+  c.xy = __hip_hc_ir_hadd2_int(a.xy, b.xy);
   return c;
 }
 
 __device__ static inline __half2 __hfma2(__half2 a, __half2 b, __half2 c) {
   __half2 d;
-  d.q = __hip_hc_ir_hfma2_int(a.q, b.q, c.q);
+  d.xy = __hip_hc_ir_hfma2_int(a.xy, b.xy, c.xy);
   return d;
 }
 
 __device__ static inline __half2 __hfma2_sat(__half2 a, __half2 b, __half2 c) {
   __half2 d;
-  d.q = __hip_hc_ir_hfma2_int(a.q, b.q, c.q);
+  d.xy = __hip_hc_ir_hfma2_int(a.xy, b.xy, c.xy);
   return d;
 }
 
 __device__ static inline __half2 __hmul2(__half2 a, __half2 b) {
   __half2 c;
-  c.q = __hip_hc_ir_hmul2_int(a.q, b.q);
+  c.xy = __hip_hc_ir_hmul2_int(a.xy, b.xy);
   return c;
 }
 
 __device__ static inline __half2 __hmul2_sat(__half2 a, __half2 b) {
   __half2 c;
-  c.q = __hip_hc_ir_hmul2_int(a.q, b.q);
+  c.xy = __hip_hc_ir_hmul2_int(a.xy, b.xy);
   return c;
 }
 
 __device__ static inline __half2 __hsub2(__half2 a, __half2 b) {
   __half2 c;
-  c.q = __hip_hc_ir_hsub2_int(a.q, b.q);
+  c.xy = __hip_hc_ir_hsub2_int(a.xy, b.xy);
   return c;
 }
 
 __device__ static inline __half2 __hneg2(__half2 a) {
   __half2 c;
-  c.p[0] = - a.p[0];
-  c.p[1] = - a.p[1];
+  c.x = - a.x;
+  c.y = - a.y;
   return c;
 }
 
 __device__ static inline __half2 __hsub2_sat(__half2 a, __half2 b) {
   __half2 c;
-  c.q = __hip_hc_ir_hsub2_int(a.q, b.q);
+  c.xy = __hip_hc_ir_hsub2_int(a.xy, b.xy);
   return c;
 }
 
 __device__ static inline __half2 h2div(__half2 a, __half2 b) {
   __half2 c;
-  c.p[0] = a.p[0] / b.p[0];
-  c.p[1] = a.p[1] / b.p[1];
+  c.x = a.x / b.x;
+  c.y = a.y / b.y;
   return c;
 }
 
@@ -375,112 +366,94 @@ Half2 Math Operations
 
 __device__ static inline __half2 h2ceil(const __half2 h) {
   __half2 a;
-  a.q = __hip_hc_ir_h2ceil_int(h.q);
+  a.xy = __hip_hc_ir_h2ceil_int(h.xy);
   return a;
 }
 
 __device__ static inline __half2 h2cos(const __half2 h) {
   __half2 a;
-  a.q = __hip_hc_ir_h2cos_int(h.q);
+  a.xy = __hip_hc_ir_h2cos_int(h.xy);
   return a;
 }
 
 __device__ static inline __half2 h2exp(const __half2 h) {
   __half2 factor;
-  factor.p[0] = 1.442694;
-  factor.p[1] = 1.442694;
-  factor.q = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.q, factor.q));
+  factor.x = 1.442694;
+  factor.y = 1.442694;
+  factor.xy = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.xy, factor.xy));
   return factor;
 }
 
 __device__ static inline __half2 h2exp10(const __half2 h) {
   __half2 factor;
-  factor.p[0] = 3.3219281;
-  factor.p[1] = 3.3219281;
-  factor.q = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.q, factor.q));
+  factor.x = 3.3219281;
+  factor.y = 3.3219281;
+  factor.xy = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.xy, factor.xy));
   return factor;
 }
 
 __device__ static inline __half2 h2exp2(const __half2 h) {
   __half2 a;
-  a.q = __hip_hc_ir_h2exp2_int(h.q);
+  a.xy = __hip_hc_ir_h2exp2_int(h.xy);
   return a;
 }
 
 __device__ static inline __half2 h2floor(const __half2 h) {
   __half2 a;
-  a.q = __hip_hc_ir_h2floor_int(h.q);
+  a.xy = __hip_hc_ir_h2floor_int(h.xy);
   return a;
 }
 
 __device__ static inline __half2 h2log(const __half2 h) {
   __half2 factor;
-  factor.p[0] = 0.693147;
-  factor.p[1] = 0.693147;
-  factor. q = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.q), factor.q);
+  factor.x = 0.693147;
+  factor.y = 0.693147;
+  factor.xy = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.xy), factor.xy);
   return factor;
 }
 
 __device__ static inline __half2 h2log10(const __half2 h) {
   __half2 factor;
-  factor.p[0] = 0.301029;
-  factor.p[1] = 0.301029;
-  factor.q = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.q),  factor.q);
+  factor.x = 0.301029;
+  factor.y = 0.301029;
+  factor.xy = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.xy),  factor.xy);
   return factor;
 }
 __device__ static inline __half2 h2log2(const __half2 h) {
   __half2 a;
-  a.q = __hip_hc_ir_h2log2_int(h.q);
+  a.xy = __hip_hc_ir_h2log2_int(h.xy);
   return a;
 }
 
 __device__ static inline __half2 h2rcp(const __half2 h) {
   __half2 a;
-  a.q = __hip_hc_ir_h2rcp_int(h.q);
+  a.xy = __hip_hc_ir_h2rcp_int(h.xy);
   return a;
 }
 
 __device__ static inline __half2 h2rsqrt(const __half2 h) {
   __half2 a;
-  a.q = __hip_hc_ir_h2rsqrt_int(h.q);
+  a.xy = __hip_hc_ir_h2rsqrt_int(h.xy);
   return a;
 }
 
 __device__ static inline __half2 h2sin(const __half2 h) {
   __half2 a;
-  a.q = __hip_hc_ir_h2sin_int(h.q);
+  a.xy = __hip_hc_ir_h2sin_int(h.xy);
   return a;
 }
 
 __device__ static inline __half2 h2sqrt(const __half2 h) {
   __half2 a;
-  a.q = __hip_hc_ir_h2sqrt_int(h.q);
+  a.xy = __hip_hc_ir_h2sqrt_int(h.xy);
   return a;
 }
 
 __device__ static inline __half2 h2trunc(const __half2 h) {
   __half2 a;
-  a.q = __hip_hc_ir_h2trunc_int(h.q);
+  a.xy = __hip_hc_ir_h2trunc_int(h.xy);
   return a;
 }
 
-#endif
-
-#if __clang_major__ == 3
-
-typedef struct {
-  unsigned x: 16;
-} __half;
-
-typedef struct __attribute__((aligned(4))){
-  union {
-    __half p[2];
-    unsigned int q;
-  };
-} __half2;
-
-
-#endif
-
 
 #endif
diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime.h b/projects/hip/include/hip/hcc_detail/hip_runtime.h
index 06ce65bc9a..129020d9cd 100644
--- a/projects/hip/include/hip/hcc_detail/hip_runtime.h
+++ b/projects/hip/include/hip/hcc_detail/hip_runtime.h
@@ -41,6 +41,8 @@ THE SOFTWARE.
 #include <stddef.h>
 #endif//__cplusplus
 
+#if __HCC__
+
 // Define NVCC_COMPAT for CUDA compatibility
 #define NVCC_COMPAT
 #define CUDA_SUCCESS hipSuccess
@@ -147,8 +149,15 @@ extern int HIP_TRACE_API;
 #endif /* Device feature flags */
 
 
-//TODO-HCC  this is currently ignored by HCC target of HIP
-#define __launch_bounds__(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)
+#define launch_bounds_impl0(requiredMaxThreadsPerBlock)\
+    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
+#define launch_bounds_impl1(\
+    requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)\
+    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),\
+        amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
+#define select_impl_(_1, _2, impl_, ...) impl_
+#define __launch_bounds__(...) select_impl_(\
+    __VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
 
 // Detect if we are compiling C++ mode or C mode
 #if defined(__cplusplus)
@@ -481,6 +490,6 @@ do {\
  */
 
 
-
+#endif
 
 #endif//HIP_HCC_DETAIL_RUNTIME_H
diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h
index 7a99ff0810..fde38c8395 100644
--- a/projects/hip/include/hip/hcc_detail/hip_runtime_api.h
+++ b/projects/hip/include/hip/hcc_detail/hip_runtime_api.h
@@ -106,21 +106,27 @@ enum hipLimit_t
 #define hipEventBlockingSync        0x1  ///< Waiting will yield CPU.  Power-friendly and usage-friendly but may increase latency.
 #define hipEventDisableTiming       0x2  ///< Disable event's capability to record timing information.  May improve performance.
 #define hipEventInterprocess        0x4  ///< Event can support IPC.  @warning - not supported in HIP.
+#define hipEventReleaseToDevice     0x40000000  /// < Use a device-scope release when recording this event.  This flag is useful to obtain more precise timings of commands between events.  The flag is a no-op on CUDA platforms.
+#define hipEventReleaseToSystem     0x80000000  /// < Use a system-scope release that when recording this event.  This flag is useful to make non-coherent host memory visible to the host.  The flag is a no-op on CUDA platforms.
 
 
 //! Flags that can be used with hipHostMalloc
 #define hipHostMallocDefault        0x0
-#define hipHostMallocPortable       0x1
-#define hipHostMallocMapped         0x2
+#define hipHostMallocPortable       0x1  ///< Memory is considered allocated by all contexts.
+#define hipHostMallocMapped         0x2  ///< Map the allocation into the address space for the current device.  The device pointer can be obtained with #hipHostGetDevicePointer.
 #define hipHostMallocWriteCombined  0x4
+#define hipHostMallocCoherent       0x40000000 ///< Allocate coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation.
+#define hipHostMallocNonCoherent    0x80000000 ///< Allocate non-coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation.
+
 
 //! Flags that can be used with hipHostRegister
 #define hipHostRegisterDefault      0x0  ///< Memory is Mapped and Portable
-#define hipHostRegisterPortable     0x1  ///< Memory is considered registered by all contexts.  HIP only supports one context so this is always assumed true.
+#define hipHostRegisterPortable     0x1  ///< Memory is considered registered by all contexts.
 #define hipHostRegisterMapped       0x2  ///< Map the allocation into the address space for the current device.  The device pointer can be obtained with #hipHostGetDevicePointer.
 #define hipHostRegisterIoMemory     0x4  ///< Not supported.
 
 
+
 #define hipDeviceScheduleAuto       0x0  ///< Automatically select between Spin and Yield
 #define hipDeviceScheduleSpin       0x1  ///< Dedicate a CPU core to spin-wait.  Provides lowest latency, but burns a CPU core and may consume more power.
 #define hipDeviceScheduleYield      0x2  ///< Yield the CPU to the operating system when waiting.  May increase latency, but lowers power and is friendlier to other threads in the system.
@@ -131,6 +137,33 @@ enum hipLimit_t
 #define hipDeviceLmemResizeToMax    0x16
 
 
+/*
+* @brief hipJitOption
+* @enum
+* @ingroup Enumerations
+*/
+typedef enum hipJitOption {
+  hipJitOptionMaxRegisters = 0,
+  hipJitOptionThreadsPerBlock,
+  hipJitOptionWallTime,
+  hipJitOptionInfoLogBuffer,
+  hipJitOptionInfoLogBufferSizeBytes,
+  hipJitOptionErrorLogBuffer,
+  hipJitOptionErrorLogBufferSizeBytes,
+  hipJitOptionOptimizationLevel,
+  hipJitOptionTargetFromContext,
+  hipJitOptionTarget,
+  hipJitOptionFallbackStrategy,
+  hipJitOptionGenerateDebugInfo,
+  hipJitOptionLogVerbose,
+  hipJitOptionGenerateLineInfo,
+  hipJitOptionCacheMode,
+  hipJitOptionSm3xOpt,
+  hipJitOptionFastCompile,
+  hipJitOptionNumOptions
+} hipJitOption;
+
+
 /**
  * @warning On AMD devices and recent Nvidia devices, these hints and controls are ignored.
  */
@@ -385,7 +418,7 @@ hipError_t hipDeviceGetLimit(size_t *pValue, enum hipLimit_t limit);
  * Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache.  This hint is ignored on those architectures.
  *
  */
-hipError_t hipFuncSetCacheConfig ( hipFuncCache_t config );
+hipError_t hipFuncSetCacheConfig (const void* func, hipFuncCache_t config );
 
 /**
  * @brief Returns bank width of shared memory for current device
@@ -601,9 +634,12 @@ hipError_t hipStreamQuery(hipStream_t stream);
  *
  * @return #hipSuccess, #hipErrorInvalidResourceHandle
  *
- * If the null stream is specified, this command blocks until all
+ * This command is host-synchronous : the host will block until the specified stream is empty.
+ *
+ * This command follows standard null-stream semantics.  Specifically, specifying the null stream will cause the 
+ * command to wait for other streams on the same device to complete all pending operations.
+ *
  * This command honors the hipDeviceLaunchBlocking flag, which controls whether the wait is active or blocking.
- * This command is host-synchronous : the host will block until the stream is empty.
  *
  * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamWaitEvent, hipStreamDestroy
  *
@@ -622,10 +658,12 @@ hipError_t hipStreamSynchronize(hipStream_t stream);
  *
  * This function inserts a wait operation into the specified stream.
  * All future work submitted to @p stream will wait until @p event reports completion before beginning execution.
- * This function is host-asynchronous and the function may return before the wait has completed.
+ *
+ * This function only waits for commands in the current stream to complete.  Notably,, this function does 
+ * not impliciy wait for commands in the default stream to complete, even if the specified stream is 
+ * created with hipStreamNonBlocking = 0.  
  *
  * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamSynchronize, hipStreamDestroy
- *
  */
 hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags);
 
@@ -730,10 +768,10 @@ hipError_t hipEventCreate(hipEvent_t* event);
  * the specified stream, after all previous
  * commands in that stream have completed executing.
  *
- * If hipEventRecord() has been previously called aon event, then this call will overwrite any existing state in event.
+ * If hipEventRecord() has been previously called on this event, then this call will overwrite any existing state in event.
  *
  * If this function is called on a an event that is currently being recorded, results are undefined - either
- * outstanding recording may save state into the event, and the order is not guaranteed.  This shoul be avoided.
+ * outstanding recording may save state into the event, and the order is not guaranteed.  
  *
  * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventSynchronize, hipEventDestroy, hipEventElapsedTime
  *
@@ -1308,6 +1346,27 @@ hipError_t hipFreeArray(hipArray* array);
  */
 hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind);
 
+/**
+ *  @brief Copies data between host and device.
+ *
+ *  @param[in]   dst    Destination memory address
+ *  @param[in]   dpitch Pitch of destination memory
+ *  @param[in]   src    Source memory address
+ *  @param[in]   spitch Pitch of source memory
+ *  @param[in]   width  Width of matrix transfer (columns in bytes)
+ *  @param[in]   height Height of matrix transfer (rows)
+ *  @param[in]   kind   Type of transfer
+ *  @param[in]   stream Stream to use
+ *  @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync
+ */
+#if __cplusplus
+hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream = 0);
+#else
+hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream);
+#endif
+
 /**
  *  @brief Copies data between host and device.
  *
@@ -1890,7 +1949,7 @@ hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, con
  * @brief returns device memory pointer and size of the kernel present in the module with symbol @p name
  *
  * @param [out] dptr
- * @param [out[ bytes
+ * @param [out] bytes
  * @param [in] hmod
  * @param [in] name
  *
@@ -1898,7 +1957,6 @@ hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, con
  */
 hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name);
 
-
 /**
  * @brief builds module from code object which resides in host memory. Image is pointer to that location.
  *
@@ -1909,11 +1967,23 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t h
  */
 hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
 
+/**
+* @brief builds module from code object which resides in host memory. Image is pointer to that location. Options are not used. hipModuleLoadData is called.
+*
+* @param [in] image
+* @param [out] module
+* @param [in] number of options
+* @param [in] options for JIT
+* @param [in] option values for JIT
+*
+* @returns hipSuccess, hipErrorNotInitialized, hipErrorOutOfMemory, hipErrorNotInitialized
+*/
+hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues);
 
 /**
  * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra
  *
- * @param [in[ f	 Kernel to launch.
+ * @param [in] f         Kernel to launch.
  * @param [in] gridDimX  X grid dimension specified as multiple of blockDimX.
  * @param [in] gridDimY  Y grid dimension specified as multiple of blockDimY.
  * @param [in] gridDimZ  Z grid dimension specified as multiple of blockDimZ.
@@ -1921,7 +1991,7 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
  * @param [in] blockDimY Y grid dimension specified in work-items
  * @param [in] blockDimZ Z grid dimension specified in work-items
  * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel.  The kernel can access this with HIP_DYNAMIC_SHARED.
- * @param [in] stream Stream where the kernel should be dispatched.  May be 0, in which case th default stream is used with associated synchronization rules.
+ * @param [in] stream    Stream where the kernel should be dispatched.  May be 0, in which case th default stream is used with associated synchronization rules.
  * @param [in] kernelParams
  * @param [in] extra     Pointer to kernel arguments.   These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel.
  *
diff --git a/projects/hip/include/hip/hcc_detail/hip_vector_types.h b/projects/hip/include/hip/hcc_detail/hip_vector_types.h
index 35c6c23548..9da34d9f32 100644
--- a/projects/hip/include/hip/hcc_detail/hip_vector_types.h
+++ b/projects/hip/include/hip/hcc_detail/hip_vector_types.h
@@ -37,38 +37,41 @@ THE SOFTWARE.
 #define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \
 __device__ __host__ type() {} \
 __device__ __host__ type(type& val) : x(val.x) { } \
-__device__ __host__ type(const type& val) : x(val.x) { }
+__device__ __host__ type(const type& val) : x(val.x) { } \
+__device__ __host__ ~type() {} 
 
 #define MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(type) \
 __device__ __host__ type() {} \
 __device__ __host__ type(type& val) : x(val.x), y(val.y) { } \
-__device__ __host__ type(const type& val) : x(val.x), y(val.y) { }
+__device__ __host__ type(const type& val) : x(val.x), y(val.y) { } \
+__device__ __host__ ~type() {}
 
 #define MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(type) \
 __device__ __host__ type() {} \
 __device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z) { } \
-__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { }
+__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { } \
+__device__ __host__ ~type() {} 
 
 #define MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(type) \
 __device__ __host__ type() {} \
 __device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \
-__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { }
-
+__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \
+__device__ __host__ ~type() {}
 
 #define MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(type, type1) \
 __device__ __host__ type(type1 val) : x(val) {} \
 
 #define MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(type, type1) \
 __device__ __host__ type(type1 val) : x(val), y(val) {} \
-__device__ __host__ type(type1 val1, type1 val2) : x(val1), y(val2) {}
+__device__ __host__ type(type1 val1, type1 val2) : x(val1), y(val2) {} \
 
 #define MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(type, type1) \
 __device__ __host__ type(type1 val) : x(val), y(val), z(val) {} \
-__device__ __host__ type(type1 val1, type1 val2, type1 val3) : x(val1), y(val2), z(val3) {}
+__device__ __host__ type(type1 val1, type1 val2, type1 val3) : x(val1), y(val2), z(val3) {} \
 
 #define MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(type, type1) \
 __device__ __host__ type(type1 val) : x(val), y(val), z(val), w(val) {} \
-__device__ __host__ type(type1 val1, type1 val2, type1 val3, type1 val4) : x(val1), y(val2), z(val3), w(val4) {}
+__device__ __host__ type(type1 val1, type1 val2, type1 val3, type1 val4) : x(val1), y(val2), z(val3), w(val4) {} \
 
 struct uchar1 {
   #ifdef __cplusplus
@@ -4115,4 +4118,5 @@ DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed long long)
 
 #endif
 
+
 #endif
diff --git a/projects/hip/include/hip/hcc_detail/host_defines.h b/projects/hip/include/hip/hcc_detail/host_defines.h
index 5864cfa0e7..140cbb0678 100644
--- a/projects/hip/include/hip/hcc_detail/host_defines.h
+++ b/projects/hip/include/hip/hcc_detail/host_defines.h
@@ -48,7 +48,7 @@ THE SOFTWARE.
 #define __global__  __attribute__((hc_grid_launch)) __attribute__((used))
 #else
 //#warning "GGL global define reached"
-#define __global__ __attribute__((hc, weak))
+#define __global__ __attribute__((annotate("hip__global__"), hc, used))
 #endif //GENERIC_GRID_LAUNCH
 
 #define __noinline__      __attribute__((noinline))
diff --git a/projects/hip/include/hip/hip_runtime_api.h b/projects/hip/include/hip/hip_runtime_api.h
index 8eae1d6a3a..dc163d5c25 100644
--- a/projects/hip/include/hip/hip_runtime_api.h
+++ b/projects/hip/include/hip/hip_runtime_api.h
@@ -250,6 +250,7 @@ typedef enum hipDeviceAttribute_t {
     hipDeviceAttributeIsMultiGpuBoard,                      ///< Multiple GPU devices.
 } hipDeviceAttribute_t;
 
+
 /**
  *     @}
  */
diff --git a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h
index 0cc40f32af..f92523a3e3 100644
--- a/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h
+++ b/projects/hip/include/hip/nvcc_detail/hip_runtime_api.h
@@ -54,24 +54,49 @@ hipMemcpyHostToHost
 #define hipFilterModePoint cudaFilterModePoint
 
 //! Flags that can be used with hipEventCreateWithFlags:
-#define hipEventDefault             cudaEventDefault
-#define hipEventBlockingSync        cudaEventBlockingSync
-#define hipEventDisableTiming       cudaEventDisableTiming
-#define hipEventInterprocess        cudaEventInterprocess
+#define hipEventDefault              cudaEventDefault
+#define hipEventBlockingSync         cudaEventBlockingSync
+#define hipEventDisableTiming        cudaEventDisableTiming
+#define hipEventInterprocess         cudaEventInterprocess
+#define hipEventReleaseToDevice      0  /* no-op on CUDA platform */
+#define hipEventReleaseToSystem      0  /* no-op on CUDA platform */
 
-#define hipHostMallocDefault cudaHostAllocDefault
-#define hipHostMallocPortable cudaHostAllocPortable
-#define hipHostMallocMapped cudaHostAllocMapped
+
+#define hipHostMallocDefault       cudaHostAllocDefault
+#define hipHostMallocPortable      cudaHostAllocPortable
+#define hipHostMallocMapped        cudaHostAllocMapped
 #define hipHostMallocWriteCombined cudaHostAllocWriteCombined
+#define hipHostMallocCoherent      0x0 
+#define hipHostMallocNonCoherent   0x0
 
 #define hipHostRegisterPortable cudaHostRegisterPortable
-#define hipHostRegisterMapped cudaHostRegisterMapped
+#define hipHostRegisterMapped   cudaHostRegisterMapped
 
 #define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER
-#define HIP_LAUNCH_PARAM_BUFFER_SIZE     CU_LAUNCH_PARAM_BUFFER_SIZE
+#define HIP_LAUNCH_PARAM_BUFFER_SIZE    CU_LAUNCH_PARAM_BUFFER_SIZE
 #define HIP_LAUNCH_PARAM_END            CU_LAUNCH_PARAM_END
 #define hipLimitMallocHeapSize          cudaLimitMallocHeapSize
-#define hipIpcMemLazyEnablePeerAccess          cudaIpcMemLazyEnablePeerAccess
+#define hipIpcMemLazyEnablePeerAccess   cudaIpcMemLazyEnablePeerAccess
+
+// enum CUjit_option redefines
+#define hipJitOptionMaxRegisters            CU_JIT_MAX_REGISTERS
+#define hipJitOptionThreadsPerBlock         CU_JIT_THREADS_PER_BLOCK
+#define hipJitOptionWallTime                CU_JIT_WALL_TIME
+#define hipJitOptionInfoLogBuffer           CU_JIT_INFO_LOG_BUFFER
+#define hipJitOptionInfoLogBufferSizeBytes  CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
+#define hipJitOptionErrorLogBuffer          CU_JIT_ERROR_LOG_BUFFER
+#define hipJitOptionErrorLogBufferSizeBytes CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
+#define hipJitOptionOptimizationLevel       CU_JIT_OPTIMIZATION_LEVEL
+#define hipJitOptionTargetFromContext       CU_JIT_TARGET_FROM_CUCONTEXT
+#define hipJitOptionTarget                  CU_JIT_TARGET
+#define hipJitOptionFallbackStrategy        CU_JIT_FALLBACK_STRATEGY
+#define hipJitOptionGenerateDebugInfo       CU_JIT_GENERATE_DEBUG_INFO
+#define hipJitOptionLogVerbose              CU_JIT_LOG_VERBOSE
+#define hipJitOptionGenerateLineInfo        CU_JIT_GENERATE_LINE_INFO
+#define hipJitOptionCacheMode               CU_JIT_CACHE_MODE
+#define hipJitOptionSm3xOpt                 CU_JIT_NEW_SM3X_OPT
+#define hipJitOptionFastCompile             CU_JIT_FAST_COMPILE
+#define hipJitOptionNumOptions              CU_JIT_NUM_OPTIONS
 
 typedef cudaEvent_t hipEvent_t;
 typedef cudaStream_t hipStream_t;
@@ -82,6 +107,7 @@ typedef cudaFuncCache hipFuncCache_t;
 typedef CUcontext hipCtx_t;
 typedef CUsharedconfig hipSharedMemConfig;
 typedef CUfunc_cache hipFuncCache;
+typedef CUjit_option hipJitOption;
 typedef CUdevice hipDevice_t;
 typedef CUmodule hipModule_t;
 typedef CUfunction hipFunction_t;
@@ -202,6 +228,10 @@ inline static hipError_t hipMalloc(void** ptr, size_t size) {
     return hipCUDAErrorTohipError(cudaMalloc(ptr, size));
 }
 
+inline static hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height) {
+    return hipCUDAErrorTohipError(cudaMallocPitch(ptr, pitch, width, height));
+}
+
 inline static hipError_t hipFree(void* ptr) {
     return hipCUDAErrorTohipError(cudaFree(ptr));
 }
@@ -345,7 +375,11 @@ inline static hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolN
 }
 
 inline static hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind){
-  return hipCUDAErrorTohipError(cudaMemcpy2D(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind)));
+    return hipCUDAErrorTohipError(cudaMemcpy2D(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind)));
+}
+
+inline static hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) {
+    return hipCUDAErrorTohipError(cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind),stream));
 }
 
 inline static hipError_t hipMemcpy2DToArray(hipArray *dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind){
@@ -884,6 +918,11 @@ inline static hipError_t hipModuleLoadData(hipModule_t *module, const void *imag
     return hipCUResultTohipError(cuModuleLoadData(module, image));
 }
 
+inline static hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues)
+{
+  return hipCUResultTohipError(cuModuleLoadDataEx(module, image, numOptions, options, optionValues));
+}
+
 inline static hipError_t hipModuleLaunchKernel(hipFunction_t f,
       unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
       unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
@@ -897,6 +936,10 @@ inline static hipError_t hipModuleLaunchKernel(hipFunction_t f,
 }
 
 
+inline static hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t cacheConfig)
+{
+    return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig));
+}
 
 #ifdef __cplusplus
 }
diff --git a/projects/hip/samples/2_Cookbook/10_inline_asm/Makefile b/projects/hip/samples/2_Cookbook/10_inline_asm/Makefile
new file mode 100644
index 0000000000..77a7699635
--- /dev/null
+++ b/projects/hip/samples/2_Cookbook/10_inline_asm/Makefile
@@ -0,0 +1,35 @@
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+	HIP_PATH=../../..
+endif
+
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+TARGET=hcc
+
+SOURCES = inline_asm.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+EXECUTABLE=./inline_asm
+
+.PHONY: test
+
+
+all: $(EXECUTABLE) test
+
+CXXFLAGS =-g
+CXX=$(HIPCC)
+
+
+$(EXECUTABLE): $(OBJECTS)
+	$(HIPCC) $(OBJECTS) -o $@
+
+
+test: $(EXECUTABLE)
+	$(EXECUTABLE)
+
+
+clean:
+	rm -f $(EXECUTABLE)
+	rm -f $(OBJECTS)
+	rm -f $(HIP_PATH)/src/*.o
diff --git a/projects/hip/samples/2_Cookbook/10_inline_asm/Readme.md b/projects/hip/samples/2_Cookbook/10_inline_asm/Readme.md
new file mode 100644
index 0000000000..8c98547220
--- /dev/null
+++ b/projects/hip/samples/2_Cookbook/10_inline_asm/Readme.md
@@ -0,0 +1,47 @@
+## inline asm  ###
+
+This tutorial is about how to use inline GCN asm in kernel. In this tutorial, we'll explain how to by using the simple Matrix Transpose.
+
+## Introduction:
+
+If you want to take advantage of the extra performance benefits of writing in assembly as well as take advantage of special GPU hardware features that were only available through assemby, then this tutorial is for you. In this tutorial we'll be explaining how to start writing inline asm in kernel.
+
+For more insight Please read the following blogs by Ben Sander 
+[The Art of AMDGCN Assembly: How to Bend the Machine to Your Will](gpuopen.com/amdgcn-assembly)
+[AMD GCN Assembly: Cross-Lane Operations](http://gpuopen.com/amd-gcn-assembly-cross-lane-operations/)
+
+For more information:
+[AMD GCN3 ISA Architecture Manual](http://gpuopen.com/compute-product/amd-gcn3-isa-architecture-manual/)
+[User Guide for AMDGPU Back-end](llvm.org/docs/AMDGPUUsage.html)
+
+## Requirement:
+For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) 
+
+## prerequiste knowledge:
+
+Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming.
+
+## Simple Matrix Transpose 
+
+We will be using the Simple Matrix Transpose application from the our very first tutorial.
+
+## asm() Assembler statement
+
+We insert the GCN isa into the kernel using asm() Assembler statement. In the same sourcecode, we used for MatrixTranspose. We'll add the following:
+
+`  asm volatile ("v_mov_b32_e32 %0, %1" : "=v" (out[x*width + y]) : "v" (in[y*width + x]));                    `
+
+## How to build and run:
+Use the make command and execute it using ./exe
+Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia.
+
+
+## More Info:
+- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md)
+- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md)
+- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP)
+- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md)
+- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL)
+- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md)
+- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md)
+- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md)
diff --git a/projects/hip/samples/2_Cookbook/10_inline_asm/inline_asm.cpp b/projects/hip/samples/2_Cookbook/10_inline_asm/inline_asm.cpp
new file mode 100644
index 0000000000..2b4fc3de90
--- /dev/null
+++ b/projects/hip/samples/2_Cookbook/10_inline_asm/inline_asm.cpp
@@ -0,0 +1,174 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+#define WIDTH     1024
+
+#define NUM       (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X  4
+#define THREADS_PER_BLOCK_Y  4
+#define THREADS_PER_BLOCK_Z  1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width)
+{
+ 
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+
+    asm volatile ("v_mov_b32_e32 %0, %1" : "=v" (out[x*width + y]) : "v" (in[y*width + x]));
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+    float * output,
+    float * input,
+    const unsigned int width)
+{
+    for(unsigned int j=0; j < width; j++)
+    {
+        for(unsigned int i=0; i < width; i++)
+        {
+            output[i*width + j] = input[j*width + i];
+        }
+    }
+}
+
+int main() {
+
+  float* Matrix;
+  float* TransposeMatrix;
+  float* cpuTransposeMatrix;
+
+  float* gpuMatrix;
+  float* gpuTransposeMatrix;
+
+  hipDeviceProp_t devProp;
+  hipGetDeviceProperties(&devProp, 0);
+
+  std::cout << "Device name " << devProp.name << std::endl;
+
+  hipEvent_t start, stop;
+  hipEventCreate(&start);
+  hipEventCreate(&stop);
+  float eventMs = 1.0f;
+
+  int i;
+  int errors;
+
+  Matrix = (float*)malloc(NUM * sizeof(float));
+  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    Matrix[i] = (float)i*10.0f;
+  }
+
+  // allocate the memory on the device side
+  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+  // Record the start event
+  hipEventRecord(start, NULL);
+
+  // Memory transfer from host to device
+  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+  // Record the stop event
+  hipEventRecord(stop, NULL);
+  hipEventSynchronize(stop);
+
+  hipEventElapsedTime(&eventMs, start, stop);
+
+  printf ("hipMemcpyHostToDevice time taken  = %6.3fms\n", eventMs);
+
+  // Record the start event
+  hipEventRecord(start, NULL);
+
+  // Lauching kernel from host
+  hipLaunchKernel(matrixTranspose,
+                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                  0, 0,
+                  gpuTransposeMatrix , gpuMatrix, WIDTH);
+
+  // Record the stop event
+  hipEventRecord(stop, NULL);
+  hipEventSynchronize(stop);
+
+  hipEventElapsedTime(&eventMs, start, stop);
+
+  printf ("kernel Execution time             = %6.3fms\n", eventMs);
+
+  // Record the start event
+  hipEventRecord(start, NULL);
+
+  // Memory transfer from device to host
+  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+  // Record the stop event
+  hipEventRecord(stop, NULL);
+  hipEventSynchronize(stop);
+
+  hipEventElapsedTime(&eventMs, start, stop);
+
+  printf ("hipMemcpyDeviceToHost time taken  = %6.3fms\n", eventMs);
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+  // verify the results
+  errors = 0;
+  double eps = 1.0E-6;
+  for (i = 0; i < NUM; i++) {
+    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("gpu%f cpu %f \n",TransposeMatrix[i],cpuTransposeMatrix[i]);
+      errors++;
+    }
+  }
+  if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+  } else {
+    printf ("PASSED!\n");
+  }
+
+  //free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);
+
+  //free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+
+  return errors;
+}
diff --git a/projects/hip/samples/2_Cookbook/8_peer2peer/peer2peer.cpp b/projects/hip/samples/2_Cookbook/8_peer2peer/peer2peer.cpp
index 990599e1cb..0f532a2f0a 100644
--- a/projects/hip/samples/2_Cookbook/8_peer2peer/peer2peer.cpp
+++ b/projects/hip/samples/2_Cookbook/8_peer2peer/peer2peer.cpp
@@ -55,13 +55,9 @@ void checkPeer2PeerSupport()
 {
     int gpuCount;
     int canAccessPeer;
-    int p2pCapableDeviceCount=0;
 
     HIPCHECK(hipGetDeviceCount(&gpuCount));
 
-    if (gpuCount < 2)
-        printf("Peer2Peer application requires atleast 2 gpu devices");
-
     for (int currentGpu=0; currentGpu<gpuCount; currentGpu++)
     {
         HIPCHECK(hipSetDevice(currentGpu));
@@ -161,6 +157,12 @@ int main(){
 
     HIPCHECK(hipGetDeviceCount(&gpuCount));
 
+    if (gpuCount < 2)
+    {
+        printf("Peer2Peer application requires atleast 2 gpu devices");
+        return 0;
+    }
+
     currentGpu = 0;
     peerGpu = (currentGpu + 1);
 
diff --git a/projects/hip/samples/2_Cookbook/9_unroll/Makefile b/projects/hip/samples/2_Cookbook/9_unroll/Makefile
new file mode 100644
index 0000000000..b71f3d8353
--- /dev/null
+++ b/projects/hip/samples/2_Cookbook/9_unroll/Makefile
@@ -0,0 +1,39 @@
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+	HIP_PATH=../../..
+endif
+
+ifeq (gfx701, $(findstring gfx701,$(HCC_AMDGPU_TARGET)))
+	$(error gfx701 is not a supported device for this sample)
+endif
+
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+TARGET=hcc
+
+SOURCES = unroll.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+EXECUTABLE=./unroll
+
+.PHONY: test
+
+
+all: $(EXECUTABLE) test
+
+CXXFLAGS =-g
+CXX=$(HIPCC)
+
+
+$(EXECUTABLE): $(OBJECTS)
+	$(HIPCC) $(OBJECTS) -o $@
+
+
+test: $(EXECUTABLE)
+	$(EXECUTABLE)
+
+
+clean:
+	rm -f $(EXECUTABLE)
+	rm -f $(OBJECTS)
+	rm -f $(HIP_PATH)/src/*.o
diff --git a/projects/hip/samples/2_Cookbook/9_unroll/Readme.md b/projects/hip/samples/2_Cookbook/9_unroll/Readme.md
new file mode 100644
index 0000000000..3c2635c0eb
--- /dev/null
+++ b/projects/hip/samples/2_Cookbook/9_unroll/Readme.md
@@ -0,0 +1,48 @@
+## Using Pragma unroll ###
+
+In this tutorial, we'll explain how to use #pragma unroll to improve the performance. 
+
+## Introduction:
+
+Loop unrolling optimization hints can be specified with #pragma unroll and #pragma nounroll. The pragma is placed immediately before a for loop.
+Specifying #pragma unroll without a parameter directs the loop unroller to attempt to fully unroll the loop if the trip count is known at compile time and attempt to partially unroll the loop if the trip count is not known at compile time.
+
+## Requirement:
+For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) 
+
+## prerequiste knowledge:
+
+Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming.
+
+## Simple Matrix Transpose 
+
+For this tutorial we will be using MatrixTranspose with shfl operation i.e., our 4_shfl tutorial since it is the only examples where we used loops inside the kernel.
+
+In this tutorial, we'll use `#pragma unroll`. In the same sourcecode, we used for MatrixTranspose. We'll add it just before the for loop as following:
+
+`#pragma unroll                                                        `
+`    for(int i=0;i<width;i++)                                          `
+`    {                                                                 `
+`        for(int j=0;j<width;j++)                                      `
+`            out[i*width + j] = __shfl(val,j*width + i);               `
+`    }                                                                 `
+
+Specifying the optional parameter, #pragma unroll value, directs the unroller to unroll the loop value times. Be careful while using it.
+Specifying #pragma nounroll indicates that the loop should not be unroll. #pragma unroll 1 will show the same behaviour.
+
+## How to build and run:
+Use the make command and execute it using ./exe
+Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia.
+
+## requirement for nvidia
+please make sure you have a 3.0 or higher compute capable device in order to use warp shfl operations and add `-gencode arch=compute=30, code=sm_30` nvcc flag in the Makefile while using this application.
+
+## More Info:
+- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md)
+- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md)
+- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP)
+- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md)
+- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL)
+- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md)
+- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md)
+- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md)
diff --git a/projects/hip/samples/2_Cookbook/9_unroll/unroll.cpp b/projects/hip/samples/2_Cookbook/9_unroll/unroll.cpp
new file mode 100644
index 0000000000..22f1c75e6e
--- /dev/null
+++ b/projects/hip/samples/2_Cookbook/9_unroll/unroll.cpp
@@ -0,0 +1,141 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+
+#define WIDTH     4
+
+#define NUM       (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X  4
+#define THREADS_PER_BLOCK_Y  4
+#define THREADS_PER_BLOCK_Z  1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width)
+{
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    float val = in[x];
+
+#pragma unroll
+    for(int i=0;i<width;i++)
+    {
+        for(int j=0;j<width;j++)
+            out[i*width + j] = __shfl(val,j*width + i);
+    }
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+    float * output,
+    float * input,
+    const unsigned int width)
+{
+    for(unsigned int j=0; j < width; j++)
+    {
+        for(unsigned int i=0; i < width; i++)
+        {
+            output[i*width + j] = input[j*width + i];
+        }
+    }
+}
+
+int main() {
+
+  float* Matrix;
+  float* TransposeMatrix;
+  float* cpuTransposeMatrix;
+
+  float* gpuMatrix;
+  float* gpuTransposeMatrix;
+
+  hipDeviceProp_t devProp;
+  hipGetDeviceProperties(&devProp, 0);
+
+  std::cout << "Device name " << devProp.name << std::endl;
+
+  int i;
+  int errors;
+
+  Matrix = (float*)malloc(NUM * sizeof(float));
+  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    Matrix[i] = (float)i*10.0f;
+  }
+
+  // allocate the memory on the device side
+  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+  // Memory transfer from host to device
+  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+  // Lauching kernel from host
+  hipLaunchKernel(matrixTranspose,
+                  dim3(1),
+                  dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
+                  0, 0,
+                  gpuTransposeMatrix , gpuMatrix, WIDTH);
+
+  // Memory transfer from device to host
+  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+  // verify the results
+  errors = 0;
+  double eps = 1.0E-6;
+  for (i = 0; i < NUM; i++) {
+    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+      errors++;
+    }
+  }
+  if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+  } else {
+    printf ("PASSED!\n");
+  }
+
+  //free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);
+
+  //free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+
+  return errors;
+}
diff --git a/projects/hip/src/device_util.cpp b/projects/hip/src/device_util.cpp
index 8ce53765b5..e59a44e5ba 100644
--- a/projects/hip/src/device_util.cpp
+++ b/projects/hip/src/device_util.cpp
@@ -26,6 +26,7 @@ THE SOFTWARE.
 #include "device_util.h"
 #include "hip/hcc_detail/device_functions.h"
 #include "hip/hip_runtime.h"
+#include <atomic>
 
 //=================================================================================================
 /*
@@ -923,24 +924,45 @@ __device__  unsigned long long int atomicMax(unsigned long long int* address,
 }
 
 //atomicCAS()
+template<typename T>
+__device__ T atomicCAS_impl(T* address, T compare, T val)
+{
+  // the implementation assumes the atomic is lock-free and 
+  // has the same size as the non-atmoic equivalent type
+  static_assert(sizeof(T) == sizeof(std::atomic<T>)
+                , "size mismatch between atomic and non-atomic types");
+
+  union {
+    T*              address;
+    std::atomic<T>* atomic_address;
+  } u;
+  u.address = address;
+
+  T expected = compare;
+
+  // hcc should generate a system scope atomic CAS 
+  std::atomic_compare_exchange_weak_explicit(u.atomic_address
+                                            , &expected, val
+                                            , std::memory_order_acq_rel
+                                            , std::memory_order_relaxed);
+  return expected;
+}
+
 __device__  int atomicCAS(int* address, int compare, int val)
 {
-	hc::atomic_compare_exchange(address,&compare,val);
-	return *address;
+  return atomicCAS_impl(address, compare, val);
 }
 __device__  unsigned int atomicCAS(unsigned int* address,
                        unsigned int compare,
                        unsigned int val)
 {
-	hc::atomic_compare_exchange(address,&compare,val);
-	return *address;
+  return atomicCAS_impl(address, compare, val);
 }
 __device__  unsigned long long int atomicCAS(unsigned long long int* address,
                                  unsigned long long int compare,
                                  unsigned long long int val)
 {
-	hc::atomic_compare_exchange((uint64_t*)address,(uint64_t*)&compare,(uint64_t)val);
-	return *address;
+  return atomicCAS_impl(address, compare, val);
 }
 
 //atomicAnd()
@@ -1163,18 +1185,18 @@ __device__ double __hip_precise_dsqrt_rz(double x) {
   return hc::precise_math::sqrt(x);
 }
 
-#define LOG_BASE2_E_DIV_2 0.4426950408894701
-#define LOG_BASE2_5 2.321928094887362
+#define LOG_BASE2_E 1.4426950408889634
+#define LOG_BASE2_10 3.32192809488736
 #define ONE_DIV_LOG_BASE2_E 0.69314718056
 #define ONE_DIV_LOG_BASE2_10 0.30102999566
 
 // Fast Math Intrinsics
 __device__ float __hip_fast_exp10f(float x) {
-  return __hip_fast_exp2f(x*LOG_BASE2_E_DIV_2);
+  return __hip_fast_exp2f(x*LOG_BASE2_E);
 }
 
 __device__ float __hip_fast_expf(float x) {
-  return __hip_fast_expf(x*LOG_BASE2_5);
+  return __hip_fast_exp2f(x*LOG_BASE2_10);
 }
 
 __device__ float __hip_fast_frsqrt_rn(float x) {
@@ -1215,20 +1237,23 @@ __device__ float __hip_fast_tanf(float x) {
 }
 
 // Double Precision Math
+// FIXME - HCC doesn't have a fast_math version double FP sqrt
+// Another issue is that these intrinsics call for a specific rounding mode;
+// however, their implementation all map to the same sqrt builtin
 __device__ double __hip_fast_dsqrt_rd(double x) {
-  return hc::fast_math::sqrt(x);
+  return hc::precise_math::sqrt(x);
 }
 
 __device__ double __hip_fast_dsqrt_rn(double x) {
-  return hc::fast_math::sqrt(x);
+  return hc::precise_math::sqrt(x);
 }
 
 __device__ double __hip_fast_dsqrt_ru(double x) {
-  return hc::fast_math::sqrt(x);
+  return hc::precise_math::sqrt(x);
 }
 
 __device__ double __hip_fast_dsqrt_rz(double x) {
-  return hc::fast_math::sqrt(x);
+  return hc::precise_math::sqrt(x);
 }
 
 __device__ void  __threadfence_system(void){
diff --git a/projects/hip/src/grid_launch.cpp b/projects/hip/src/grid_launch.cpp
index cac01df7dc..f3b28c5f60 100644
--- a/projects/hip/src/grid_launch.cpp
+++ b/projects/hip/src/grid_launch.cpp
@@ -52,9 +52,9 @@ namespace hip_impl
         int group_mem_bytes,
         hipStream_t stream)
     {
-        if ((HIP_TRACE_API & (1 << TRACE_CMD)) ||
+        if ((HIP_TRACE_API & (1 << TRACE_KCMD)) ||
             HIP_PROFILE_API ||
-            (COMPILE_HIP_DB && HIP_TRACE_API)) {
+            (COMPILE_HIP_DB && (HIP_TRACE_API & (1<<TRACE_ALL)))) {
             std::stringstream os;
             os  << tls_tidInfo.tid() << "." << tls_tidInfo.apiSeqNum()
                 << " hipLaunchKernel '" << kernel_name << "'"
diff --git a/projects/hip/src/hip_device.cpp b/projects/hip/src/hip_device.cpp
index 88d94411e8..93c1c20484 100644
--- a/projects/hip/src/hip_device.cpp
+++ b/projects/hip/src/hip_device.cpp
@@ -112,7 +112,7 @@ hipError_t hipDeviceGetLimit (size_t *pValue, hipLimit_t limit)
     }
 }
 
-hipError_t hipFuncSetCacheConfig (hipFuncCache_t cacheConfig)
+hipError_t hipFuncSetCacheConfig (const void* func, hipFuncCache_t cacheConfig)
 {
     HIP_INIT_API(cacheConfig);
 
@@ -298,7 +298,7 @@ hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, int device)
 hipError_t hipGetDeviceProperties(hipDeviceProp_t* props, int device)
 {
     HIP_INIT_API(props, device);
-    return ihipGetDeviceProperties(props, device);
+    return ihipLogStatus(ihipGetDeviceProperties(props, device));
 }
 
 hipError_t hipSetDeviceFlags( unsigned int flags)
diff --git a/projects/hip/src/hip_event.cpp b/projects/hip/src/hip_event.cpp
index 61ac5cd3ab..ab1c43a00b 100644
--- a/projects/hip/src/hip_event.cpp
+++ b/projects/hip/src/hip_event.cpp
@@ -42,25 +42,29 @@ ihipEvent_t::ihipEvent_t(unsigned flags)
 
 
 // Attach to an existing completion future:
-void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType)
+void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, 
+                                           hipStream_t stream, ihipEventType_t eventType)
 {
     _state  = hipEventStatusRecording;
     _marker = *cf;
     _type   = eventType;
+    _stream = stream;
 }
 
 
 
-void ihipEvent_t::setTimestamp()
+void ihipEvent_t::refereshEventStatus()
 {
-    if (_state == hipEventStatusRecorded) {
-        // already recorded, done:
-        return;
-    } else {
+    bool isReady0 = _marker.is_ready();
+    bool isReady1;
+    int val = 0;
+    if (_state == hipEventStatusRecording) {
         // TODO - use completion-future functions to obtain ticks and timestamps:
         hsa_signal_t *sig  = static_cast<hsa_signal_t*> (_marker.get_native_handle());
+        isReady1 = _marker.is_ready();
         if (sig) {
-            if (hsa_signal_load_acquire(*sig) == 0) {
+            val = hsa_signal_load_acquire(*sig);
+            if (val == 0) {
 
                 if ((_type == hipEventTypeIndependent) || (_type == hipEventTypeStopCommand)) {
                     _timestamp =  _marker.get_end_tick();
@@ -71,10 +75,14 @@ void ihipEvent_t::setTimestamp()
                     _timestamp =  0;
                 }
 
-                _state = hipEventStatusRecorded;
+                _state = hipEventStatusComplete;
             }
         }
     }
+
+    if (_state != hipEventStatusComplete) {
+        //printf (" not ready isReady0=%d val=%d isReady1=%d\n", isReady0, val, isReady1);
+    }
 }
 
 
@@ -83,11 +91,19 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags)
     hipError_t e = hipSuccess;
 
     // TODO-IPC - support hipEventInterprocess.
-    unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming;
-    if ((flags & ~supportedFlags) == 0) {
-        ihipEvent_t *eh = new ihipEvent_t(flags);
+    unsigned supportedFlags =   hipEventDefault 
+                              | hipEventBlockingSync 
+                              | hipEventDisableTiming 
+                              | hipEventReleaseToDevice 
+                              | hipEventReleaseToSystem 
+                              ;
+    const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem);
 
-        *event = eh;
+    const bool illegalFlags = (flags & ~supportedFlags) ||            // can't set any unsupported flags.
+                              (flags & releaseFlags) == releaseFlags; // can't set both release flags
+
+    if (!illegalFlags) {
+        *event = new ihipEvent_t(flags);
     } else {
         e = hipErrorInvalidValue;
     }
@@ -114,17 +130,20 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
     HIP_INIT_API(event, stream);
 
     if (event && event->_state != hipEventStatusUnitialized)   {
+        stream = ihipSyncAndResolveStream(stream);
+
         event->_stream = stream;
 
-        if (stream == NULL) {
-            // If stream == NULL, wait on all queues.
-            // TODO-HCC fix this - is this conservative or still uses device timestamps?
-            // TODO-HCC can we use barrier or event marker to implement better solution?
+        if (HIP_SYNC_NULL_STREAM && stream->isDefaultStream()) {
+
+            // TODO-HIP_SYNC_NULL_STREAM : can remove this code when HIP_SYNC_NULL_STREAM = 0
+            
+            // If default stream , then wait on all queues.
             ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
-            ctx->locked_syncDefaultStream(true);
+            ctx->locked_syncDefaultStream(true, true);
 
             event->_timestamp = hc::get_system_ticks();
-            event->_state = hipEventStatusRecorded;
+            event->_state = hipEventStatusComplete;
             return ihipLogStatus(hipSuccess);
         } else {
             event->_state  = hipEventStatusRecording;
@@ -164,13 +183,16 @@ hipError_t hipEventSynchronize(hipEvent_t event)
         } else if (event->_state == hipEventStatusCreated ) {
             // Created but not actually recorded on any device:
             return ihipLogStatus(hipSuccess);
-        } else if (event->_stream == NULL) {
+        } else if (HIP_SYNC_NULL_STREAM && (event->_stream->isDefaultStream() )) {
             auto *ctx = ihipGetTlsDefaultCtx();
-            ctx->locked_syncDefaultStream(true);
+            // TODO-HIP_SYNC_NULL_STREAM - can remove this code
+            ctx->locked_syncDefaultStream(true, true);
             return ihipLogStatus(hipSuccess);
         } else {
             event->_marker.wait((event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked : hc::hcWaitModeActive);
 
+            assert (event->_marker.is_ready());
+
             return ihipLogStatus(hipSuccess);
         }
     } else {
@@ -182,40 +204,50 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop)
 {
     HIP_INIT_API(ms, start, stop);
 
-    ihipEvent_t *start_eh = start;
-    ihipEvent_t *stop_eh = stop;
-
-    start->setTimestamp();
-    stop->setTimestamp();
-
     hipError_t status = hipSuccess;
+
     *ms = 0.0f;
 
-    if (start_eh && stop_eh) {
-        if ((start_eh->_state == hipEventStatusRecorded) && (stop_eh->_state == hipEventStatusRecorded)) {
-            // Common case, we have good information for both events.
+    if ((start == nullptr) ||
+        (start->_flags & hipEventDisableTiming) ||
+        (start->_state == hipEventStatusUnitialized) || (start->_state == hipEventStatusCreated) ||
+        (stop == nullptr) ||
+        (stop->_flags & hipEventDisableTiming) ||
+        ( stop->_state == hipEventStatusUnitialized) || ( stop->_state == hipEventStatusCreated)) {
 
-            int64_t tickDiff = (stop_eh->timestamp() - start_eh->timestamp());
+        // Both events must be at least recorded else return hipErrorInvalidResourceHandle
 
-            uint64_t freqHz;
-            hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz);
-            if (freqHz) {
-                *ms = ((double)(tickDiff) /  (double)(freqHz)) * 1000.0f;
-                status = hipSuccess;
-            } else {
-                * ms = 0.0f;
-                status = hipErrorInvalidValue;
-            }
+        status = hipErrorInvalidResourceHandle;
 
+    } else {
+        // Refresh status, if still recording...
+        start->refereshEventStatus();
+        stop->refereshEventStatus();
 
-        } else if ((start_eh->_state == hipEventStatusRecording) ||
-                   (stop_eh->_state  == hipEventStatusRecording)) {
-            status = hipErrorNotReady;
-        } else if ((start_eh->_state == hipEventStatusUnitialized) ||
-                   (stop_eh->_state  == hipEventStatusUnitialized)) {
-            status = hipErrorInvalidResourceHandle;
+        if ((start->_state == hipEventStatusComplete) && (stop->_state == hipEventStatusComplete)) {
+        // Common case, we have good information for both events.
+
+        int64_t tickDiff = (stop->timestamp() - start->timestamp());
+
+        uint64_t freqHz;
+        hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz);
+        if (freqHz) {
+            *ms = ((double)(tickDiff) /  (double)(freqHz)) * 1000.0f;
+            status = hipSuccess;
+        } else {
+            * ms = 0.0f;
+            status = hipErrorInvalidValue;
         }
-    }
+
+
+        } else if ((start->_state == hipEventStatusRecording) ||
+                   (stop->_state  == hipEventStatusRecording)) {
+
+            status = hipErrorNotReady;
+        } else {
+                assert(0);
+        }
+    } 
 
     return ihipLogStatus(status);
 }
diff --git a/projects/hip/src/hip_fp16.cpp b/projects/hip/src/hip_fp16.cpp
index c2b7b47597..8e8f003f56 100644
--- a/projects/hip/src/hip_fp16.cpp
+++ b/projects/hip/src/hip_fp16.cpp
@@ -90,11 +90,11 @@ __device__  bool __hgt(__half a, __half b) {
 }
 
 __device__  bool __hisinf(__half a) {
-  return a == __hInfValue.h ? true : false;
+  return a == HINF ? true : false;
 }
 
 __device__  bool __hisnan(__half a) {
-  return a > __hInfValue.h ? true : false;
+  return a > HINF ? true : false;
 }
 
 __device__  bool __hle(__half a, __half b) {
@@ -114,75 +114,75 @@ Half2 Comparision Functions
 */
 
 __device__  bool __hbeq2(__half2 a, __half2 b) {
-  return (a.p[0] == b.p[0] ? true : false) && (a.p[1] == b.p[1] ? true : false);
+  return (a.x == b.x ? true : false) && (a.y == b.y ? true : false);
 }
 
 __device__  bool __hbge2(__half2 a, __half2 b) {
-  return (a.p[0] >= b.p[0] ? true : false) && (a.p[1] >= b.p[1] ? true : false);
+  return (a.x >= b.x ? true : false) && (a.y >= b.y ? true : false);
 }
 
 __device__  bool __hbgt2(__half2 a, __half2 b) {
-  return (a.p[0] > b.p[0] ? true : false) && (a.p[1] > b.p[1] ? true : false);
+  return (a.x > b.x ? true : false) && (a.y > b.y ? true : false);
 }
 
 __device__  bool __hble2(__half2 a, __half2 b) {
-  return (a.p[0] <= b.p[0] ? true : false) && (a.p[1] <= b.p[1] ? true : false);
+  return (a.x <= b.x ? true : false) && (a.y <= b.y ? true : false);
 }
 
 __device__  bool __hblt2(__half2 a, __half2 b) {
-  return (a.p[0] < b.p[0] ? true : false) && (a.p[1] < b.p[1] ? true : false);
+  return (a.x < b.x ? true : false) && (a.y < b.y ? true : false);
 }
 
 __device__  bool __hbne2(__half2 a, __half2 b) {
-  return (a.p[0] != b.p[0] ? true : false) && (a.p[1] != b.p[1] ? true : false);
+  return (a.x != b.x ? true : false) && (a.y != b.y ? true : false);
 }
 
 __device__  __half2 __heq2(__half2 a, __half2 b) {
   __half2 c;
-  c.p[0] = (a.p[0] == b.p[0]) ? (__half)1 : (__half)0;
-  c.p[1] = (a.p[1] == b.p[1]) ? (__half)1 : (__half)0;
+  c.x = (a.x == b.x) ? (__half)1 : (__half)0;
+  c.y = (a.y == b.y) ? (__half)1 : (__half)0;
   return c;
 }
 
 __device__  __half2 __hge2(__half2 a, __half2 b) {
   __half2 c;
-  c.p[0] = (a.p[0] >= b.p[0]) ? (__half)1 : (__half)0;
-  c.p[1] = (a.p[1] >= b.p[1]) ? (__half)1 : (__half)0;
+  c.x = (a.x >= b.x) ? (__half)1 : (__half)0;
+  c.y = (a.y >= b.y) ? (__half)1 : (__half)0;
   return c;
 }
 
 __device__  __half2 __hgt2(__half2 a, __half2 b) {
   __half2 c;
-  c.p[0] = (a.p[0] > b.p[0]) ? (__half)1 : (__half)0;
-  c.p[1] = (a.p[1] > b.p[1]) ? (__half)1 : (__half)0;
+  c.x = (a.x > b.x) ? (__half)1 : (__half)0;
+  c.y = (a.y > b.y) ? (__half)1 : (__half)0;
   return c;
 }
 
 __device__  __half2 __hisnan2(__half2 a) {
   __half2 c;
-  c.p[0] = (a.p[0] > __hInfValue.h) ? (__half)1 : (__half)0;
-  c.p[1] = (a.p[1] > __hInfValue.h) ? (__half)1 : (__half)0;
+  c.x = (a.x > HINF) ? (__half)1 : (__half)0;
+  c.y = (a.y > HINF) ? (__half)1 : (__half)0;
   return c;
 }
 
 __device__  __half2 __hle2(__half2 a, __half2 b) {
   __half2 c;
-  c.p[0] = (a.p[0] <= b.p[0]) ? (__half)1 : (__half)0;
-  c.p[1] = (a.p[1] <= b.p[1]) ? (__half)1 : (__half)0;
+  c.x = (a.x <= b.x) ? (__half)1 : (__half)0;
+  c.y = (a.y <= b.y) ? (__half)1 : (__half)0;
   return c;
 }
 
 __device__  __half2 __hlt2(__half2 a, __half2 b) {
   __half2 c;
-  c.p[0] = (a.p[0] < b.p[0]) ? (__half)1 : (__half)0;
-  c.p[1] = (a.p[1] < b.p[1]) ? (__half)1 : (__half)0;
+  c.x = (a.x < b.x) ? (__half)1 : (__half)0;
+  c.y = (a.y < b.y) ? (__half)1 : (__half)0;
   return c;
 }
 
 __device__  __half2 __hne2(__half2 a, __half2 b) {
   __half2 c;
-  c.p[0] = (a.p[0] != b.p[0]) ? (__half)1 : (__half)0;
-  c.p[1] = (a.p[1] != b.p[1]) ? (__half)1 : (__half)0;
+  c.x = (a.x != b.x) ? (__half)1 : (__half)0;
+  c.y = (a.y != b.y) ? (__half)1 : (__half)0;
   return c;
 }
 
@@ -191,8 +191,8 @@ Conversion instructions
 */
 __device__  __half2 __float22half2_rn(const float2 a) {
   __half2 b;
-  b.p[0] = (__half)a.x;
-  b.p[1] = (__half)a.y;
+  b.x = (__half)a.x;
+  b.y = (__half)a.y;
   return b;
 }
 
@@ -202,8 +202,8 @@ __device__  __half __float2half(const float a) {
 
 __device__  __half2 __float2half2_rn(const float a) {
   __half2 b;
-  b.p[0] = (__half)a;
-  b.p[1] = (__half)a;
+  b.x = (__half)a;
+  b.y = (__half)a;
   return b;
 }
 
@@ -225,15 +225,15 @@ __device__  __half __float2half_rz(const float a) {
 
 __device__  __half2 __floats2half2_rn(const float a, const float b) {
   __half2 c;
-  c.p[0] = (__half)a;
-  c.p[1] = (__half)b;
+  c.x = (__half)a;
+  c.y = (__half)b;
   return c;
 }
 
 __device__  float2 __half22float2(const __half2 a) {
   float2 b;
-  b.x = (float)a.p[0];
-  b.y = (float)a.p[1];
+  b.x = (float)a.x;
+  b.y = (float)a.y;
   return b;
 }
 
@@ -243,8 +243,8 @@ __device__  float __half2float(const __half a) {
 
 __device__  __half2 half2half2(const __half a) {
   __half2 b;
-  b.p[0] = a;
-  b.p[1] = a;
+  b.x = a;
+  b.y = a;
   return b;
 }
 
@@ -358,30 +358,30 @@ __device__  unsigned short int __half_as_ushort(const __half h) {
 
 __device__  __half2 __halves2half2(const __half a, const __half b) {
   __half2 c;
-  c.p[0] = a;
-  c.p[1] = b;
+  c.x = a;
+  c.y = b;
   return c;
 }
 
 __device__  float __high2float(const __half2 a) {
-  return (float)a.p[1];
+  return (float)a.y;
 }
 
 __device__  __half __high2half(const __half2 a) {
-  return a.p[1];
+  return a.y;
 }
 
 __device__  __half2 __high2half2(const __half2 a) {
   __half2 b;
-  b.p[0] = a.p[1];
-  b.p[1] = a.p[1];
+  b.x = a.y;
+  b.y = a.y;
   return b;
 }
 
 __device__  __half2 __highs2half2(const __half2 a, const __half2 b) {
   __half2 c;
-  c.p[0] = a.p[1];
-  c.p[1] = b.p[1];
+  c.x = a.y;
+  c.y = b.y;
   return c;
 }
 
@@ -418,38 +418,38 @@ __device__  __half __ll2half_rz(long long int i){
 }
 
 __device__  float __low2float(const __half2 a) {
-  return (float)a.p[0];
+  return (float)a.x;
 }
 
 __device__  __half __low2half(const __half2 a) {
-  return a.p[0];
+  return a.x;
 }
 
 __device__  __half2 __low2half2(const __half2 a, const __half2 b) {
   __half2 c;
-  c.p[0] = a.p[0];
-  c.p[1] = b.p[0];
+  c.x = a.x;
+  c.y = b.x;
   return c;
 }
 
 __device__  __half2 __low2half2(const __half2 a) {
   __half2 b;
-  b.p[0] = a.p[0];
-  b.p[1] = a.p[0];
+  b.x = a.x;
+  b.y = a.x;
   return b;
 }
 
 __device__  __half2 __lowhigh2highlow(const __half2 a) {
   __half2 b;
-  b.p[0] = a.p[1];
-  b.p[1] = a.p[0];
+  b.x = a.y;
+  b.y = a.x;
   return b;
 }
 
 __device__  __half2 __lows2half2(const __half2 a, const __half2 b) {
   __half2 c;
-  c.p[0] = a.p[0];
-  c.p[1] = b.p[0];
+  c.y = a.x;
+  c.y = b.x;
   return c;
 }
 
@@ -542,346 +542,4 @@ typedef struct{
   };
 } struct_float;
 
-#if __clang_major__ == 3
 
-static __device__ float cvt_half_to_float(__half a){
-  struct_float ret = {0};
-  if(a.x == 0){
-    return 0.0f;
-  }
-  if(a.x == 0x8000){
-    return -0.0f;
-  }
-  ret.u = ((a.x&0x8000)<<16) | (((a.x&0x7c00)+0x1C000)<<13) | ((a.x&0x03FF)<<13);
-  return ret.f;
-}
-
-static __device__ __half cvt_float_to_half(float b){
-  struct_float f = {0};
-  __half ret = {0};
-  f.f = b;
-  if(f.f == 0.0f){
-    ret.x = 0;
-    return ret;
-  }
-  if(f.f == -0.0f){
-    ret.x = 0x8000;
-    return ret;
-  }
-  ret.x = ((f.u>>16)&0x8000)|((((f.u&0x7f800000)-0x38000000)>>13)&0x7c00)|((f.u>>13)&0x03ff);
-  return ret;
-}
-
-
-__device__ __half __soft_hadd(const __half a, const __half b){
-  return cvt_float_to_half(cvt_half_to_float(a)+cvt_half_to_float(b));
-}
-
-__device__ __half __soft_hadd_sat(const __half a, const __half b){
-  float f = cvt_half_to_float(a) + cvt_half_to_float(b);
-  return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f)));
-}
-
-__device__ __half __soft_hfma(const __half a, const __half b, const __half c){
-  return cvt_float_to_half(fmaf(cvt_half_to_float(a), cvt_half_to_float(b), cvt_half_to_float(c)));
-}
-
-__device__ __half __soft_hfma_sat(const __half a, const __half b, const __half c){
-  float f = fmaf(cvt_half_to_float(a), cvt_half_to_float(b), cvt_half_to_float(c));
-  return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f)));
-}
-
-__device__ __half __soft_hmul(const __half a, const __half b){
-  return cvt_float_to_half(cvt_half_to_float(a)*cvt_half_to_float(b));
-}
-
-__device__ __half __soft_hmul_sat(const __half a, const __half b){
-  float f = cvt_half_to_float(a) * cvt_half_to_float(b);
-  return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f)));
-}
-
-__device__ __half __soft_hneq(const __half a){
-  __half ret = {a.x};
-  ret.x ^= 1 << 15;
-  return ret;
-}
-
-__device__ __half __soft_hsub(const __half a, const __half b){
-  return cvt_float_to_half(cvt_half_to_float(a)-cvt_half_to_float(b));
-}
-
-__device__ __half __soft_hsub_sat(const __half a, const __half b){
-  float f = cvt_half_to_float(a) - cvt_half_to_float(b);
-  return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f)));
-}
-
-
-/*
-Half2 Arithmetic Instructions
-*/
-
-__device__ __half2 __soft_hadd2(const __half2 a, const __half2 b){
-  __half2 ret;
-  ret.p[1] = __soft_hadd(a.p[1], b.p[1]);
-  ret.p[0] = __soft_hadd(a.p[0], b.p[0]);
-  return ret;
-}
-
-__device__ __half2 __soft_hadd2_sat(const __half2 a, const __half2 b){
-  __half2 ret;
-  ret.p[1] = __soft_hadd_sat(a.p[1], b.p[1]);
-  ret.p[0] = __soft_hadd_sat(a.p[0], b.p[0]);
-  return ret;
-}
-
-__device__ __half2 __soft_hfma2(const __half2 a, const __half2 b, const __half2 c){
-  __half2 ret;
-  ret.p[1] = __soft_hfma(a.p[1], b.p[1], c.p[1]);
-  ret.p[0] = __soft_hfma(a.p[0], b.p[0], c.p[0]);
-  return ret;
-}
-
-__device__ __half2 __soft_hfma2_sat(const __half2 a, const __half2 b, const __half2 c){
-  __half2 ret;
-  ret.p[1] = __soft_hfma_sat(a.p[1], b.p[1], c.p[1]);
-  ret.p[0] = __soft_hfma_sat(a.p[0], b.p[0], c.p[0]);
-  return ret;
-}
-
-__device__ __half2 __soft_hmul2(const __half2 a, const __half2 b){
-  __half2 ret;
-  ret.p[1] = __soft_hmul(a.p[1], b.p[1]);
-  ret.p[0] = __soft_hmul(a.p[0], b.p[0]);
-  return ret;
-}
-
-__device__ __half2 __soft_hmul2_sat(const __half2 a, const __half2 b){
-  __half2 ret;
-  ret.p[1] = __soft_hmul_sat(a.p[1], b.p[1]);
-  ret.p[0] = __soft_hmul_sat(a.p[0], b.p[0]);
-  return ret;
-}
-
-__device__ __half2 __soft_hneq2(const __half2 a){
-  __half2 ret;
-  ret.p[1] = __soft_hneq(a.p[1]);
-  ret.p[0] = __soft_hneq(a.p[0]);
-  return ret;
-}
-
-__device__ __half2 __soft_hsub2(const __half2 a, const __half2 b){
-  __half2 ret;
-  ret.p[1] = __soft_hsub(a.p[1], b.p[1]);
-  ret.p[0] = __soft_hsub(a.p[0], b.p[0]);
-  return ret;
-}
-
-__device__ __half2 __soft_hsub2_sat(const __half2 a, const __half2 b){
-  __half2 ret;
-  ret.p[1] = __soft_hsub_sat(a.p[1], b.p[1]);
-  ret.p[0] = __soft_hsub_sat(a.p[0], b.p[0]);
-  return ret;
-}
-
-/*
-Half Cmps
-*/
-
-__device__  bool __soft_heq(const __half a, const __half b){
-  return (a.x == b.x ? true:false);
-}
-
-__device__ bool __soft_hge(const __half a, const __half b){
-  return (cvt_half_to_float(a) >= cvt_half_to_float(b));
-}
-
-__device__ bool __soft_hgt(const __half a, const __half b){
-  return (cvt_half_to_float(a) > cvt_half_to_float(b));
-}
-
-__device__ bool __soft_hisinf(const __half a){
-  return ((a.x == __half_neg_inf) ? -1 : (a.x == __half_pos_inf) ? 1 : 0);
-}
-
-__device__ bool __soft_hisnan(const __half a){
-  if(((a.x & __half_pos_inf) == a.x) || ((a.x & __half_neg_inf) == a.x)){
-    return true;
-  }else{
-    return false;
-  }
-}
-
-__device__ bool __soft_hle(const __half a, const __half b){
-  return (cvt_half_to_float(a) <= cvt_half_to_float(b));
-}
-
-__device__ bool __soft_hlt(const __half a, const __half b){
-  return (cvt_half_to_float(a) < cvt_half_to_float(b));
-}
-
-__device__ bool __soft_hne(const __half a, const __half b){
-  return a.x == b.x ? false : true;
-}
-
-/*
-Half2 Cmps
-*/
-
-__device__ bool __soft_hbeq2(const __half2 a, const __half2 b){
-  return __soft_heq(a.p[1], b.p[1]) && __soft_heq(a.p[0], b.p[0]);
-}
-
-__device__ bool __soft_hbge2(const __half2 a, const __half2 b){
-  return __soft_hge(a.p[1], b.p[1]) && __soft_hge(a.p[0], b.p[0]);
-}
-
-__device__ bool __soft_hbgt2(const __half2 a, const __half2 b){
-  return __soft_hgt(a.p[1], b.p[1]) && __soft_hgt(a.p[0], b.p[0]);
-}
-
-__device__ bool __soft_hble2(const __half2 a, const __half2 b){
-  return __soft_hle(a.p[1], b.p[1]) && __soft_hle(a.p[0], b.p[0]);
-}
-
-__device__ bool __soft_hblt2(const __half2 a, const __half2 b){
-  return __soft_hlt(a.p[1], b.p[1]) && __soft_hlt(a.p[0], b.p[0]);
-}
-
-__device__ bool __soft_hbne2(const __half2 a, const __half2 b){
-  return __soft_hne(a.p[1], b.p[1]) && __soft_hne(a.p[0], b.p[0]);
-}
-
-
-
-__device__ __half2 __soft_heq2(const __half2 a, const __half2 b){
-  __half2 ret = {0};
-  ret.p[1] = (__soft_heq(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
-  ret.p[0] = (__soft_heq(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
-  return ret;
-}
-
-__device__ __half2 __soft_hge2(const __half2 a, const __half2 b){
-  __half2 ret = {0};
-  ret.p[1] = (__soft_hge(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
-  ret.p[0] = (__soft_hge(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
-  return ret;
-}
-
-__device__ __half2 __soft_hgt2(const __half2 a, const __half2 b){
-  __half2 ret = {0};
-  ret.p[1] = (__soft_hgt(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
-  ret.p[0] = (__soft_hgt(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
-  return ret;
-}
-
-__device__ __half2 __soft_hisnan2(const __half2 a){
-  __half2 ret = {0};
-  ret.p[1] = __soft_hisnan(a.p[1]) ? __half_value_one_float : __half_value_zero_float;
-  ret.p[0] = __soft_hisnan(a.p[0]) ? __half_value_one_float : __half_value_zero_float;
-  return ret;
-}
-
-__device__ __half2 __soft_hle2(const __half2 a, const __half2 b){
-  __half2 ret = {0};
-  ret.p[1] = (__soft_hle(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
-  ret.p[0] = (__soft_hle(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
-  return ret;
-}
-
-__device__ __half2 __soft_hlt2(const __half2 a, const __half2 b){
-  __half2 ret = {0};
-  ret.p[1] = (__soft_hlt(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
-  ret.p[0] = (__soft_hlt(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
-  return ret;
-}
-
-__device__ __half2 __soft_hne2(const __half2 a, const __half2 b){
-  __half2 ret = {0};
-  ret.p[1] = (__soft_hne(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
-  ret.p[0] = (__soft_hne(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
-  return ret;
-}
-
-/*
-Half Cnvs and Data Mvmnt
-*/
-
-__device__ __half2 __soft_float22half2_rn(const float2 a){
-  __half2 ret = {0};
-  ret.p[1] = cvt_float_to_half(a.x);
-  ret.p[0] = cvt_float_to_half(a.y);
-  return ret;
-}
-
-__device__ __half __soft_float2half(const float a){
-  return cvt_float_to_half(a);
-}
-
-__device__ __half2 __soft_float2half2_rn(const float a){
-  __half ret = cvt_float_to_half(a);
-  return {ret, ret};
-}
-
-__device__ __half2 __soft_floats2half2_rn(const float a, const float b){
-  return {cvt_float_to_half(a), cvt_float_to_half(b)};
-}
-
-__device__ float2 __soft_half22float2(const __half2 a){
-  return {cvt_half_to_float(a.p[1]), cvt_half_to_float(a.p[0])};
-}
-
-__device__ float __soft_half2float(const __half a){
-  return cvt_half_to_float(a);
-}
-
-__device__ __half2 __soft_half2half2(const __half a){
-  return {a,a};
-}
-
-__device__ __half2 __soft_halves2half2(const __half a, const __half b){
-  return {a,b};
-}
-
-__device__ float __soft_high2float(const __half2 a){
-  return cvt_half_to_float(a.p[1]);
-}
-
-__device__ __half __soft_high2half(const __half2 a){
-  return a.p[1];
-}
-
-__device__ __half2 __soft_high2half2(const __half2 a){
-  return {a.p[1], a.p[1]};
-}
-
-__device__ __half2 __soft_highs2half2(const __half2 a, const __half2 b){
-  return {a.p[1], b.p[1]};
-}
-
-__device__ float __soft_low2float(const __half2 a){
-  return cvt_half_to_float(a.p[0]);
-}
-
-__device__ __half __soft_low2half(const __half2 a){
-  return a.p[0];
-}
-
-__device__ __half2 __soft_low2half2(const __half2 a){
-  return {a.p[0], a.p[0]};
-}
-
-__device__ __half2 __soft_lows2half2(const __half2 a, const __half2 b){
-  return {a.p[0], b.p[0]};
-}
-
-__device__ __half2 __soft_lowhigh2highlow(const __half2 a){
-  return {a.p[0], a.p[1]};
-}
-
-__device__ __half2 __soft_low2half2(const __half2 a, const __half2 b){
-  return {a.p[0], b.p[0]};
-}
-
-
-
-#endif
diff --git a/projects/hip/src/hip_hc_gfx803.ll b/projects/hip/src/hip_hc_gfx803.ll
index 0080fc7d81..7e3d0e37dd 100644
--- a/projects/hip/src/hip_hc_gfx803.ll
+++ b/projects/hip/src/hip_hc_gfx803.ll
@@ -2,89 +2,122 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:
 target triple = "amdgcn--amdhsa"
 
 
-define i32 @__hip_hc_ir_hadd2_int(i32 %a, i32 %b) #1 {
-  %1 = tail call i32 asm sideeffect "v_add_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
-  tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_hadd2_int(<2 x half> %a, <2 x half> %b) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = bitcast <2 x half> %b to i32
+  %3 = tail call i32 asm sideeffect "v_add_f16 $0, $1, $2","=v,v,v"(i32 %1, i32 %2)
+  tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %3, i32 %1, i32 %2)
+  %4 = bitcast i32 %3 to <2 x half>
+  ret <2 x half> %4
 }
 
-define i32 @__hip_hc_ir_hfma2_int(i32 %a, i32 %b, i32 %c) #1 {
-  %1 = tail call i32 asm sideeffect "v_mad_f16 $0, $1, $2, $3","=v,v,v,v"(i32 %a, i32 %b, i32 %c)
-  tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b)
-  tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %1, i32 %c)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_hfma2_int(<2 x half> %a, <2 x half> %b, <2 x half> %c) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = bitcast <2 x half> %b to i32
+  %3 = bitcast <2 x half> %c to i32
+  %4 = tail call i32 asm sideeffect "v_mad_f16 $0, $1, $2, $3","=v,v,v,v"(i32 %1, i32 %2, i32 %3)
+  tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %4, i32 %1, i32 %2)
+  tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %4, i32 %4, i32 %3)
+  %5 = bitcast i32 %4 to <2 x half>
+  ret <2 x half> %5
 }
 
-define i32 @__hip_hc_ir_hmul2_int(i32 %a, i32 %b) #1 {
-  %1 = tail call i32 asm sideeffect "v_mul_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
-  tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_hmul2_int(<2 x half> %a, <2 x half> %b) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = bitcast <2 x half> %b to i32
+  %3 = tail call i32 asm sideeffect "v_mul_f16 $0, $1, $2","=v,v,v"(i32 %1, i32 %2)
+  tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %3, i32 %1, i32 %2)
+  %4 = bitcast i32 %3 to <2 x half>
+  ret <2 x half> %4
 }
 
-define i32 @__hip_hc_ir_hsub2_int(i32 %a, i32 %b) #1 {
-  %1 = tail call i32 asm sideeffect "v_sub_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
-  tail call void asm sideeffect "v_sub_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_hsub2_int(<2 x half> %a, <2 x half> %b) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = bitcast <2 x half> %b to i32
+  %3 = tail call i32 asm sideeffect "v_sub_f16 $0, $1, $2","=v,v,v"(i32 %1, i32 %2)
+  tail call void asm sideeffect "v_sub_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %3, i32 %1, i32 %2)
+  %4 = bitcast i32 %3 to <2 x half>
+  ret <2 x half> %4
 }
 
-define i32 @__hip_hc_ir_h2ceil_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_ceil_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_ceil_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2ceil_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_ceil_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_ceil_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }
 
-define i32 @__hip_hc_ir_h2cos_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_cos_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_cos_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2cos_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_cos_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_cos_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }
 
-define i32 @__hip_hc_ir_h2exp2_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_exp_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_exp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2exp2_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_exp_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_exp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }
 
-define i32 @__hip_hc_ir_h2floor_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_floor_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_floor_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2floor_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_floor_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_floor_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }
 
-define i32 @__hip_hc_ir_h2log2_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_log_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_log_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2log2_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_log_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_log_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }
 
-define i32 @__hip_hc_ir_h2rcp_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_rcp_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_rcp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2rcp_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_rcp_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_rcp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }
 
-define i32 @__hip_hc_ir_h2rsqrt_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_rsq_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_rsq_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2rsqrt_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_rsq_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_rsq_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }
 
-define i32 @__hip_hc_ir_h2sin_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_sin_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_sin_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2sin_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_sin_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_sin_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }
 
-define i32 @__hip_hc_ir_h2sqrt_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_sqrt_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_sqrt_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2sqrt_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_sqrt_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_sqrt_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }
 
-define i32 @__hip_hc_ir_h2trunc_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_trunc_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_trunc_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2trunc_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_trunc_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_trunc_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }
 
 attributes #1 = { alwaysinline nounwind }
diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp
index 71d947488d..08a2cdbfcf 100644
--- a/projects/hip/src/hip_hcc.cpp
+++ b/projects/hip/src/hip_hcc.cpp
@@ -48,13 +48,8 @@ THE SOFTWARE.
 #include "env.h"
 
 
-#ifndef USE_COPY_EXT_V2
-#define USE_COPY_EXT_V2 1
-#endif
-
-#ifndef USE_ROCR_1_4
-#define USE_ROCR_1_4 1
-#endif
+// needs HCC change for hc::no_scope
+#define USE_NO_SCOPE 1
 
 //=================================================================================================
 //Global variables:
@@ -69,7 +64,6 @@ std::string HIP_LAUNCH_BLOCKING_KERNELS;
 std::vector<std::string> g_hipLaunchBlockingKernels;
 int HIP_API_BLOCKING = 0;
 
-int HIP_MAX_QUEUES = 0;
 
 int HIP_PRINT_ENV = 0;
 int HIP_TRACE_API= 0;
@@ -80,8 +74,7 @@ int HIP_PROFILE_API= 0;
 std::string HIP_DB_START_API;
 std::string HIP_DB_STOP_API;
 int HIP_DB= 0;
-int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU identifiers */
-int HIP_NUM_KERNELS_INFLIGHT = 128;
+int HIP_VISIBLE_DEVICES = 0; 
 int HIP_WAIT_MODE = 0;
 
 int HIP_FORCE_P2P_HOST = 0;
@@ -91,12 +84,20 @@ int HIP_DENY_PEER_ACCESS = 0;
 // Force async copies to actually use the synchronous copy interface.
 int HIP_FORCE_SYNC_COPY = 0;
 
+// TODO - set these to 0 and 1
+int HIP_EVENT_SYS_RELEASE=1;
 int HIP_COHERENT_HOST_ALLOC = 0;
 
 // TODO - set to 0 once we resolve stability.
 // USE_ HIP_SYNC_HOST_ALLOC
 int HIP_SYNC_HOST_ALLOC = 1;
 
+// Chicken bit to sync on host to implement null stream.
+// If 0, null stream synchronization is performed on the GPU
+int HIP_SYNC_NULL_STREAM = 0;
+
+// HIP needs to change some behavior based on HCC_OPT_FLUSH :
+// TODO - set this to 1
 int HCC_OPT_FLUSH = 0;
 
 
@@ -104,9 +105,6 @@ int HCC_OPT_FLUSH = 0;
 
 
 
-#define HIP_USE_PRODUCT_NAME 1
-//#define DISABLE_COPY_EXT 1
-
 
 std::once_flag hip_initialized;
 
@@ -118,6 +116,7 @@ bool g_visible_device = false;
 unsigned g_deviceCnt;
 std::vector<int> g_hip_visible_devices;
 hsa_agent_t g_cpu_agent;
+hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents.
 unsigned g_numLogicalThreads;
 
 std::atomic<int> g_lastShortTid(1);
@@ -272,64 +271,40 @@ ihipStream_t::~ihipStream_t()
 }
 
 
-inline void ihipStream_t::ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCrit)
+hc::hcWaitMode ihipStream_t::waitMode() const
 {
-    if (HIP_MAX_QUEUES && !streamCrit->_hasQueue)  {
+    hc::hcWaitMode waitMode = hc::hcWaitModeActive;
 
-        // To avoid deadlock, we have to release the stream lock before acquiring context lock.
-        // Else we can get hung if another thread has the context lock is trying to get lock for this stream.
-        // We lock it again below.
-        streamCrit->munlock();
-
-        // Obtain mutex access to the device critical data, release by destructor
-        LockedAccessor_CtxCrit_t  ctxCrit(this->_ctx->criticalData());
-        // TODO
-        auto needyCritPtr = this->_criticalData.mlock();
-
-        // Second test to ensure we still need to steal the queue - another thread may have
-        // snuck in here and already solved the issue.
-        if (!needyCritPtr->_hasQueue) {
-            needyCritPtr->_av = this->_ctx->stealActiveQueue(ctxCrit, this);
+    if (_scheduleMode == Auto) {
+        if (g_deviceCnt > g_numLogicalThreads) {
+            waitMode = hc::hcWaitModeActive;
+        } else {
+            waitMode = hc::hcWaitModeBlocked;
         }
-
-        streamCrit->_hasQueue = true;
+    } else if (_scheduleMode == Spin) {
+        waitMode = hc::hcWaitModeActive;
+    } else if (_scheduleMode == Yield) {
+        waitMode = hc::hcWaitModeBlocked;
+    } else {
+        assert(0); // bad wait mode.
     }
-    assert(streamCrit->_hasQueue);
-}
 
+    if (HIP_WAIT_MODE == 1) {
+        waitMode = hc::hcWaitModeBlocked;
+    } else if (HIP_WAIT_MODE == 2) {
+        waitMode = hc::hcWaitModeActive;
+    }
+
+    return waitMode;
+}
 
 //Wait for all kernel and data copy commands in this stream to complete.
 //This signature should be used in routines that already have locked the stream mutex
 void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit)
 {
-    if (crit->_hasQueue) {
-        tprintf (DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str());
-        hc::hcWaitMode waitMode = hc::hcWaitModeActive;
+    tprintf (DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str());
 
-        if (_scheduleMode == Auto) {
-            if (g_deviceCnt > g_numLogicalThreads) {
-                waitMode = hc::hcWaitModeActive;
-            } else {
-                waitMode = hc::hcWaitModeBlocked;
-            }
-        } else if (_scheduleMode == Spin) {
-            waitMode = hc::hcWaitModeActive;
-        } else if (_scheduleMode == Yield) {
-            waitMode = hc::hcWaitModeBlocked;
-        } else {
-            assert(0); // bad wait mode.
-        }
-
-        if (HIP_WAIT_MODE == 1) {
-            waitMode = hc::hcWaitModeBlocked;
-        } else if (HIP_WAIT_MODE == 2) {
-            waitMode = hc::hcWaitModeActive;
-        }
-
-        crit->_av.wait(waitMode);
-    } else {
-        tprintf (DB_SYNC, "%s wait for queue empty (done since stream has no physical queue).\n", ToString(this).c_str());
-    }
+    crit->_av.wait(waitMode());
 
     crit->_kernelCnt = 0;
 }
@@ -345,14 +320,13 @@ void ihipStream_t::locked_wait()
 };
 
 // Causes current stream to wait for specified event to complete:
-// Note this does not require any kind of host serialization.
+// Note this does not provide any kind of host serialization.
 void ihipStream_t::locked_waitEvent(hipEvent_t event)
 {
     LockedAccessor_StreamCrit_t crit(_criticalData);
 
-    this->ensureHaveQueue(crit);
 
-    crit->_av.create_blocking_marker(event->_marker);
+    crit->_av.create_blocking_marker(event->_marker, hc::accelerator_scope);
 }
 
 // Create a marker in this stream.
@@ -362,9 +336,19 @@ void ihipStream_t::locked_recordEvent(hipEvent_t event)
     // Lock the stream to prevent simultaneous access
     LockedAccessor_StreamCrit_t crit(_criticalData);
 
-    this->ensureHaveQueue(crit);
-    event->_marker = crit->_av.create_marker();
-}
+    auto scopeFlag = hc::accelerator_scope;
+    // The env var HIP_EVENT_SYS_RELEASE sets the default,
+    // The explicit flags override the env var (if specified)
+    if (event->_flags & hipEventReleaseToSystem) {
+        scopeFlag = hc::system_scope;
+    } else if (event->_flags & hipEventReleaseToDevice) {
+        scopeFlag = hc::accelerator_scope;
+    } else {
+        scopeFlag = HIP_EVENT_SYS_RELEASE ? hc::system_scope : hc::accelerator_scope;
+    }
+
+    event->_marker = crit->_av.create_marker(scopeFlag);
+};
 
 //=============================================================================
 
@@ -396,14 +380,6 @@ LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand()
 
     LockedAccessor_StreamCrit_t crit(_criticalData, false/*no unlock at destruction*/);
 
-    if(crit->_kernelCnt > HIP_NUM_KERNELS_INFLIGHT){
-       this->wait(crit);
-       crit->_kernelCnt = 0;
-    }
-
-    this->ensureHaveQueue(crit);
-
-
 
     return crit;
 }
@@ -775,6 +751,9 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop)
     if(strcmp(archName,"gfx803")==0){
       prop->gcnArch = 803;
     }
+    if(strcmp(archName,"gfx900")==0){
+      prop->gcnArch = 900;
+    }
 
     DeviceErrorCheck(err);
 
@@ -848,11 +827,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop)
 
     // Get Max Threads Per Multiprocessor
     uint32_t max_waves_per_cu;
-#if USE_ROCR_1_4
     err = hsa_agent_get_info(_hsaAgent,(hsa_agent_info_t) HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, &max_waves_per_cu);
-#else
-    max_waves_per_cu = 10;
-#endif
     DeviceErrorCheck(err);
     prop-> maxThreadsPerMultiProcessor = prop->warpSize*max_waves_per_cu;
 
@@ -997,55 +972,6 @@ std::string ihipCtx_t::toString() const
 };
 
 
-hc::accelerator_view
-ihipCtx_t::stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream)
-{
-
-    // TODO - review handling if queue can't be found.
-    while (1) {
-
-        for (auto iter=ctxCrit->streams().begin(); iter != ctxCrit->streams().end(); iter++) {
-            if (*iter != needyStream) {
-                auto victimCritPtr = (*iter)->_criticalData.mtry_lock();
-                if (victimCritPtr)   {
-                    // try-lock succeeded:
-                    if (victimCritPtr->_hasQueue && (victimCritPtr->_kernelCnt == 0)) {
-
-                        victimCritPtr->_hasQueue = false;
-
-                        tprintf(DB_SYNC, " stealActiveQueue from victim:%s to needy:%s\n",
-                                ToString(*iter).c_str(), ToString(needyStream).c_str());
-
-                        hc::accelerator_view av = victimCritPtr->_av;
-
-                        // TODO - cleanup to remove forced setting to N
-                        uint64_t *p = (uint64_t*)(&victimCritPtr->_av);
-                        *p = 0; // damage the victim av so attempt to use it will fault.
-
-                        (*iter)->_criticalData.munlock();
-                        return av;
-                    }
-                    (*iter)->_criticalData.munlock();
-                }
-            }
-        }
-    }
-}
-
-
-hc::accelerator_view
-ihipCtx_t::createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit)
-{
-    if (HIP_MAX_QUEUES && (ctxCrit->streams().size() >= HIP_MAX_QUEUES)) {
-        // Steal a queue from an existing stream:
-        hc::accelerator_view av = this->stealActiveQueue (ctxCrit, nullptr);
-        return av;
-    } else {
-        // Create a new view
-        return getWriteableDevice()->_acc.create_view();
-    }
-}
-
 //----
 
 
@@ -1062,29 +988,68 @@ ihipCtx_t::createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit)
 
 
 
-// Implement "default" stream syncronization
-//   This waits for all other streams to drain before continuing.
+//   This called for submissions that are sent to the null/default stream.  This routine ensures
+//   that this new command waits for activity in the other streams to complete before proceeding.
+//
+//   HIP_SYNC_NULL_STREAM=0 does all dependency resolutiokn on the GPU
+//   HIP_SYNC_NULL_STREAM=1 s legacy non-optimal mode which conservatively waits on host.
+//
 //   If waitOnSelf is set, this additionally waits for the default stream to empty.
-void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf)
+//   In new HIP_SYNC_NULL_STREAM=0 mode, this enqueues a marker which causes the default stream to wait for other
+//   activity, but doesn't actually block the host.  If host blocking is desired, the caller should set syncHost.
+//
+//   syncToHost causes host to wait for the stream to finish.
+//   Note HIP_SYNC_NULL_STREAM=1 path always sync to Host. 
+void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf, bool syncHost)
 {
     LockedAccessor_CtxCrit_t  crit(_criticalData);
 
-    tprintf(DB_SYNC, "syncDefaultStream\n");
+    tprintf(DB_SYNC, "syncDefaultStream \n");
+
+    // Vector of ops sent to each stream that will complete before ops sent to null stream:
+    std::vector<hc::completion_future> depOps;
 
     for (auto streamI=crit->const_streams().begin(); streamI!=crit->const_streams().end(); streamI++) {
         ihipStream_t *stream = *streamI;
 
         // Don't wait for streams that have "opted-out" of syncing with NULL stream.
-        // And - don't wait for the NULL stream
-        if (!(stream->_flags & hipStreamNonBlocking)) {
+        // And - don't wait for the NULL stream, unless waitOnSelf specified.
+        bool waitThisStream = (!(stream->_flags & hipStreamNonBlocking)) && 
+                              (waitOnSelf || (stream != _defaultStream));
 
-            if (waitOnSelf || (stream != _defaultStream)) {
-                // TODO-hcc - use blocking or active wait here?
-                // TODO-sync - cudaDeviceBlockingSync
+        if (HIP_SYNC_NULL_STREAM) {
+
+            if (waitThisStream) {
                 stream->locked_wait();
             }
+        } else {
+            if (waitThisStream) {
+                LockedAccessor_StreamCrit_t streamCrit(stream->_criticalData);
+
+                // The last marker will provide appropriate visibility:
+                if (!streamCrit->_av.get_is_empty()) {
+                    depOps.push_back(streamCrit->_av.create_marker(hc::accelerator_scope));
+                    tprintf(DB_SYNC, "  push marker to wait for stream=%s\n", ToString(stream).c_str());
+                } else {
+                    tprintf(DB_SYNC, "  skipped stream=%s since it is empty\n", ToString(stream).c_str());
+                }
+            }
         }
     }
+
+    
+    // Enqueue a barrier to wait on all the barriers we sent above:
+    if (!HIP_SYNC_NULL_STREAM && !depOps.empty()) {
+        LockedAccessor_StreamCrit_t defaultStreamCrit(_defaultStream->_criticalData);
+        tprintf(DB_SYNC, "  null-stream wait on %zu non-empty streams. sync_host=%d\n", depOps.size(), syncHost);
+        hc::completion_future defaultCf = defaultStreamCrit->_av.create_blocking_marker(depOps.begin(), depOps.end(), hc::accelerator_scope);
+        if (syncHost) {
+            defaultCf.wait(); // TODO - account for active or blocking here.
+        }
+    }
+
+    tprintf(DB_SYNC, "  syncDefaultStream depOps=%zu\n", depOps.size());
+
 }
 
 
@@ -1244,7 +1209,6 @@ void HipReadEnv()
     READ_ENV_I(release, HIP_API_BLOCKING, 0, "Make HIP APIs 'host-synchronous', so they block until completed.  Impacts hipMemcpyAsync, hipMemsetAsync." );
 
 
-    READ_ENV_I(release, HIP_MAX_QUEUES, 0, "Maximum number of queues that this app will use per-device.  Additional streams will share the specified number of queues.  0=no limit.");
 
     READ_ENV_C(release, HIP_DB, 0,  "Print debug info.  Bitmask (HIP_DB=0xff) or flags separated by '+' (HIP_DB=api+sync+mem+copy)", HIP_DB_callback);
     if ((HIP_DB & (1<<DB_API))  && (HIP_TRACE_API == 0)) {
@@ -1271,14 +1235,14 @@ void HipReadEnv()
     READ_ENV_I(release, HIP_FAIL_SOC, 0, "Fault on Sub-Optimal-Copy, rather than use a slower but functional implementation.  Bit 0x1=Fail on async copy with unpinned memory.  Bit 0x2=Fail peer copy rather than use staging buffer copy");
 
     READ_ENV_I(release, HIP_SYNC_HOST_ALLOC, 0, "Sync before and after all host memory allocations.  May help stability");
+    READ_ENV_I(release, HIP_SYNC_NULL_STREAM, 0, "Synchronize on host for null stream submissions");
 
-    // TODO - review, can we remove this?
-    READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Max number of inflight kernels per stream before active synchronization is forced.");
 
     READ_ENV_I(release, HIP_COHERENT_HOST_ALLOC, 0, "If set, all host memory will be allocated as fine-grained system memory.  This allows threadfence_system to work but prevents host memory from being cached on GPU which may have performance impact.");
 
 
-    READ_ENV_I(release, HCC_OPT_FLUSH, 0, "Note this flag also impact HCC.  When set, use agent-scope flush rather than system-scope flush when possible.");
+    READ_ENV_I(release, HCC_OPT_FLUSH, 0, "When set, use agent-scope fence operations rather than system-scope fence operationsflush when possible. This flag controls both HIP and HCC behavior.");
+    READ_ENV_I(release, HIP_EVENT_SYS_RELEASE, 0, "If set, event are created with hipEventReleaseToSystem by default.  If 0, events are created with hipEventReleaseToDevice by default.  The defaults can be overridden by specifying hipEventReleaseToSystem or hipEventReleaseToDevice flag when creating the event.");
 
     // Some flags have both compile-time and runtime flags - generate a warning if user enables the runtime flag but the compile-time flag is disabled.
     if (HIP_DB && !COMPILE_HIP_DB) {
@@ -1389,6 +1353,14 @@ void ihipInit()
             g_deviceCnt++;
         }
     }
+
+    g_allAgents = static_cast<hsa_agent_t*> (malloc((g_deviceCnt+1) * sizeof(hsa_agent_t)));
+    g_allAgents[0] = g_cpu_agent;
+    for (int i=0; i<g_deviceCnt; i++) {
+        g_allAgents[i+1] = g_deviceArray[i]->_hsaAgent;
+    }
+
+
     g_numLogicalThreads = std::thread::hardware_concurrency();
 
     // If HIP_VISIBLE_DEVICES is not set, make sure all devices are initialized
@@ -1411,17 +1383,49 @@ void ihipInit()
 hipStream_t ihipSyncAndResolveStream(hipStream_t stream)
 {
     if (stream == hipStreamNull ) {
-        ihipCtx_t *device = ihipGetTlsDefaultCtx();
+        // Submitting to NULL stream, call locked_syncDefaultStream to wait for all other streams:
+        ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
+        tprintf(DB_SYNC, "ihipSyncAndResolveStream %s wait on default stream\n", ToString(stream).c_str());
 
 #ifndef HIP_API_PER_THREAD_DEFAULT_STREAM
-        device->locked_syncDefaultStream(false);
+        ctx->locked_syncDefaultStream(false, false);
 #endif
-        return device->_defaultStream;
+        return ctx->_defaultStream;
     } else {
-        // ALl streams have to wait for legacy default stream to be empty:
+        // Submitting to a "normal" stream, just wait for null stream:
         if (!(stream->_flags & hipStreamNonBlocking))  {
-            tprintf(DB_SYNC, "%s wait default stream\n", ToString(stream).c_str());
-            stream->getCtx()->_defaultStream->locked_wait();
+            if (HIP_SYNC_NULL_STREAM) {
+                tprintf(DB_SYNC, "ihipSyncAndResolveStream %s host-wait on default stream\n", ToString(stream).c_str());
+                stream->getCtx()->_defaultStream->locked_wait();
+            } else {
+                ihipStream_t *defaultStream = stream->getCtx()->_defaultStream;
+
+
+                bool needGatherMarker = false; // used to gather together other markers.
+                hc::completion_future dcf;
+                {
+                    LockedAccessor_StreamCrit_t defaultStreamCrit(defaultStream->criticalData());
+                    // TODO - could call create_blocking_marker(queue) or uses existing marker.
+                    if (!defaultStreamCrit->_av.get_is_empty()) {
+                        needGatherMarker = true;
+
+                        tprintf(DB_SYNC, "  %s adding marker to default %s for dependency\n", 
+                                ToString(stream).c_str(), ToString(defaultStream).c_str());
+                        dcf = defaultStreamCrit->_av.create_marker(hc::accelerator_scope);
+                    } else {
+                        tprintf(DB_SYNC, "  %s skipping marker since default stream is empty\n", ToString(stream).c_str());
+                    }
+                }
+
+                if (needGatherMarker) {
+                    // ensure any commands sent to this stream wait on the NULL stream before continuing
+                    LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData());
+                    // TODO - could be "noret" version of create_blocking_marker
+                    thisStreamCrit->_av.create_blocking_marker(dcf, hc::accelerator_scope);
+                    tprintf(DB_SYNC, "  %s adding marker to wait for freshly recorded default-stream marker \n", 
+                            ToString(stream).c_str());
+                }
+            }
         }
 
         return stream;
@@ -1431,7 +1435,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream)
 void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream)
 {
 
-    if ((HIP_TRACE_API & (1<<TRACE_CMD)) || HIP_PROFILE_API || (COMPILE_HIP_DB && HIP_TRACE_API)) {
+    if ((HIP_TRACE_API & (1<<TRACE_KCMD)) || HIP_PROFILE_API || (COMPILE_HIP_DB & HIP_TRACE_API)) {
         std::stringstream os_pre;
         std::stringstream os;
         os_pre  << "<<hip-api tid:";
@@ -1900,13 +1904,8 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes,
         printPointerInfo(DB_COPY, "  dst", dst, dstPtrInfo);
         printPointerInfo(DB_COPY, "  src", src, srcPtrInfo);
 
-        this->ensureHaveQueue(crit);
 
-#if USE_COPY_EXT_V2
         crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? &copyDevice->getDevice()->_acc : nullptr, forceUnpinnedCopy);
-#else
-        crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceUnpinnedCopy);
-#endif
     }
 }
 
@@ -2011,21 +2010,12 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes
 
             // Perform fast asynchronous copy - we know copyDevice != NULL based on check above
             try {
-                this->ensureHaveQueue(crit);
 
                 if (HIP_FORCE_SYNC_COPY) {
-#if USE_COPY_EXT_V2
                     crit->_av.copy_ext      (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, &copyDevice->getDevice()->_acc, forceUnpinnedCopy);
-#else
-                    crit->_av.copy_ext      (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceUnpinnedCopy);
-#endif
 
                 } else {
-#if USE_COPY_EXT_V2
                     crit->_av.copy_async_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, &copyDevice->getDevice()->_acc);
-#else
-                    crit->_av.copy_async(src, dst, sizeBytes);
-#endif
                 }
             } catch (Kalmar::runtime_exception) {
                 throw ihipException(hipErrorRuntimeOther);
@@ -2056,13 +2046,8 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes
             // Perform slow synchronous copy:
             LockedAccessor_StreamCrit_t crit(_criticalData);
 
-            this->ensureHaveQueue(crit);
 
-#if USE_COPY_EXT_V2
             crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? &copyDevice->getDevice()->_acc : nullptr, forceUnpinnedCopy);
-#else
-            crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceUnpinnedCopy);
-#endif
         }
     }
 }
@@ -2115,7 +2100,6 @@ hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc)
 
 
 //---
-// Warning - with HIP_MAX_QUEUES!=0 there is no mechanism to prevent accelerator_view from being re-assigned...
 hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av)
 {
     HIP_INIT_API(stream, av);
diff --git a/projects/hip/src/hip_hcc_internal.h b/projects/hip/src/hip_hcc_internal.h
index 9c17c6e98c..c3f8b72311 100644
--- a/projects/hip/src/hip_hcc_internal.h
+++ b/projects/hip/src/hip_hcc_internal.h
@@ -66,6 +66,8 @@ extern int HIP_COHERENT_HOST_ALLOC;
 // Chicken bits for disabling functionality to work around potential issues:
 extern int HIP_SYNC_HOST_ALLOC;
 
+extern int HIP_SYNC_NULL_STREAM;
+
 // TODO - remove when this is standard behavior.
 extern int HCC_OPT_FLUSH;
 
@@ -187,10 +189,11 @@ extern const char *API_COLOR_END;
 
 
 //---
-//HIP Trace modes
-#define TRACE_ALL 0 // 0x1
-#define TRACE_CMD 1 // 0x2
-#define TRACE_MEM 2 // 0x4
+//HIP Trace modes - use with HIP_TRACE_API=...
+#define TRACE_ALL  0 // 0x1
+#define TRACE_KCMD 1 // 0x2, kernel command
+#define TRACE_MCMD 2 // 0x4, memory command
+#define TRACE_MEM  3 // 0x8, memory allocation or deallocation.
 
 
 //---
@@ -275,12 +278,13 @@ extern void recordApiTrace(std::string *fullStr, const std::string &apiStr);
     API_TRACE(0, __VA_ARGS__);
 
 
-// Like above, but will trace with DB_CMD.
-// Replace HIP_INIT_API with this call inside important APIs that launch work on the GPU:
+// Like above, but will trace with a specified "special" bit.
+// Replace HIP_INIT_API with this call inside HIP APIs that launch work on the GPU:
 // kernel launches, copy commands, memory sets, etc.
-#define HIP_INIT_CMD_API(...) \
+#define HIP_INIT_SPECIAL_API(tbit, ...) \
     HIP_INIT()\
-    API_TRACE((HIP_TRACE_API&(1<<TRACE_CMD)), __VA_ARGS__);
+    API_TRACE((HIP_TRACE_API&(1<<tbit)), __VA_ARGS__);
+
 
 // This macro should be called at the end of every HIP API, and only at the end of top-level hip APIS (not internal hip)
 // It has dual function: logs the last error returned for use by hipGetLastError,
@@ -443,7 +447,6 @@ public:
     ihipStreamCriticalBase_t(ihipStream_t *parentStream, hc::accelerator_view av) :
         _kernelCnt(0),
         _av(av),
-        _hasQueue(true),
         _parent(parentStream)
     {
     };
@@ -469,11 +472,6 @@ public:
     uint32_t                    _kernelCnt;    // Count of inflight kernels in this stream.  Reset at ::wait().
 
     hc::accelerator_view        _av;
-
-    // True if the stream has an allocated queue (accelerato_view) for its use:
-    // Always true at ihipStream creation but queue may later be stolen.
-    // This acts as a valid bit for the _av.
-    bool                        _hasQueue;
 private:
 };
 
@@ -519,8 +517,10 @@ public:
     void                 locked_waitEvent(hipEvent_t event);
     void                 locked_recordEvent(hipEvent_t event);
 
+    ihipStreamCritical_t  &criticalData() { return _criticalData; };
 
     //---
+    hc::hcWaitMode waitMode() const;
 
     // Use this if we already have the stream critical data mutex:
     void                 wait(LockedAccessor_StreamCrit_t &crit);
@@ -538,12 +538,13 @@ public:
     const ihipDevice_t *     getDevice() const;
     ihipCtx_t *              getCtx() const;
 
-    void ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCrit);
+    // Before calling this function, stream must be resolved from "0" to the actual stream:
+    bool isDefaultStream() const { return _id == 0; };
 
 public:
     //---
     //Public member vars - these are set at initialization and never change:
-    SeqNum_t                    _id;   // monotonic sequence ID
+    SeqNum_t                    _id;   // monotonic sequence ID.  0 is the default stream.
     unsigned                    _flags;
 
 
@@ -562,6 +563,7 @@ private:
 
     void addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes);
 
+
 public: // TODO - move private
     // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t
     ihipStreamCritical_t        _criticalData;
@@ -584,10 +586,10 @@ private: // Data
 //----
 // Internal event structure:
 enum hipEventStatus_t {
-    hipEventStatusUnitialized = 0, // event is unutilized, must be "Created" before use.
-    hipEventStatusCreated     = 1,
-    hipEventStatusRecording   = 2, // event has been enqueued to record something.
-    hipEventStatusRecorded    = 3, // event has been recorded - timestamps are valid.
+    hipEventStatusUnitialized = 0, // event is uninitialized, must be "Created" before use.
+    hipEventStatusCreated     = 1, // event created, but not yet Recorded
+    hipEventStatusRecording   = 2, // event has been recorded into a stream but not completed yet.
+    hipEventStatusComplete    = 3, // event has been recorded - timestamps are valid.
 } ;
 
 // TODO - rename to ihip type of some kind
@@ -601,8 +603,8 @@ enum ihipEventType_t {
 class ihipEvent_t {
 public:
     ihipEvent_t(unsigned flags);
-    void attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType);
-    void setTimestamp();
+    void attachToCompletionFuture(const hc::completion_future *cf, hipStream_t stream, ihipEventType_t eventType);
+    void refereshEventStatus();
     uint64_t timestamp() const { return _timestamp; } ;
     ihipEventType_t type() const { return _type; };
 
@@ -784,11 +786,7 @@ public: // Functions:
     void locked_removeStream(ihipStream_t *s);
     void locked_reset();
     void locked_waitAllStreams();
-    void locked_syncDefaultStream(bool waitOnSelf);
-
-    // Will allocate a queue and assign it to the needyStream:
-    hc::accelerator_view  stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream);
-    hc::accelerator_view createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit);
+    void locked_syncDefaultStream(bool waitOnSelf, bool syncHost);
 
     ihipCtxCritical_t  &criticalData() { return _criticalData; };
 
@@ -826,6 +824,7 @@ private:  // Critical data, protected with locked access:
 extern std::once_flag hip_initialized;
 extern unsigned g_deviceCnt;
 extern hsa_agent_t g_cpu_agent ;   // the CPU agent.
+extern hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents.
 
 //=================================================================================================
 // Extern functions:
diff --git a/projects/hip/src/hip_memory.cpp b/projects/hip/src/hip_memory.cpp
index 821f64bc76..3ab7713afa 100644
--- a/projects/hip/src/hip_memory.cpp
+++ b/projects/hip/src/hip_memory.cpp
@@ -59,31 +59,40 @@ hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyK
 }
 
 // return 0 on success or -1 on error:
-int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags)
+int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags)
 {
     int ret = 0;
 
     auto device = ctx->getWriteableDevice();
 
     hc::am_memtracker_update(ptr, device->_deviceId, hipFlags);
-    int peerCnt=0;
-    {
-        LockedAccessor_CtxCrit_t crit(ctx->criticalData());
-        // the peerCnt always stores self so make sure the trace actually
-        peerCnt = crit->peerCnt();
-        tprintf(DB_MEM, "  allow access to %d other peer(s)\n", peerCnt-1);
-        if (peerCnt > 1) {
 
-            //printf ("peer self access\n");
+    if (shareWithAll) {
+        hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr);
+        tprintf (DB_MEM, "    allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); 
+        if (s != HSA_STATUS_SUCCESS) {
+            ret = -1;
+        }
+    } else {
+        int peerCnt=0;
+        {
+            LockedAccessor_CtxCrit_t crit(ctx->criticalData());
+            // the peerCnt always stores self so make sure the trace actually
+            peerCnt = crit->peerCnt();
+            tprintf(DB_MEM, "  allow access to %d other peer(s)\n", peerCnt-1);
+            if (peerCnt > 1) {
 
-            // TODOD - remove me:
-            for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) {
-                tprintf (DB_MEM, "    allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":"");
-            };
+                //printf ("peer self access\n");
 
-            hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr);
-            if (s != HSA_STATUS_SUCCESS) {
-                ret = -1;
+                // TODOD - remove me:
+                for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) {
+                    tprintf (DB_MEM, "    allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":"");
+                };
+
+                hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr);
+                if (s != HSA_STATUS_SUCCESS) {
+                    ret = -1;
+                }
             }
         }
     }
@@ -96,7 +105,7 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags)
 
 // Allocate a new pointer with am_alloc and share with all valid peers.
 // Returns null-ptr if a memory error occurs (either allocation or sharing)
-void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsigned amFlags, unsigned hipFlags)
+void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, bool shareWithAll, unsigned amFlags, unsigned hipFlags)
 {
 
     void *ptr = nullptr;
@@ -104,11 +113,11 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsig
     auto device = ctx->getWriteableDevice();
 
     ptr = hc::am_alloc(sizeBytes, device->_acc, amFlags);
-    tprintf(DB_MEM, " alloc %s ptr:%p size:%zu on dev:%d\n",
-            msg, ptr, sizeBytes, device->_deviceId);
+    tprintf(DB_MEM, " alloc %s ptr:%p-%p size:%zu on dev:%d\n",
+            msg, ptr, static_cast<char*>(ptr)+sizeBytes, sizeBytes, device->_deviceId);
 
     if (ptr != nullptr) {
-        int r = sharePtr(ptr, ctx, hipFlags);
+        int r = sharePtr(ptr, ctx, shareWithAll, hipFlags);
         if (r != 0) {
             ptr = nullptr;
         }
@@ -193,7 +202,8 @@ hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsi
         hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0);
         am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, hostPointer);
         if (status == AM_SUCCESS) {
-            *devicePointer = amPointerInfo._devicePointer;
+            *devicePointer = static_cast<char*>(amPointerInfo._devicePointer) + (static_cast<char*>(hostPointer) - static_cast<char*>(amPointerInfo._hostPointer)) ;
+            tprintf(DB_MEM, " host_ptr=%p returned device_pointer=%p\n", hostPointer, *devicePointer);
         } else {
             e = hipErrorMemoryAllocation;
         }
@@ -204,7 +214,7 @@ hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsi
 
 hipError_t hipMalloc(void** ptr, size_t sizeBytes)
 {
-    HIP_INIT_API(ptr, sizeBytes);
+    HIP_INIT_SPECIAL_API((TRACE_MEM), ptr, sizeBytes);
     HIP_SET_DEVICE();
     hipError_t  hip_status = hipSuccess;
 
@@ -220,7 +230,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)
 
     } else {
         auto device = ctx->getWriteableDevice();
-        *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx,  0/*amFlags*/, 0/*hipFlags*/);
+        *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx,  false/*shareWithAll*/, 0/*amFlags*/, 0/*hipFlags*/);
 
         if(sizeBytes  && (*ptr == NULL)){
             hip_status = hipErrorMemoryAllocation;
@@ -235,7 +245,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)
 
 hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
 {
-    HIP_INIT_CMD_API(ptr, sizeBytes, flags);
+    HIP_INIT_SPECIAL_API((TRACE_MEM), ptr, sizeBytes, flags);
     HIP_SET_DEVICE();
     hipError_t hip_status = hipSuccess;
 
@@ -253,20 +263,42 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
     } else {
         unsigned trueFlags = flags;
         if (flags == hipHostMallocDefault) {
-            trueFlags = hipHostMallocMapped | hipHostMallocWriteCombined;
+            // HCC/ROCM provide a modern system with unified memory and should set both of these flags by default:
+            trueFlags = hipHostMallocMapped | hipHostMallocPortable;
         }
 
-        const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | hipHostMallocWriteCombined;
 
-        if (flags & ~supportedFlags) {
+        const unsigned supportedFlags = hipHostMallocPortable 
+                                      | hipHostMallocMapped 
+                                      | hipHostMallocWriteCombined 
+                                      | hipHostMallocCoherent 
+                                      | hipHostMallocNonCoherent;
+
+
+        const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent;
+
+        if ((flags & ~supportedFlags) ||
+            ((flags & coherencyFlags) == coherencyFlags)) {
+            *ptr = nullptr;
+             // can't specify unsupported flags, can't specify both Coherent + NonCoherent
             hip_status = hipErrorInvalidValue;
-        }
-        else {
+        } else {
             auto device = ctx->getWriteableDevice();
-            unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned;
+            
+            unsigned amFlags = 0;
+            if (flags & hipHostMallocCoherent) {
+                amFlags = amHostCoherent;
+            } else if (flags & hipHostMallocNonCoherent) {
+                amFlags = amHostPinned;
+            } else {
+                // depends on env variables:
+                amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned;
+            }
+
+
+            *ptr = hip_internal::allocAndSharePtr((amFlags & amHostCoherent) ? "finegrained_host":"pinned_host",
+                                                  sizeBytes, ctx, (trueFlags & hipHostMallocPortable) /*shareWithAll*/, amFlags, flags);
 
-            *ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? "finegrained_host":"pinned_host",
-                                                  sizeBytes, ctx, amFlags, flags);
             if(sizeBytes  && (*ptr == NULL)){
                 hip_status = hipErrorMemoryAllocation;
             }
@@ -296,7 +328,7 @@ hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags)
 // width in bytes
 hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height)
 {
-    HIP_INIT_CMD_API(ptr, pitch, width, height);
+    HIP_INIT_SPECIAL_API((TRACE_MEM), ptr, pitch, width, height);
     HIP_SET_DEVICE();
     hipError_t  hip_status = hipSuccess;
 
@@ -314,7 +346,7 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height
         auto device = ctx->getWriteableDevice();
 
         const unsigned am_flags = 0;
-        *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, am_flags, 0);
+        *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, false/*shareWithAll*/, am_flags, 0);
 
         if (sizeBytes && (*ptr == NULL)) {
             hip_status = hipErrorMemoryAllocation;
@@ -337,7 +369,7 @@ hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannel
 hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
         size_t width, size_t height, unsigned int flags)
 {
-    HIP_INIT_CMD_API(array, desc, width, height, flags);
+    HIP_INIT_SPECIAL_API((TRACE_MEM), array, desc, width, height, flags);
     HIP_SET_DEVICE();
     hipError_t  hip_status = hipSuccess;
 
@@ -373,7 +405,7 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
                 hip_status = hipErrorUnknown;
                 break;
         }
-        *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, am_flags, 0);
+        *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false/*shareWithAll*/, am_flags, 0);
         if (size && (*ptr == NULL)) {
             hip_status = hipErrorMemoryAllocation;
         }
@@ -478,7 +510,7 @@ hipError_t hipHostUnregister(void *hostPtr)
 
 hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind)
 {
-    HIP_INIT_CMD_API(symbolName, src, count, offset, kind);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, src, count, offset, kind);
 
     if(symbolName == nullptr)
     {
@@ -513,7 +545,7 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t cou
 
 hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count, size_t offset, hipMemcpyKind kind)
 {
-    HIP_INIT_CMD_API(symbolName, dst, count, offset, kind);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, dst, count, offset, kind);
 
     if(symbolName == nullptr)
     {
@@ -548,7 +580,7 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count,
 
 hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind, hipStream_t stream)
 {
-    HIP_INIT_CMD_API(symbolName, src, count, offset, kind, stream);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, src, count, offset, kind, stream);
 
     if(symbolName == nullptr)
     {
@@ -586,7 +618,7 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_
 
 hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t count, size_t offset, hipMemcpyKind kind, hipStream_t stream)
 {
-    HIP_INIT_CMD_API(symbolName, dst, count, offset, kind, stream);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, dst, count, offset, kind, stream);
 
     if(symbolName == nullptr)
     {
@@ -625,7 +657,7 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co
 //---
 hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind)
 {
-    HIP_INIT_CMD_API(dst, src, sizeBytes, kind);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes, kind);
 
     hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
 
@@ -647,7 +679,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind
 
 hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes)
 {
-    HIP_INIT_CMD_API(dst, src, sizeBytes);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes);
 
     hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
 
@@ -669,7 +701,7 @@ hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes)
 
 hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes)
 {
-    HIP_INIT_CMD_API(dst, src, sizeBytes);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes);
 
     hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
 
@@ -691,7 +723,7 @@ hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes)
 
 hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes)
 {
-    HIP_INIT_CMD_API(dst, src, sizeBytes);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes);
 
     hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
 
@@ -713,7 +745,7 @@ hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeByte
 
 hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes)
 {
-    HIP_INIT_CMD_API(dst, src, sizeBytes);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes);
 
     hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
 
@@ -738,7 +770,7 @@ hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes)
 
 hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
 {
-    HIP_INIT_CMD_API(dst, src, sizeBytes, kind, stream);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes, kind, stream);
 
     return ihipLogStatus(hip_internal::memcpyAsync(dst, src, sizeBytes, kind, stream));
 
@@ -747,21 +779,21 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp
 
 hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, void* src, size_t sizeBytes, hipStream_t stream)
 {
-    HIP_INIT_CMD_API(dst, src, sizeBytes, stream);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes, stream);
 
     return ihipLogStatus(hip_internal::memcpyAsync(dst, src, sizeBytes, hipMemcpyHostToDevice, stream));
 }
 
 hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes, hipStream_t stream)
 {
-    HIP_INIT_CMD_API(dst, src, sizeBytes, stream);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes, stream);
 
     return ihipLogStatus(hip_internal::memcpyAsync(dst, src, sizeBytes, hipMemcpyDeviceToDevice, stream));
 }
 
 hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, hipStream_t stream)
 {
-    HIP_INIT_CMD_API(dst, src, sizeBytes, stream);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes, stream);
 
     return ihipLogStatus(hip_internal::memcpyAsync(dst, src, sizeBytes, hipMemcpyDeviceToHost, stream));
 }
@@ -770,7 +802,7 @@ hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, h
 hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,
         size_t width, size_t height, hipMemcpyKind kind) {
 
-    HIP_INIT_CMD_API(dst, dpitch, src, spitch, width, height, kind);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, dpitch, src, spitch, width, height, kind);
 
     if(width > dpitch || width > spitch)
         return ihipLogStatus(hipErrorUnknown);
@@ -793,10 +825,28 @@ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,
     return ihipLogStatus(e);
 }
 
+hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,
+        size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) {
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, dpitch, src, spitch, width, height, kind, stream);
+    if(width > dpitch || width > spitch)
+        return ihipLogStatus(hipErrorUnknown);
+    hipError_t e = hipSuccess;
+    try {
+        for(int i = 0; i < height; ++i) {
+            e = hip_internal::memcpyAsync((unsigned char*)dst + i*dpitch, (unsigned char*)src + i*spitch, width, kind,stream);
+        }
+    }
+    catch (ihipException ex) {
+        e = ex._code;
+    }
+
+    return ihipLogStatus(e);
+}
+
 hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src,
         size_t spitch, size_t width, size_t height, hipMemcpyKind kind) {
 
-    HIP_INIT_CMD_API(dst, wOffset, hOffset, src, spitch, width, height, kind);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, wOffset, hOffset, src, spitch, width, height, kind);
 
     hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
 
@@ -849,7 +899,7 @@ hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, con
 hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset,
         const void* src, size_t count, hipMemcpyKind kind) {
 
-    HIP_INIT_CMD_API(dst, wOffset, hOffset, src, count, kind);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, wOffset, hOffset, src, count, kind);
 
     hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
 
@@ -908,7 +958,7 @@ ihipMemsetKernel(hipStream_t stream,
 // TODO-sync: function is async unless target is pinned host memory - then these are fully sync.
 hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t stream )
 {
-    HIP_INIT_CMD_API(dst, value, sizeBytes, stream);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, value, sizeBytes, stream);
 
     hipError_t e = hipSuccess;
 
@@ -917,7 +967,6 @@ hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t s
     if (stream) {
         auto crit = stream->lockopen_preKernelCommand();
 
-        stream->ensureHaveQueue(crit);
 
         hc::completion_future cf ;
 
@@ -958,7 +1007,7 @@ hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t s
 
 hipError_t hipMemset(void* dst, int  value, size_t sizeBytes )
 {
-    HIP_INIT_CMD_API(dst, value, sizeBytes);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, value, sizeBytes);
 
     hipError_t e = hipSuccess;
 
@@ -969,7 +1018,6 @@ hipError_t hipMemset(void* dst, int  value, size_t sizeBytes )
     if (stream) {
         auto crit = stream->lockopen_preKernelCommand();
 
-        stream->ensureHaveQueue(crit);
         hc::completion_future cf ;
 
         if ((sizeBytes & 0x3) == 0) {
@@ -1011,7 +1059,7 @@ hipError_t hipMemset(void* dst, int  value, size_t sizeBytes )
 
 hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char  value, size_t sizeBytes )
 {
-    HIP_INIT_CMD_API(dst, value, sizeBytes);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, value, sizeBytes);
 
     hipError_t e = hipSuccess;
 
@@ -1022,7 +1070,6 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char  value, size_t sizeByte
     if (stream) {
         auto crit = stream->lockopen_preKernelCommand();
 
-        stream->ensureHaveQueue(crit);
         hc::completion_future cf ;
 
         if ((sizeBytes & 0x3) == 0) {
@@ -1118,7 +1165,7 @@ hipError_t hipMemPtrGetInfo(void *ptr, size_t *size)
 
 hipError_t hipFree(void* ptr)
 {
-    HIP_INIT_API(ptr);
+    HIP_INIT_SPECIAL_API((TRACE_MEM), ptr);
 
     hipError_t hipStatus = hipErrorInvalidDevicePointer;
 
@@ -1146,7 +1193,7 @@ hipError_t hipFree(void* ptr)
 
 hipError_t hipHostFree(void* ptr)
 {
-    HIP_INIT_API(ptr);
+    HIP_INIT_SPECIAL_API((TRACE_MEM), ptr);
 
     // Synchronize to ensure all work has finished.
     ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish.
@@ -1180,7 +1227,7 @@ hipError_t hipFreeHost(void* ptr)
 
 hipError_t hipFreeArray(hipArray* array)
 {
-    HIP_INIT_API(array);
+    HIP_INIT_SPECIAL_API((TRACE_MEM), array);
 
     hipError_t hipStatus = hipErrorInvalidDevicePointer;
 
diff --git a/projects/hip/src/hip_module.cpp b/projects/hip/src/hip_module.cpp
index b359e7a63c..2a3bfabc28 100644
--- a/projects/hip/src/hip_module.cpp
+++ b/projects/hip/src/hip_module.cpp
@@ -352,14 +352,14 @@ hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char
         *func = sym;
         hmod->funcTrack.push_back(*func);
     }
-    return ihipLogStatus(ret);
+    return ret;
 }
 
 
 hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,
                                 const char *name){
     HIP_INIT_API(hfunc, hmod, name);
-    return ihipModuleGetSymbol(hfunc, hmod, name);
+    return ihipLogStatus(ihipModuleGetSymbol(hfunc, hmod, name));
 }
 
 
@@ -455,10 +455,10 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
 
 
         if (startEvent) {
-            startEvent->attachToCompletionFuture(&cf, hipEventTypeStartCommand);
+            startEvent->attachToCompletionFuture(&cf, hStream, hipEventTypeStartCommand);
         }
         if (stopEvent) {
-            stopEvent->attachToCompletionFuture (&cf, hipEventTypeStopCommand);
+            stopEvent->attachToCompletionFuture (&cf, hStream, hipEventTypeStopCommand);
         }
 
 
@@ -525,7 +525,6 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
     }
 }
 
-
 hipError_t hipModuleLoadData(hipModule_t *module, const void *image)
 {
     HIP_INIT_API(module, image);
@@ -575,3 +574,8 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image)
     }
     return ihipLogStatus(ret);
 }
+
+hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues)
+{
+    return hipModuleLoadData(module, image);
+}
diff --git a/projects/hip/src/hip_stream.cpp b/projects/hip/src/hip_stream.cpp
index d7f8717725..40aade28b9 100644
--- a/projects/hip/src/hip_stream.cpp
+++ b/projects/hip/src/hip_stream.cpp
@@ -49,7 +49,7 @@ hipError_t ihipStreamCreate(hipStream_t *stream, unsigned int flags)
             // Obtain mutex access to the device critical data, release by destructor
             LockedAccessor_CtxCrit_t  ctxCrit(ctx->criticalData());
 
-            auto istream = new ihipStream_t(ctx, ctx->createOrStealQueue(ctxCrit), flags);
+            auto istream = new ihipStream_t(ctx, acc.create_view(), flags);
 
             ctxCrit->addStream(istream);
             *stream = istream;
@@ -93,20 +93,17 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int
 
     } else if (event->_state != hipEventStatusUnitialized) {
 
-        bool fastWait = false;
-
         if (stream != hipStreamNull) {
+
+            // This will user create_blocking_marker to wait on the specified queue.
             stream->locked_waitEvent(event);
 
-            fastWait = true; // don't use the slow host-side synchronization.
-        }
-
-        if (!fastWait) {
+        } else {
             // TODO-hcc Convert to use create_blocking_marker(...) functionality.
             // Currently we have a super-conservative version of this - block on host, and drain the queue.
             // This should create a barrier packet in the target queue.
+            // TODO-HIP_SYNC_NULL_STREAM
             stream->locked_wait();
-            e = hipSuccess;
         }
     } // else event not recorded, return immediately and don't create marker.
 
@@ -129,9 +126,7 @@ hipError_t hipStreamQuery(hipStream_t stream)
 
     {
         LockedAccessor_StreamCrit_t crit(stream->_criticalData);
-        if (crit->_hasQueue) {
-            pendingOps = crit->_av.get_pending_async_ops();
-        }
+        pendingOps = crit->_av.get_pending_async_ops();
     }
 
 
@@ -148,10 +143,11 @@ hipError_t hipStreamSynchronize(hipStream_t stream)
 
     hipError_t e = hipSuccess;
 
-    if (stream == NULL) {
+    if (stream == hipStreamNull) {
         ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
-        ctx->locked_syncDefaultStream(true/*waitOnSelf*/);
+        ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true/*syncToHost*/);
     } else {
+		// note this does not synchornize with the NULL stream:
         stream->locked_wait();
         e = hipSuccess;
     }
@@ -173,20 +169,18 @@ hipError_t hipStreamDestroy(hipStream_t stream)
 
     //--- Drain the stream:
     if (stream == NULL) {
-        ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
-        ctx->locked_syncDefaultStream(true/*waitOnSelf*/);
+        e = hipErrorInvalidResourceHandle; // TODO - review - what happens if try to destroy null stream
     } else {
         stream->locked_wait();
-        e = hipSuccess;
-    }
 
-    ihipCtx_t *ctx = stream->getCtx();
+        ihipCtx_t *ctx = stream->getCtx();
 
-    if (ctx) {
-        ctx->locked_removeStream(stream);
-        delete stream;
-    } else {
-        e = hipErrorInvalidResourceHandle;
+        if (ctx) {
+            ctx->locked_removeStream(stream);
+            delete stream;
+        } else {
+            e = hipErrorInvalidResourceHandle;
+        }
     }
 
     return ihipLogStatus(e);
@@ -200,7 +194,7 @@ hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int *flags)
 
     if (flags == NULL) {
         return ihipLogStatus(hipErrorInvalidValue);
-    } else if (stream == NULL) {
+    } else if (stream == hipStreamNull) {
         return ihipLogStatus(hipErrorInvalidResourceHandle);
     } else {
         *flags = stream->_flags;
diff --git a/projects/hip/src/math_functions.cpp b/projects/hip/src/math_functions.cpp
index 3472216309..f66f0a4312 100644
--- a/projects/hip/src/math_functions.cpp
+++ b/projects/hip/src/math_functions.cpp
@@ -830,16 +830,6 @@ __host__ double erfcinv(double y)
     return __hip_host_erfcinv(y);
 }
 
-__host__ float erfinvf(float x)
-{
-    return __hip_host_erfinvf(x);
-}
-
-__host__ double erfinv(double x)
-{
-    return __hip_host_erfinv(x);
-}
-
 __host__ double fdivide(double x, double y)
 {
     return x/y;
@@ -947,15 +937,15 @@ __host__ void sincospi(double x, double *sptr, double *cptr)
     *cptr = std::cos(HIP_PI*x);
 }
 
-__host__ float normcdfinvf(float x)
-{
-    return std::sqrt(2) * erfinv(2*x-1);
-}
+//__host__ float normcdfinvf(float x)
+//{
+//    return std::sqrt(2) * erfinvf(2*x-1);
+//}
 
-__host__ double normcdfinv(double x)
-{
-    return std::sqrt(2) * erfinv(2*x-1);
-}
+//__host__ double normcdfinv(double x)
+//{
+//    return std::sqrt(2) * erfinv(2*x-1);
+//}
 
 __host__ float nextafterf(float x, float y)
 {
diff --git a/projects/hip/tests/README.md b/projects/hip/tests/README.md
index 223bd149dc..cb41cc10cd 100644
--- a/projects/hip/tests/README.md
+++ b/projects/hip/tests/README.md
@@ -59,5 +59,9 @@ Find the test and commandline that fail:
 grep -IR hipMemcpy-modes -IR ../tests/
 ../tests/src/runtimeApi/memory/hipMemcpy.cpp: * RUN_NAMED: %t hipMemcpy-modes --tests 0x1
 
+# Guidelines for adding new tests
 
+- Prefer to enhance an existing test as opposed to writing a new one. Tests have overhead to start and many small tests spend precious test time on startup and initialization issues.
+- Make the test run standalone without requirement for command-line arguments.  THis makes it easier to debug since the name of the test is shown in the test report and if you know the name of the test you can the run the test.
+- For long-running tests or tests with multiple phases, consider using the --tests option as an optional mechanism to allow debuggers to start with the failing subset of the test.
 
diff --git a/projects/hip/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp b/projects/hip/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp
index df5dad3968..f4f7ab0479 100644
--- a/projects/hip/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp
+++ b/projects/hip/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp
@@ -99,7 +99,7 @@ __device__ void double_precision_math_functions()
     normcdf(0.0);
     normcdfinv(1.0);
     pow(1.0, 0.0);
-    rcbrt(1.0);
+    //rcbrt(1.0);
     remainder(2.0, 1.0);
 //    remquo(1.0, 2.0, &iX);
     rhypot(0.0, 1.0);
@@ -109,8 +109,8 @@ __device__ void double_precision_math_functions()
     rnorm4d(0.0, 0.0, 0.0, 1.0);
     round(0.0);
     rsqrt(1.0);
-    scalbln(0.0, 1);
-    scalbn(0.0, 1);
+    //scalbln(0.0, 1);
+    //scalbn(0.0, 1);
     signbit(1.0);
     sin(0.0);
     sincos(0.0, &fX, &fY);
diff --git a/projects/hip/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp b/projects/hip/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp
index 53ccd2251f..de3dec35ef 100644
--- a/projects/hip/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp
+++ b/projects/hip/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp
@@ -100,7 +100,7 @@ __device__ void single_precision_math_functions()
     normcdfinvf(1.0f);
     fX = 1.0f; normf(1, &fX);
     powf(1.0f, 0.0f);
-    rcbrtf(1.0f);
+    //rcbrtf(1.0f);
     remainderf(2.0f, 1.0f);
     //remquof(1.0f, 2.0f, &iX);
     rhypotf(0.0f, 1.0f);
@@ -110,8 +110,8 @@ __device__ void single_precision_math_functions()
     fX = 1.0f; rnormf(1, &fX);
     roundf(0.0f);
     rsqrtf(1.0f);
-    scalblnf(0.0f, 1);
-    scalbnf(0.0f, 1);
+    //scalblnf(0.0f, 1);
+    //scalbnf(0.0f, 1);
     signbit(1.0f);
     sincosf(0.0f, &fX, &fY);
     sincospif(0.0f, &fX, &fY);
diff --git a/projects/hip/tests/src/hipPointerAttrib.cpp b/projects/hip/tests/src/hipPointerAttrib.cpp
index fb7832d9a6..7a2ab64bea 100644
--- a/projects/hip/tests/src/hipPointerAttrib.cpp
+++ b/projects/hip/tests/src/hipPointerAttrib.cpp
@@ -99,7 +99,7 @@ inline int zrand(int max)
 
 
 //=================================================================================================
-// Functins to run tests
+// Functions to run tests
 //=================================================================================================
 //--
 //Run through a couple simple cases to test lookups and host pointer arithmetic:
diff --git a/projects/hip/tests/src/kernel/inline_asm_vadd.cpp b/projects/hip/tests/src/kernel/inline_asm_vadd.cpp
new file mode 100644
index 0000000000..7a941d31af
--- /dev/null
+++ b/projects/hip/tests/src/kernel/inline_asm_vadd.cpp
@@ -0,0 +1,126 @@
+/* Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+associated documentation files (the "Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the
+following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
+LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
+EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+/* HIT_START
+ * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvcc
+ * RUN: %t
+ * HIT_END
+ */
+
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+#define NUM       1024
+
+#define THREADS_PER_BLOCK_X  4
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void vadd_asm(hipLaunchParm lp,
+                                float *out,
+                                float *in)
+{
+    int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+
+    asm volatile ("v_add_f32_e32 %0, %1, %2" : "=v" (out[i]) : "v"(in[i]),"v" (out[i]));
+}
+
+// CPU implementation of Vector Result
+void addCPUReference(
+    float * output,
+    float * input)
+{
+    for(unsigned int j=0; j < NUM; j++)
+    {
+        
+        output[j]= input[j] + output[j];
+    }
+}
+
+int main(){
+
+    float* VectorA;
+    float* ResultVector;
+    float* VectorB;
+
+    float* gpuVector;
+    float* gpuResultVector;
+
+    int i;
+    int errors;
+
+    VectorA = (float*)malloc(NUM * sizeof(float));
+    ResultVector = (float*)malloc(NUM * sizeof(float));
+    VectorB = (float*)malloc(NUM * sizeof(float));
+
+    // initialize the input data
+    for (i = 0; i < NUM; i++) {
+    VectorA[i] = (float)i*10.0f;
+    VectorB[i] = (float)i*30.0f;
+    }
+
+    // allocate the memory on the device side
+    hipMalloc((void**)&gpuVector, NUM * sizeof(float));
+    hipMalloc((void**)&gpuResultVector, NUM * sizeof(float));
+
+    // Memory transfer from host to device
+    hipMemcpy(gpuVector, VectorA, NUM*sizeof(float), hipMemcpyHostToDevice);
+    hipMemcpy(gpuResultVector, VectorB, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+    // Lauching kernel from host
+    hipLaunchKernel(vadd_asm,
+                    dim3(NUM/THREADS_PER_BLOCK_X),
+                    dim3(THREADS_PER_BLOCK_X),
+                    0, 0,
+                    gpuResultVector , gpuVector);
+
+    // Memory transfer from device to host
+    hipMemcpy(ResultVector, gpuResultVector, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+    // CPU Result computation
+    addCPUReference(VectorB, VectorA);
+
+    // verify the results
+    errors = 0;
+    double eps = 1.0E-3;
+    for (i = 0; i < NUM; i++) {
+    if (std::abs(ResultVector[i] - VectorB[i]) > eps ) {
+        errors++;
+        }
+    }
+    if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+    } else {
+        printf ("PASSED!\n");
+    }
+
+    //free the resources on device side
+    hipFree(gpuVector);
+    hipFree(gpuResultVector);
+
+    hipDeviceReset();
+
+    //free the resources on host side
+    free(VectorA);
+    free(ResultVector);
+    free(VectorB);
+
+    return errors;
+}
diff --git a/projects/hip/tests/src/kernel/inline_asm_vmac.cpp b/projects/hip/tests/src/kernel/inline_asm_vmac.cpp
new file mode 100644
index 0000000000..1b6941c249
--- /dev/null
+++ b/projects/hip/tests/src/kernel/inline_asm_vmac.cpp
@@ -0,0 +1,125 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+#define NUM       1024
+
+#define THREADS_PER_BLOCK_X  4
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void vmac_asm(hipLaunchParm lp,
+                                float *out,
+                                float *in)
+{
+    int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+
+    asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i]));
+}
+
+// CPU implementation of saxpy
+void CPUReference(
+    float * output,
+    float * input)
+{
+    for(unsigned int j=0; j < NUM; j++)
+    {
+        
+        output[j]= a*input[j] + output[j];
+    }
+}
+
+int main(){
+
+    float* VectorA;
+    float* ResultVector;
+    float* VectorB;
+
+    float* gpuVector;
+    float* gpuResultVector;
+
+    const float a = 10.0f
+    int i;
+    int errors;
+
+    VectorA = (float*)malloc(NUM * sizeof(float));
+    ResultVector = (float*)malloc(NUM * sizeof(float));
+    VectorB = (float*)malloc(NUM * sizeof(float));
+
+    // initialize the input data
+    for (i = 0; i < NUM; i++) {
+    VectorA[i] = (float)i*10.0f;
+    VectorB[i] = (float)i*30.0f;
+    }
+
+    // allocate the memory on the device side
+    hipMalloc((void**)&gpuVector, NUM * sizeof(float));
+    hipMalloc((void**)&gpuResultVector, NUM * sizeof(float));
+
+    // Memory transfer from host to device
+    hipMemcpy(gpuVector, VectorA, NUM*sizeof(float), hipMemcpyHostToDevice);
+    hipMemcpy(gpuResultVector, VectorB, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+    // Lauching kernel from host
+    hipLaunchKernel(vmac_asm,
+                    dim3(NUM/THREADS_PER_BLOCK_X),
+                    dim3(THREADS_PER_BLOCK_X),
+                    0, 0,
+                    gpuResultVector , gpuVector);
+
+    // Memory transfer from device to host
+    hipMemcpy(ResultVector, gpuResultVector, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+    // CPU Result computation
+    addCPUReference(VectorB, VectorA);
+
+    // verify the results
+    errors = 0;
+    double eps = 1.0E-3;
+    for (i = 0; i < NUM; i++) {
+    if (std::abs(ResultVector[i] - VectorB[i]) > eps ) {
+        errors++;
+        }
+    }
+    if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+    } else {
+        printf ("PASSED!\n");
+    }
+
+    //free the resources on device side
+    hipFree(gpuVector);
+    hipFree(gpuResultVector);
+
+    hipDeviceReset();
+
+    //free the resources on host side
+    free(VectorA);
+    free(ResultVector);
+    free(VectorB);
+
+    return errors;
+}
diff --git a/projects/hip/tests/src/runtimeApi/event/record_event.cpp b/projects/hip/tests/src/runtimeApi/event/record_event.cpp
new file mode 100644
index 0000000000..bd8a3ada8e
--- /dev/null
+++ b/projects/hip/tests/src/runtimeApi/event/record_event.cpp
@@ -0,0 +1,200 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * RUN: %t
+ * HIT_END
+ */
+
+
+#include "test_common.h"
+
+enum SyncMode {
+    syncNone,
+    syncStream,
+    syncStopEvent,
+};
+
+
+const char *syncModeString(int syncMode) {
+    switch (syncMode) {
+        case syncNone:
+            return "syncNone";
+        case syncStream:
+            return "syncStream";
+        case syncStopEvent:
+            return "syncStopEvent";
+        default:
+            return "unknown";
+    };
+};
+
+
+void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_t stream, int waitStart, SyncMode syncMode)
+{
+    if (!(testMask & p_tests)) {
+        return;
+    }
+    printf ("\ntest 0x%3x: stream=%p waitStart=%d syncMode=%s\n", 
+            testMask, stream, waitStart, syncModeString(syncMode));
+
+    size_t sizeBytes = numElements * sizeof(int);
+
+    int count =100;
+    int init0 = 0;
+    HIPCHECK(hipMemset(C_d, init0, sizeBytes));
+    for (int i=0; i<numElements; i++) {
+        C_h[i] = -1; // initialize
+    }
+
+    hipEvent_t neverCreated=0, neverRecorded, timingDisabled;
+    HIPCHECK(hipEventCreate(&neverRecorded));
+    HIPCHECK(hipEventCreateWithFlags(&timingDisabled, hipEventDisableTiming));
+
+    hipEvent_t start, stop;
+    HIPCHECK(hipEventCreate(&start));
+    HIPCHECK(hipEventCreate(&stop));
+
+    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
+
+    HIPCHECK(hipEventRecord(timingDisabled, stream));
+    // sandwhich a kernel:
+    HIPCHECK(hipEventRecord(start, stream));
+    hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, stream,    C_d, C_h, numElements, count);
+    HIPCHECK(hipEventRecord(stop, stream));
+
+
+    if (waitStart) {
+        HIPCHECK(hipEventSynchronize(start));
+    }
+
+   
+    hipError_t expectedStopError = hipSuccess; 
+
+    // How to wait for the events to finish:
+    switch (syncMode) {
+        case syncNone:
+            expectedStopError = hipErrorNotReady;
+            break;
+        case syncStream:
+            HIPCHECK(hipStreamSynchronize(stream));  // wait for recording to finish...
+            break;
+        case syncStopEvent:
+            HIPCHECK(hipEventSynchronize(stop)); 
+            break;
+        default:
+            assert(0);
+    };
+            
+
+    float t;
+
+    hipError_t e = hipEventElapsedTime(&t, start, start);
+    if ((e != hipSuccess) && (e != hipErrorNotReady))  {
+        failed ("start event not in expected state, was %d=%s\n", e, hipGetErrorName(e));
+    }
+
+    if (e == hipSuccess) 
+        assert (t==0.0f);
+        
+
+    // stop usually ready unless we skipped the synchronization (syncNone)
+    HIPCHECK_API(hipEventElapsedTime(&t, stop, stop), expectedStopError);
+    if (e == hipSuccess) 
+        assert (t==0.0f);
+
+
+    e = hipEventElapsedTime(&t, start, stop);
+    HIPCHECK_API(e, expectedStopError);
+    if (expectedStopError == hipSuccess) 
+        assert (t>0.0f);
+    printf ("time=%6.2f error=%s\n", t, hipGetErrorName(e));
+
+    e = hipEventElapsedTime(&t, stop, start);
+    HIPCHECK_API(e, expectedStopError);
+    if (expectedStopError == hipSuccess) 
+        assert (t<0.0f);
+    printf ("negtime=%6.2f error=%s\n", t, hipGetErrorName(e));
+
+
+
+    {
+        // Check some error conditions for incomplete events:
+        HIPCHECK_API(hipEventElapsedTime(&t, timingDisabled, stop), hipErrorInvalidResourceHandle);
+        HIPCHECK_API(hipEventElapsedTime(&t, start, timingDisabled), hipErrorInvalidResourceHandle);
+
+        HIPCHECK_API(hipEventElapsedTime(&t, neverCreated, stop), hipErrorInvalidResourceHandle);
+        HIPCHECK_API(hipEventElapsedTime(&t, start, neverCreated),  hipErrorInvalidResourceHandle);
+
+        HIPCHECK_API(hipEventElapsedTime(&t, neverRecorded, stop), hipErrorInvalidResourceHandle);
+        HIPCHECK_API(hipEventElapsedTime(&t, start, neverRecorded),  hipErrorInvalidResourceHandle);
+    }
+
+    HIPCHECK(hipEventDestroy(start));
+    HIPCHECK(hipEventDestroy(stop));
+
+    // Clear out everything:
+    HIPCHECK(hipDeviceSynchronize());
+
+    printf ("test:   OK  \n");
+}
+
+
+
+void runTests(int64_t numElements)
+{
+    size_t sizeBytes = numElements * sizeof(int);
+
+    printf ("test: starting sequence with sizeBytes=%zu bytes, %6.2f MB\n", sizeBytes, sizeBytes/1024.0/1024.0);
+
+
+    int *C_h, *C_d;
+    HIPCHECK(hipMalloc(&C_d, sizeBytes));
+    HIPCHECK(hipHostMalloc(&C_h, sizeBytes));
+
+    hipStream_t stream;
+    HIPCHECK(hipStreamCreateWithFlags(&stream, 0x0));
+
+    //for (int waitStart=0; waitStart<2; waitStart++) {
+    for (int waitStart=1; waitStart>=0; waitStart--) {
+        unsigned W = waitStart ? 0x1000:0;
+        test (W | 0x01, C_d, C_h, numElements,  0     , waitStart, syncNone);
+        test (W | 0x02, C_d, C_h, numElements,  stream, waitStart, syncNone);
+        test (W | 0x04, C_d, C_h, numElements,  0     , waitStart, syncStream);
+        test (W | 0x08, C_d, C_h, numElements,  stream, waitStart, syncStream);
+        test (W | 0x10, C_d, C_h, numElements,  0,      waitStart, syncStopEvent);
+        test (W | 0x20, C_d, C_h, numElements,  stream, waitStart, syncStopEvent);
+    }
+
+
+    HIPCHECK(hipStreamDestroy(stream));
+    HIPCHECK(hipFree(C_d));
+    HIPCHECK(hipHostFree(C_h));
+}
+
+
+int main(int argc, char *argv[])
+{
+    HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/);
+
+    runTests(80000000);
+
+    passed();
+}
diff --git a/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp b/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp
index d6b3b05a1d..607e2a9f63 100644
--- a/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp
+++ b/projects/hip/tests/src/runtimeApi/memory/hipHostMalloc.cpp
@@ -21,24 +21,92 @@
    */
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
  * RUN: %t
  * HIT_END
  */
 
+#include <vector>
 #include"test_common.h"
 
 #define LEN 1024*1024
 #define SIZE LEN*sizeof(float)
 
-__global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd){
+__global__ void Add(float *Ad, float *Bd, float *Cd){
     int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     Cd[tx] = Ad[tx] + Bd[tx];
 }
 
+
+__global__ void Set(int *Ad, int val){
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    Ad[tx] = val;
+}
+
+
+#define SYNC_EVENT 0
+#define SYNC_STREAM 1
+#define SYNC_DEVICE 2
+
+std::vector<std::string> syncMsg = {"event", "stream", "device"};
+
+void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMethod, std::string msg)
+{
+    std::cerr << "test: CheckHostPointer " << msg 
+              //<< " HIP_COHERENT_HOST_ALLOC=" << HIP_COHERENT_HOST_ALLOC
+              //<< " HIP_EVENT_SYS_RELEASE=" <<   HIP_EVENT_SYS_RELEASE
+              << " eventFlags = " << std::hex << eventFlags 
+              << ((eventFlags & hipEventReleaseToDevice) ?  " hipEventReleaseToDevice" : "")  
+              << ((eventFlags & hipEventReleaseToSystem) ? " hipEventReleaseToSystem" : "") 
+              << " ptr=" << ptr 
+              << " syncMethod=" << syncMsg[syncMethod] << "\n";
+
+    hipStream_t s;
+    hipEvent_t e;
+
+    // Init:
+    HIPCHECK(hipStreamCreate(&s));
+    HIPCHECK(hipEventCreateWithFlags(&e, eventFlags))
+    dim3 dimBlock(64,1,1);
+    dim3 dimGrid(numElements/dimBlock.x,1,1);
+
+    const int expected = 13;
+
+    // Init array to know state:
+    hipLaunchKernelGGL(Set, dimGrid, dimBlock, 0, 0x0, ptr, -42);
+    HIPCHECK(hipDeviceSynchronize());
+
+    hipLaunchKernelGGL(Set, dimGrid, dimBlock, 0, s, ptr, expected);
+    HIPCHECK(hipEventRecord(e, s));
+
+    // Host waits for event :
+    switch (syncMethod) {
+        case SYNC_EVENT:
+            HIPCHECK(hipEventSynchronize(e));
+            break;
+        case SYNC_STREAM:
+            HIPCHECK(hipStreamSynchronize(s));
+            break;
+        case SYNC_DEVICE:
+            HIPCHECK(hipDeviceSynchronize());
+            break;
+        default:
+            assert(0);
+    };
+            
+    for (int i=0; i<numElements; i++) {
+        if (ptr[i] != expected) {
+            printf ("mismatch at %d: %d != %d\n", i, ptr[i], expected);
+            assert(ptr[i] == expected);
+        }
+    }
+
+    HIPCHECK(hipStreamDestroy(s));
+    HIPCHECK(hipEventDestroy(e));
+};
+
 int main(){
-    float *A, *B, *C;
-    float *Ad, *Bd, *Cd;
+
 
     hipDeviceProp_t prop;
     int device;
@@ -49,26 +117,96 @@ int main(){
         failed("Does support HostPinned Memory");
     }
 
-    HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocWriteCombined | hipHostMallocMapped));
-    HIPCHECK(hipHostMalloc((void**)&B, SIZE, hipHostMallocDefault));
-    HIPCHECK(hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped));
 
-    HIPCHECK(hipHostGetDevicePointer((void**)&Ad, A, 0));
-    HIPCHECK(hipHostGetDevicePointer((void**)&Cd, C, 0));
+    {
+        float *A, *B, *C;
+        float *Ad, *Bd, *Cd;
+        HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocWriteCombined | hipHostMallocMapped));
+        HIPCHECK(hipHostMalloc((void**)&B, SIZE, hipHostMallocDefault));
+        HIPCHECK(hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped));
 
-    for(int i=0;i<LEN;i++){
-        A[i] = 1.0f;
-        B[i] = 2.0f;
+        HIPCHECK(hipHostGetDevicePointer((void**)&Ad, A, 0));
+        HIPCHECK(hipHostGetDevicePointer((void**)&Cd, C, 0));
+
+        for(int i=0;i<LEN;i++){
+            A[i] = 1.0f;
+            B[i] = 2.0f;
+        }
+
+        HIPCHECK(hipMalloc((void**)&Bd, SIZE));
+        HIPCHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice));
+
+        dim3 dimGrid(LEN/512,1,1);
+        dim3 dimBlock(512,1,1);
+
+        hipLaunchKernelGGL(Add, dimGrid, dimBlock, 0, 0, Ad, Bd, Cd);
+
+        HIPCHECK(hipDeviceSynchronize());
+
+        HIPCHECK(hipHostFree(A));
+        HIPCHECK(hipHostFree(B));
+        HIPCHECK(hipHostFree(C));
     }
 
-    HIPCHECK(hipMalloc((void**)&Bd, SIZE));
-    HIPCHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice));
+    {
+        int numElements = 1024*16;
+        size_t sizeBytes = numElements * sizeof (int);
 
-    dim3 dimGrid(LEN/512,1,1);
-    dim3 dimBlock(512,1,1);
+#ifdef __HIP_PLATFORM_HCC__
+        { 
+            // Stimulate error condition:
+            int *A = &numElements;
+            HIPCHECK_API(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocCoherent|hipHostMallocNonCoherent), hipErrorInvalidValue);
 
-    hipLaunchKernel(HIP_KERNEL_NAME(Add), dimGrid, dimBlock, 0, 0, Ad, Bd, Cd);
+            assert (A == 0);
+        }
+#endif
 
+
+        {
+            int *A = nullptr;
+            HIPCHECK(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocNonCoherent));
+            const char *ptrType = "non-coherent"; // TODO
+            CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_DEVICE,   ptrType);
+            CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_STREAM,  ptrType);
+            CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_EVENT,   ptrType);
+
+            // agent-scope releases don't provide host visibility, don't use them here:
+        }
+
+        if (1) { 
+            int *A = nullptr;
+            HIPCHECK(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocCoherent));
+            const char *ptrType = "coherent";
+            CheckHostPointer(numElements, A, hipEventReleaseToDevice, SYNC_DEVICE,   ptrType);
+            CheckHostPointer(numElements, A, hipEventReleaseToDevice, SYNC_STREAM,  ptrType);
+            CheckHostPointer(numElements, A, hipEventReleaseToDevice, SYNC_EVENT,   ptrType);
+
+            CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_DEVICE,   ptrType);
+            CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_STREAM,  ptrType);
+            CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_EVENT,   ptrType);
+        }
+
+
+        // Check defaults:
+        if (1) { 
+            int *A = nullptr;
+            HIPCHECK(hipHostMalloc((void**)&A, sizeBytes));
+            const char *ptrType = "default";
+            CheckHostPointer(numElements, A, 0, SYNC_DEVICE,   ptrType);
+            CheckHostPointer(numElements, A, 0, SYNC_STREAM,  ptrType);
+            CheckHostPointer(numElements, A, 0, SYNC_EVENT,   ptrType);
+            
+            CheckHostPointer(numElements, A, 0, SYNC_DEVICE,   ptrType);
+            CheckHostPointer(numElements, A, 0, SYNC_STREAM,  ptrType);
+            CheckHostPointer(numElements, A, 0, SYNC_EVENT,   ptrType);
+        }
+
+
+
+
+    }
+        
     passed();
 
 }
diff --git a/projects/hip/tests/src/runtimeApi/memory/hipHostRegister.cpp b/projects/hip/tests/src/runtimeApi/memory/hipHostRegister.cpp
index 8cf0979261..3376ee04f1 100644
--- a/projects/hip/tests/src/runtimeApi/memory/hipHostRegister.cpp
+++ b/projects/hip/tests/src/runtimeApi/memory/hipHostRegister.cpp
@@ -19,9 +19,7 @@ THE SOFTWARE.
 
 /* HIT_START
  * BUILD: %t %s ../../test_common.cpp
- * RUN: %t --tests 0x1
- * RUN: %t --tests 0x2
- * RUN: %t --tests 0x4
+ * RUN: %t
  * HIT_END
  */
 
@@ -131,7 +129,7 @@ int main(int argc, char *argv[])
         HIPCHECK(hipMalloc(&Bd, size));
 
         // TODO - set to 128
-#define OFFSETS_TO_TRY 1 
+#define OFFSETS_TO_TRY 128 
         assert (N>OFFSETS_TO_TRY);
 
         if (p_tests & 0x2) {
diff --git a/projects/hip/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp b/projects/hip/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp
deleted file mode 100644
index 0a256d6362..0000000000
--- a/projects/hip/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
-IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp
- * RUN: %t
- * HIT_END
- */
-
-#include"test_common.h"
-
-#define SIZE 1024*1024*256
-
-int main(){
-    float *Ad, *B, *Bd, *Bm, *C, *Cd, *ptr_0;
-    B = (float*)malloc(SIZE);
-    hipMalloc((void**)&Ad, SIZE);
-    hipHostMalloc((void**)&B, SIZE);
-    hipHostMalloc((void**)&Bd, SIZE, hipHostMallocDefault);
-    hipHostMalloc((void**)&Bm, SIZE, hipHostMallocMapped);
-    hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped);
-
-    hipHostGetDevicePointer((void**)&Cd, C, 0/*flags*/);
-
-    HIPCHECK_API(hipMalloc((void**)&ptr_0,0), hipSuccess);
-
-    HIPCHECK_API(hipFree(Ad) ,  hipSuccess);
-    HIPCHECK_API(hipHostFree(Ad) , hipErrorInvalidValue);
-
-    HIPCHECK_API(hipFree(B)  , hipErrorInvalidDevicePointer); // try to hipFree on malloced memory
-    HIPCHECK_API(hipFree(Bd) , hipErrorInvalidDevicePointer);
-    HIPCHECK_API(hipFree(Bm) , hipErrorInvalidDevicePointer);
-    HIPCHECK_API(hipFree(ptr_0) , hipSuccess);
-    HIPCHECK_API(hipHostFree(Bd) , hipSuccess);
-    HIPCHECK_API(hipHostFree(Bm) , hipSuccess);
-
-    HIPCHECK_API(hipFree(C) , hipErrorInvalidDevicePointer);
-    HIPCHECK_API(hipHostFree(C) , hipSuccess);
-
-
-    HIPCHECK_API(hipFree(NULL) , hipSuccess);
-    HIPCHECK_API(hipHostFree(NULL) , hipSuccess);
-
-    {
-        // Some negative testing - request a too-big allocation and verify it fails:
-        // Someday when we support virtual memory may need to refactor these:
-        size_t tooBig = 128LL*1024*1024*1024*1024;  // 128 TB;
-        void *p;
-        HIPCHECK_API ( hipMalloc(&p, tooBig),  hipErrorMemoryAllocation );
-        HIPCHECK_API ( hipHostMalloc(&p, tooBig),  hipErrorMemoryAllocation );
-    }
-
-    passed();
-}
diff --git a/projects/hip/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp b/projects/hip/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp
new file mode 100644
index 0000000000..459c0054c9
--- /dev/null
+++ b/projects/hip/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp
@@ -0,0 +1,170 @@
+/*
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+// Simple test for memset.
+// Also serves as a template for other tests.
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
+ * RUN: %t 
+ * HIT_END
+ */
+
+#include "hip/hip_runtime.h"
+#include "test_common.h"
+
+#ifdef __HIP_PLATFORM_HCC__
+#include <hc_am.hpp>
+#endif
+
+#define USE_HSA_COPY 1
+
+int enablePeers(int dev0, int dev1)
+{
+    int canAccessPeer01, canAccessPeer10;
+    HIPCHECK(hipDeviceCanAccessPeer(&canAccessPeer01, dev0, dev1));
+    HIPCHECK(hipDeviceCanAccessPeer(&canAccessPeer10, dev1, dev0));
+    if (!canAccessPeer01 || !canAccessPeer10) {
+        return -1;
+    }
+
+    HIPCHECK(hipSetDevice(dev0));
+    HIPCHECK(hipDeviceEnablePeerAccess(dev1, 0/*flags*/));
+    HIPCHECK(hipSetDevice(dev1));
+    HIPCHECK(hipDeviceEnablePeerAccess(dev0, 0/*flags*/));
+
+    return 0;
+};
+
+
+__global__ void
+memsetIntKernel(int * ptr, int val, size_t numElements)
+{
+    int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    if (gid < numElements) {
+        ptr[gid] = val;
+    }
+};
+
+
+void checkReverse(const int *ptr, int numElements, int expected) {
+    for (int i=numElements-1; i>=0; i--) {
+        if (ptr[i] != expected) {
+            printf ("i=%d, ptr[](%d) != expected (%d)\n", i, ptr[i], expected);
+            assert (ptr[i] == expected);
+        }
+    }
+
+    printf ("test:   OK\n");
+}
+
+
+void runTest(bool stepAIsCopy, hipStream_t gpu0Stream, hipStream_t gpu1Stream, int numElements,
+             int * dataGpu0, int *dataGpu1, int *dataHost, int expected)
+{
+    hipEvent_t e;
+    HIPCHECK(hipEventCreateWithFlags(&e,0));
+
+    printf ("test: runTest with %s\n", stepAIsCopy ? "copy" : "kernel");
+    const size_t sizeElements = numElements * sizeof(int);
+
+    hipStream_t stepAStream = gpu0Stream;
+
+    if (stepAIsCopy) {
+#ifdef USE_HSA_COPY
+        HIPCHECK(hipMemcpyAsync(dataGpu1, dataGpu0, sizeElements, hipMemcpyDeviceToDevice, stepAStream));
+#endif
+    } else {
+        assert(0); // not yet supported.
+    }
+
+    HIPCHECK(hipEventRecord(e, stepAStream));
+    HIPCHECK(hipStreamWaitEvent(gpu1Stream, e, 0));
+
+    HIPCHECK(hipMemcpyAsync(dataHost, dataGpu1, sizeElements, hipMemcpyDeviceToHost, gpu1Stream));
+
+    HIPCHECK(hipStreamSynchronize(gpu1Stream));
+
+    checkReverse(dataHost, numElements, expected);
+}
+
+
+void testMultiGpu0(int dev0, int dev1, int numElements)
+{
+    const size_t sizeElements = numElements * sizeof(int);
+
+    int * dataGpu0, *dataGpu1, *dataHost;
+    hipStream_t gpu0Stream, gpu1Stream;
+    const int expected = 42;
+    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
+
+    HIPCHECK(hipSetDevice(dev0));
+
+    HIPCHECK(hipMalloc(&dataGpu0, sizeElements));
+    HIPCHECK(hipStreamCreate(&gpu0Stream));
+    hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, gpu0Stream,
+                       dataGpu0, expected, numElements);
+    HIPCHECK(hipDeviceSynchronize());
+
+
+    HIPCHECK(hipSetDevice(dev1));
+    HIPCHECK(hipMalloc(&dataGpu1, sizeElements));
+    HIPCHECK(hipStreamCreate(&gpu1Stream));
+    hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, gpu0Stream,
+                       dataGpu1, 0x34, numElements);
+    HIPCHECK(hipDeviceSynchronize());
+
+    HIPCHECK(hipHostMalloc(&dataHost, sizeElements));
+    memset(dataHost, 13, sizeElements);
+
+#ifdef __HIP_PLATFORM_HCC__
+    hc::am_memtracker_print(0x0);
+#endif
+    
+    printf ("  test: init complete\n");
+
+    runTest(true/*stepAIsCopy*/, gpu0Stream, gpu1Stream, numElements, dataGpu0, dataGpu1, dataHost, expected);
+
+};
+
+
+
+int main(int argc, char *argv[])
+{
+    HipTest::parseStandardArguments(argc, argv, true);
+
+    int numElements = N;
+
+    int dev0 = 0;
+    int dev1 = 1;
+
+    // TODO - only works on multi-GPU system:
+    if (enablePeers(dev0,dev1) == -1) {
+        printf ("warning : could not find peer gpus\n");
+        return -1;
+    };
+
+    //testMultiGpu0(dev0, dev1, numElements);
+
+
+
+    passed();
+};
diff --git a/projects/hip/tests/src/runtimeApi/module/hipFuncSetCacheConfig.cpp b/projects/hip/tests/src/runtimeApi/module/hipFuncSetCacheConfig.cpp
new file mode 100644
index 0000000000..e3c3efad3d
--- /dev/null
+++ b/projects/hip/tests/src/runtimeApi/module/hipFuncSetCacheConfig.cpp
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * RUN: %t
+ * HIT_END
+ */
+
+#include<hip/hip_runtime_api.h>
+#include<iostream>
+#include"test_common.h"
+
+int main(){
+  hipFuncCache_t cacheConfig;
+  void *func;
+  hipFuncSetCacheConfig(func, cacheConfig);
+  passed();
+}
+
diff --git a/projects/hip/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp b/projects/hip/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp
index 229ceea440..4f73b67ad7 100644
--- a/projects/hip/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp
+++ b/projects/hip/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp
@@ -29,6 +29,8 @@ THE SOFTWARE.
 #include "hip/hip_runtime.h"
 #include "test_common.h"
 
+int p_iters=10;
+
 void printSep()
 {
     printf ("======================================================================================\n");
@@ -43,7 +45,7 @@ template<
 	class P=HipTest::Unpinned, 
 	class C=HipTest::Memcpy
 >
-void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream)
+void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream)
 {
 	using HipTest::MemTraits;
 
@@ -57,6 +59,24 @@ void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream)
     T *A_h, *B_h, *C_h;
 
     HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, P::isPinned);
+	for (size_t i=0; i<numElements; i++) {
+		A_h[i] = 1000.0f;
+		B_h[i] = 2000.0f;
+		C_h[i] = -1;
+	}
+
+
+	MemTraits<C>::Copy(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream);
+	MemTraits<C>::Copy(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream);
+	MemTraits<C>::Copy(C_d, C_h, Nbytes, hipMemcpyHostToDevice, stream);
+    HIPCHECK (hipDeviceSynchronize());
+
+	for (size_t i=0; i<numElements; i++) {
+		A_h[i] = 1.0f;
+		B_h[i] = 2.0f;
+		C_h[i] = -1;
+	}
+
 
 
 	for (int i=0; i<iters; i++) {
@@ -66,7 +86,11 @@ void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream)
 		MemTraits<C>::Copy(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream);
 		MemTraits<C>::Copy(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream);
 
-		hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
+		//HIPCHECK(hipStreamSynchronize(stream));
+
+		// This is the null stream?
+		//hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
+		hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
 
 		MemTraits<C>::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream);
 
@@ -76,9 +100,9 @@ void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream)
 	}
 
     HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, P::isPinned);
+	std::cout <<"  pid" << pid << " success\n";
     HIPCHECK (hipDeviceSynchronize());
 
-	std::cout <<"  pid" << pid << " success\n";
 }
 
 template<typename T, class C>
@@ -88,12 +112,14 @@ void test_multiThread_1(std::string testName, hipStream_t stream0, hipStream_t s
 	printf ("%s\n", __func__);
 	std::cout << testName << std::endl;
 
+	size_t numElements = N;
+
 	// Test 2 threads operating on same stream:
-    std::thread t1 (simpleVectorCopy<T, HipTest::Pinned, C>, 2000000/*mb*/, 100/*iters*/, stream0);
+    std::thread t1 (simpleVectorAdd<T, HipTest::Pinned, C>, numElements, p_iters/*iters*/, stream0);
     if (serialize) {
         t1.join();
     }
-    std::thread t2 (simpleVectorCopy<T, HipTest::Pinned, C>, 2000000/*mb*/, 100/*iters*/, stream1);
+    std::thread t2 (simpleVectorAdd<T, HipTest::Pinned, C>, numElements, p_iters/*iters*/, stream1);
     if (serialize) {
         t2.join();
     }
@@ -109,6 +135,7 @@ void test_multiThread_1(std::string testName, hipStream_t stream0, hipStream_t s
 
 int main(int argc, char *argv[])
 {
+	N = 8000000;
     HipTest::parseStandardArguments(argc, argv, true);
 
     printf ("info: set device to %d\n", p_gpuDevice);
@@ -121,8 +148,8 @@ int main(int argc, char *argv[])
         hipStream_t stream;
         HIPCHECK (hipStreamCreate(&stream));
 
-        simpleVectorCopy<float, HipTest::Pinned, HipTest::MemcpyAsync>	(2000000/*mb*/, 10/*iters*/, stream);
-        simpleVectorCopy<float, HipTest::Pinned, HipTest::Memcpy>		(2000000/*mb*/, 10/*iters*/, stream);
+        simpleVectorAdd<float, HipTest::Pinned, HipTest::MemcpyAsync>	(N/*mb*/, 10/*iters*/, stream);
+        simpleVectorAdd<float, HipTest::Pinned, HipTest::Memcpy>		(N/*mb*/, 10/*iters*/, stream);
 
         HIPCHECK(hipStreamDestroy(stream));
     }
@@ -139,8 +166,8 @@ int main(int argc, char *argv[])
     }
 
     if (p_tests & 0x4) {
-		test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with NULL stream", NULL, NULL, false);
-		test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with two streams", stream0, stream1, false);
+		//test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with NULL stream", NULL, NULL, false);
+		//test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with two streams", stream0, stream1, false);
 		test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with one stream",  stream0, stream0, false);
 	}
 
diff --git a/projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp b/projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp
new file mode 100644
index 0000000000..b610315608
--- /dev/null
+++ b/projects/hip/tests/src/runtimeApi/stream/hipNullStream.cpp
@@ -0,0 +1,300 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
+ * RUN: %t
+ * HIT_END
+ */
+
+
+#include "hip/hip_runtime.h"
+#include "test_common.h"
+#include <vector>
+unsigned p_streams =16;
+int      p_repeat = 10;
+int      p_db     = 0;
+
+
+template <typename T>
+__global__ void
+vectorADDRepeat(hipLaunchParm lp,
+            const T *A_d,
+            const T *B_d,
+            T *C_d,
+            size_t NELEM,
+            int repeat)
+{
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
+
+    for (int j=1; j<=repeat;j++) {
+        for (size_t i=offset; i<NELEM; i+=stride) {
+           C_d[i] = A_d[i]*j + B_d[i]*j;
+        }
+    };
+}
+
+
+
+
+//------
+// Structure for one stream - includes the stream + data buffers that are used by the stream.
+template <typename T>
+class Streamer {
+public:
+    Streamer(size_t numElements, bool useNullStream=false);
+    ~Streamer();
+    void enqueAsync();
+    void queryUntilComplete();
+
+    void reset();
+    void H2D();
+    void D2H();
+
+
+public:
+    T *_A_h;
+    T *_B_h;
+    T *_C_h;
+
+    T *_A_d;
+    T *_B_d;
+    T *_C_d;
+
+    hipStream_t _stream;
+    hipEvent_t  _event;
+
+    size_t      _numElements;
+};
+
+template <typename T>
+Streamer<T>::Streamer(size_t numElements, bool useNullStream) :
+    _numElements(numElements)
+{
+    HipTest::initArrays (&_A_d, &_B_d, &_C_d, &_A_h, &_B_h, &_C_h, numElements, true);
+
+    if (useNullStream) {
+        _stream = 0x0;
+    } else {
+        HIPCHECK(hipStreamCreate(&_stream));
+    }
+    HIPCHECK(hipEventCreate(&_event));
+
+    H2D();
+
+};
+
+template <typename T>
+void Streamer<T>::H2D()
+{
+    HIPCHECK(hipMemcpy(_A_d, _A_h, _numElements*sizeof(T), hipMemcpyHostToDevice));
+    HIPCHECK(hipMemcpy(_B_d, _B_h, _numElements*sizeof(T), hipMemcpyHostToDevice));
+}
+
+template <typename T>
+void Streamer<T>::D2H()
+{
+    HIPCHECK(hipMemcpy(_C_h, _C_d, _numElements*sizeof(T), hipMemcpyDeviceToHost));
+}
+
+template <typename T>
+void Streamer<T>::reset()
+{
+    HipTest::setDefaultData(_numElements, _A_h, _B_h, _C_h);
+    H2D();
+    
+}
+
+
+template <typename T>
+void Streamer<T>::enqueAsync()
+{
+    printf ("testing: %s  numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0);
+    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
+    hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements, p_repeat);
+
+}
+
+template <typename T>
+void Streamer<T>::queryUntilComplete()
+{
+    int numQueries = 0;
+    hipError_t e = hipSuccess;
+    do {
+        numQueries++;
+        e = hipStreamQuery(_stream);
+    } while (e != hipSuccess) ;
+
+    printf ("completed after %d queries\n", numQueries);
+};
+
+
+
+//---
+//Parse arguments specific to this test.
+void parseMyArguments(int argc, char *argv[])
+{
+    int more_argc = HipTest::parseStandardArguments(argc, argv, false);
+
+    // parse args for this test:
+    for (int i = 1; i < more_argc; i++) {
+        const char *arg = argv[i];
+
+        if (!strcmp(arg, "--streams")) {
+            if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) {
+               failed("Bad streams argument");
+            }
+        } else if (!strcmp(arg, "--repeat") || (!strcmp(arg, "-r"))) {
+            if (++i >= argc || !HipTest::parseInt(argv[i], &p_repeat)) {
+               failed("Bad repeat argument");
+            }
+        } else {
+            failed("Bad argument '%s'", arg);
+        }
+    };
+};
+
+
+void
+printBuffer(std::string name, int *f, size_t numElements)
+{
+    std::cout << name << "\n";
+    for (size_t i=0; i<numElements; i++) {
+        printf ("%5zu: %d\n", i, f[i]);
+    }
+}
+
+
+
+
+//---
+int main(int argc, char *argv[])
+{
+    HipTest::parseStandardArguments(argc, argv, false);
+    parseMyArguments(argc, argv);
+
+    typedef Streamer<int> IntStreamer;
+
+    std::vector<IntStreamer *> streamers;
+
+    size_t numElements = N;
+
+    int *expected_H = (int*)malloc(numElements*sizeof(int));
+
+
+    auto nullStreamer = new IntStreamer(numElements, true);
+
+    // Expected resultr - last streamer runs vectorADDRepeat, then nullstreamer adds lastStreamer->_C_d + lastStreamer->_C_d
+    for (size_t i=0; i<numElements; i++) {
+        expected_H[i] = ((nullStreamer->_A_h[i])*p_repeat + (nullStreamer->_B_h[i]) * p_repeat) *2;
+    }
+
+
+    for (int i=0; i<p_streams; i++) {
+        IntStreamer * s = new IntStreamer(numElements);
+        streamers.push_back(s);
+    }
+    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
+
+    for (int s=1; s<p_streams; s++) {
+        if (p_tests & (1<<s)) {
+            printf ("==> Test %x runAsnc, #streams=%d\n", (1<<s), s);
+            nullStreamer->reset();
+
+            for (int i=0; i<s; i++) {
+                streamers[i]->enqueAsync();
+            }
+
+            auto lastStreamer = streamers[s - 1];
+
+            // Dispatch to NULL stream, should wait for prior async activity to complete before beginning:
+            hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/);
+
+
+            if (p_db) {
+                HIPCHECK(hipDeviceSynchronize());
+                lastStreamer->D2H();
+                printBuffer("lastStream _A_h", lastStreamer->_A_h, min(numElements, size_t(20)));
+                printBuffer("lastStream _B_h", lastStreamer->_B_h, min(numElements, size_t(20)));
+                printBuffer("lastStream _C_h", lastStreamer->_C_h, min(numElements, size_t(20)));
+            }
+            nullStreamer->D2H();
+            HIPCHECK(hipDeviceSynchronize());
+
+            HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); 
+        }
+    }
+
+
+    for (int s=1; s<p_streams; s+=2) {
+        unsigned tmask = (0x10000 | (1<<s));
+        if (p_tests & tmask) {
+            nullStreamer->reset();
+            printf ("==> Test %x runAsnc-odd-only, #streams=%d\n", tmask, s);
+            for (int i=0; i<s; i++) {
+                // RUn just odd streams so we have some empty ones to examine/optimize:
+                if (i & 0x1) {
+                    streamers[i]->enqueAsync();
+                }
+            }
+            auto lastStreamer = streamers[s - 1];
+
+            // Dispatch to NULL stream, should wait for prior async activity to complete before beginning:
+            hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/);
+
+            nullStreamer->D2H();
+
+            HIPCHECK(hipDeviceSynchronize());
+
+            HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); 
+        }
+    }
+
+    // Expected resultr - last streamer runs vectorADDRepeat
+    for (size_t i=0; i<numElements; i++) {
+        expected_H[i] = ((nullStreamer->_A_h[i])*p_repeat + (nullStreamer->_B_h[i]) * p_repeat);
+    }
+
+    if (p_tests & 0x20000) {
+
+        assert (p_streams >=2); // need a couple streams in order to run this test.
+        nullStreamer->reset();
+        printf ("\n==> Test hipStreamSynchronize with defaultStream \n");
+
+        // Enqueue a long-running job to stream1
+        streamers[0]->enqueAsync();
+
+        // Check to see if synchronizing on a null stream synchronizes all other streams or just the null stream.
+        // This function follows null stream semantics and will wait for all other blocking streams before returning.
+        // This will wait on the host
+        HIPCHECK(hipStreamSynchronize(0));
+
+        // Copy with stream1, this could go async if the streamSync doesn't synchronize ALL the streams.
+        HIPCHECK(hipMemcpyAsync(streamers[0]->_C_h, streamers[0]->_C_d, streamers[0]->_numElements*sizeof(int), hipMemcpyDeviceToHost, streamers[1]->_stream));
+
+        
+        HIPCHECK(hipDeviceSynchronize());
+
+        HipTest::checkTest(expected_H, streamers[0]->_C_h, numElements); 
+    }
+
+
+    passed();
+}
diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamSync2.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamSync2.cpp
new file mode 100644
index 0000000000..c6a58ce7d4
--- /dev/null
+++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamSync2.cpp
@@ -0,0 +1,207 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * RUN: %t
+ * HIT_END
+ */
+
+
+#include "test_common.h"
+
+enum SyncMode {
+    syncNone,
+    syncNullStream,
+    syncOtherStream,
+    syncMarkerThenOtherStream,
+    syncMarkerThenOtherNonBlockingStream,
+    syncDevice
+};
+
+
+const char *syncModeString(int syncMode) {
+    switch (syncMode) {
+        case syncNone:
+            return "syncNone";
+        case syncNullStream:
+            return "syncNullStream";
+        case syncOtherStream:
+            return "syncOtherStream";
+        case syncMarkerThenOtherStream:
+            return "syncMarkerThenOtherStream";
+        case syncMarkerThenOtherNonBlockingStream:
+            return "syncMarkerThenOtherNonBlockingStream";
+        case syncDevice:
+            return "syncDevice";
+        default:
+            return "unknown";
+    };
+};
+
+
+void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch)
+{
+
+    // This test sends a long-running kernel to the null stream, then tests to see if the 
+    // specified synchronization technique is effective.
+    //
+    // Some syncMode are not expected to correctly sync (for example "syncNone").  in these 
+    // cases the test sets expectMismatch and the check logic below will attempt to ensure that
+    // the undesired synchronization did not occur - ie ensure the kernel is still running and did
+    // not yet update the stop event.  This can be tricky since if the kernel runs fast enough it
+    // may complete before the check.  To prevent this, the addCountReverse has a count parameter 
+    // which causes it to loop repeatedly, and the results are checked in reverse order.  
+    //
+    // Tests with expectMismatch=true should ensure the kernel finishes correctly. This results
+    // are checked and we test to make sure stop event has completed.
+    
+    if (!(testMask & p_tests)) {
+        return;
+    }
+    printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", 
+            testMask, syncModeString(syncMode), expectMismatch);
+
+    size_t sizeBytes = numElements * sizeof(int);
+
+    int count =100;
+    int init0 = 0;
+    HIPCHECK(hipMemset(C_d, init0, sizeBytes));
+    for (int i=0; i<numElements; i++) {
+        C_h[i] = -1; // initialize
+    }
+
+    hipStream_t otherStream = 0;
+    unsigned flags = (syncMode == syncMarkerThenOtherNonBlockingStream) ?  hipStreamNonBlocking : hipStreamDefault;
+    HIPCHECK(hipStreamCreateWithFlags(&otherStream, flags));
+    hipEvent_t stop, otherStreamEvent;
+    HIPCHECK(hipEventCreate(&stop));
+    HIPCHECK(hipEventCreate(&otherStreamEvent));
+
+
+    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
+    // Launch kernel into null stream, should result in C_h == count.
+    hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/,    C_d, C_h, numElements, count);
+    HIPCHECK(hipEventRecord(stop, 0/*default*/));  
+
+    switch (syncMode) {
+        case syncNone:
+            break;
+        case syncNullStream:
+            HIPCHECK(hipStreamSynchronize(0));  // wait on host for null stream:
+            break;
+        case syncOtherStream:
+            // Does this synchronize with the null stream?
+            HIPCHECK(hipStreamSynchronize(otherStream));  
+            break;
+        case syncMarkerThenOtherStream:
+        case syncMarkerThenOtherNonBlockingStream:
+            
+            // this may wait for NULL stream depending hipStreamNonBlocking flag above
+            HIPCHECK(hipEventRecord(otherStreamEvent, otherStream));  
+
+            HIPCHECK(hipStreamSynchronize(otherStream));  
+            break;
+        case syncDevice:
+            HIPCHECK(hipDeviceSynchronize());  
+            break;
+        default:
+            assert(0);
+    };
+
+    hipError_t done = hipEventQuery(stop);
+
+    if (expectMismatch) {
+        assert (done == hipErrorNotReady);
+    } else {
+        assert (done == hipSuccess);
+    }
+
+    int mismatches = 0;
+    int expected = init0 + count;
+    for (int i=0; i<numElements; i++) {
+        bool compareEqual = (C_h[i] == expected);
+        if (!compareEqual) {
+            mismatches ++;
+            if  (!expectMismatch) {
+                printf ("C_h[%d] (%d) != %d\n", i, C_h[i], expected);
+                assert(C_h[i] == expected);
+            }
+        }
+    }
+
+    if (expectMismatch) {
+        assert (mismatches > 0);
+    }
+
+
+    HIPCHECK(hipStreamDestroy(otherStream));
+    HIPCHECK(hipEventDestroy(stop));
+    HIPCHECK(hipEventDestroy(otherStreamEvent));
+
+    HIPCHECK(hipDeviceSynchronize());
+
+    printf ("test:   OK - %d mismatches (%6.2f%%)\n",  mismatches, ((double)(mismatches)*100.0)/numElements);
+}
+
+
+void runTests(int64_t numElements)
+{
+    size_t sizeBytes = numElements * sizeof(int);
+
+    printf ("\n\ntest: starting sequence with sizeBytes=%zu bytes, %6.2f MB\n", sizeBytes, sizeBytes/1024.0/1024.0);
+
+
+    int *C_h, *C_d;
+    HIPCHECK(hipMalloc(&C_d, sizeBytes));
+    HIPCHECK(hipHostMalloc(&C_h, sizeBytes));
+
+
+    {
+        test (0x01, C_d, C_h, numElements,  syncNone,                             true /*expectMismatch*/);
+        test (0x02, C_d, C_h, numElements,  syncNullStream,                       false /*expectMismatch*/);
+        test (0x04, C_d, C_h, numElements,  syncOtherStream,                      true /*expectMismatch*/);
+        test (0x08, C_d, C_h, numElements,  syncDevice,                           false /*expectMismatch*/);
+
+        // Sending a marker to to null stream may synchronize the otherStream
+        //  - other created with hipStreamNonBlocking=0 : synchronization, should match
+        //  - other created with hipStreamNonBlocking=1 : no synchronization, may mismatch
+        test (0x10, C_d, C_h, numElements,  syncMarkerThenOtherStream,            false /*expectMismatch*/);
+
+        // TODO - review why this test seems flaky
+        //test (0x20, C_d, C_h, numElements,  syncMarkerThenOtherNonBlockingStream, true /*expectMismatch*/);
+    }
+
+
+    HIPCHECK(hipFree(C_d));
+    HIPCHECK(hipHostFree(C_h));
+}
+
+
+int main(int argc, char *argv[])
+{
+    // Can' destroy the default stream:// TODO - move to another test
+    HIPCHECK_API(hipStreamDestroy(0), hipErrorInvalidResourceHandle); 
+
+    HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/);
+
+    runTests(40000000);
+
+    passed();
+}
diff --git a/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp
index 637275c381..9bbd43828c 100644
--- a/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp
+++ b/projects/hip/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp
@@ -18,7 +18,7 @@ THE SOFTWARE.
 */
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
  * RUN: %t
  * HIT_END
  */
@@ -28,60 +28,162 @@ THE SOFTWARE.
 #include "hip/hip_runtime.h"
 #include "test_common.h"
 #include <vector>
-unsigned p_streams = 6;
+#include <limits>
+unsigned p_streams = 8;
+unsigned p_db = 0;
+unsigned p_count = 100;
+
+
+
 
 
 //------
 // Structure for one stream;
 template <typename T>
 class Streamer {
+
+#define COMMAND_ADD_FORWARD 0
+#define COMMAND_ADD_REVERSE 1
+#define COMMAND_COPY        2
+
+
 public:
-    Streamer(size_t numElements);
+    Streamer(int deviceId, T *input, size_t numElements, int commandType);
     ~Streamer();
-    void runAsync();
+    void runAsyncAfter(Streamer<T> *depStreamer, bool waitSameStream=false);
+    void runAsyncWaitSameStream();
     void queryUntilComplete();
 
+    size_t check(int streamerNum, T initValue, T expectedOffset, bool expectPass=true);
+    void copyToHost(hipStream_t copyStream);
 
+    hipEvent_t event() { return _event; };
+
+    int deviceId() const { return _deviceId; };
+    size_t mismatchCount() const { return _mismatchCount; };
+    T *C_d() { return _C_d; };
+
+    // How much does this streamer add to A[i] after running runAsyncAfter
+    int expectedAdd() const { return (_commandType == COMMAND_COPY) ? 0 : p_count; };
+
+
+    int         _commandType;  // 0=addReverse, 1=addFwd, 2=move
 private:
-    T *_A_h;
-    T *_B_h;
+
     T *_C_h;
 
+    T *_preA_d; // if input is on another device, this is pointer to that memory.
     T *_A_d;
-    T *_B_d;
     T *_C_d;
 
     hipStream_t _stream;
     hipEvent_t  _event;
 
+    int         _deviceId;
     size_t      _numElements;
+
+    size_t      _mismatchCount;
 };
 
+
 template <typename T>
-Streamer<T>::Streamer(size_t numElements) :
-    _numElements(numElements)
+Streamer<T>::Streamer(int deviceId, T * A_d, size_t numElements, int commandType) :
+    _preA_d(NULL), 
+    _A_d(A_d),
+    _deviceId(deviceId),
+    _numElements(numElements),
+    _commandType(commandType)
 {
-    HipTest::initArrays (&_A_d, &_B_d, &_C_d, &_A_h, &_B_h, &_C_h, numElements, true);
+    size_t sizeElements = numElements * sizeof(int);
+
+    //if (commandType == 0) _commandType = 1; // TODO - remove me
+
+    HIPCHECK(hipSetDevice(_deviceId));
+
+
+    hipPointerAttribute_t attr;
+    HIPCHECK(hipPointerGetAttributes(&attr, A_d));
+    if (attr.device != deviceId) {
+        // source is on another device, we will need to copy later.
+        // So save original source pointer and allocate local space.
+        printf ("info: source for streamer on another device, will insert memcpy\n");
+        _preA_d = A_d;
+        HIPCHECK(hipMalloc(&_A_d, sizeElements));
+        HIPCHECK(hipMemset(_A_d, -3, sizeElements));
+    }
+
+    HIPCHECK(hipMalloc(&_C_d, sizeElements));
+    HIPCHECK(hipHostMalloc(&_C_h, sizeElements));
+
+    HIPCHECK(hipMemset(_C_d, -1, sizeElements));
+    HIPCHECK(hipMemset(_C_h, -2, sizeElements));
 
     HIPCHECK(hipStreamCreate(&_stream));
     HIPCHECK(hipEventCreate(&_event));
+
+
+
 };
 
-template <typename T>
-void Streamer<T>::runAsync()
-{
-    printf ("testing: %s  numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0);
-    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
-    hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements);
 
-    // Test case where hipStreamWaitEvent waits on same event we just placed into the queue.
-    HIPCHECK(hipEventRecord(_event, _stream));
-    HIPCHECK(hipStreamWaitEvent(_stream, _event, 0));
+template <typename T>
+Streamer<T>::~Streamer()
+{
+    HIPCHECK(hipSetDevice(_deviceId));
+
+    printf ("info: ~Streamer\n");
+    if (_preA_d) {
+        HIPCHECK(hipFree(_preA_d));
+    }
+    HIPCHECK(hipFree(_C_d));
+    HIPCHECK(hipHostFree(_C_h));
+
+    HIPCHECK(hipStreamDestroy(_stream));
+    HIPCHECK(hipEventDestroy(_event));
 }
 
+
+template <typename T>
+void Streamer<T>::runAsyncAfter(Streamer<T> *depStreamer, bool waitSameStream)
+{
+    HIPCHECK(hipSetDevice(_deviceId));
+    if (p_db) {
+      printf ("testing: %s  numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0);
+    }
+
+    if (depStreamer) {
+        HIPCHECK(hipStreamWaitEvent(_stream, depStreamer->event(), 0));
+    }
+
+    if (_preA_d) {
+        // _preA_d is on another device, so copy to local device so kernel can access it:
+        HIPCHECK(hipMemcpyAsync(_A_d, _preA_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream));
+    }
+
+
+    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
+    if (_commandType == COMMAND_ADD_REVERSE) {
+        hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream,    _A_d, _C_d, _numElements, p_count);
+    } else if (_commandType == COMMAND_ADD_FORWARD) {
+        hipLaunchKernelGGL(HipTest::addCount,         dim3(blocks), dim3(threadsPerBlock), 0, _stream,    _A_d, _C_d, _numElements, p_count);
+    } else if (_commandType == COMMAND_COPY) {
+        HIPCHECK(hipMemcpyAsync(_C_d, _A_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream));
+    } else {
+        assert(0); // bad command type
+    }
+    HIPCHECK(hipEventRecord(_event, _stream));
+
+    if (waitSameStream) {
+        HIPCHECK(hipStreamWaitEvent(_stream, _event, 0)); // this is essentially a no-op, but make sure it doesn't crash
+    }
+}
+
+
+
 template <typename T>
 void Streamer<T>::queryUntilComplete()
 {
+    HIPCHECK(hipSetDevice(_deviceId));
     int numQueries = 0;
     hipError_t e = hipSuccess;
     do {
@@ -89,15 +191,62 @@ void Streamer<T>::queryUntilComplete()
         e = hipStreamQuery(_stream);
     } while (e != hipSuccess) ;
 
-    printf ("completed after %d queries\n", numQueries);
+    printf ("info: hipStreamQuery completed after %d queries\n", numQueries);
 };
 
 
+// If copyStream is !nullptr it is used for the copy.
+template <typename T>
+void Streamer<T>::copyToHost(hipStream_t copyStream)
+{
+    if (p_db) {
+        printf ("db: copy back to host\n");
+    }
+    HIPCHECK(hipSetDevice(_deviceId));
+    HIPCHECK(hipMemcpyAsync(_C_h, _C_d, _numElements*sizeof(T), hipMemcpyDeviceToHost, copyStream ? copyStream : _stream));
+    HIPCHECK(hipStreamSynchronize(copyStream ? copyStream:_stream));
+
+}
+
+
+template <typename T>
+size_t Streamer<T>::check(int streamerNum, T initValue, T expectedOffset, bool expectPass)
+{
+    T expected = initValue + expectedOffset;
+    if (p_db) {
+        printf ("db: check\n");
+    }
+
+    _mismatchCount = 0;
+    for (size_t i=0; i<_numElements; i++) {
+        if (_C_h[i] != expected) {
+            _mismatchCount++;
+            if (expectPass) {
+                fprintf(stderr, "for streamer:%d  _C_h[%zu] (%d)  !=  expected(%d)\n", streamerNum, i, _C_h[i], expected);
+                if (_mismatchCount > 10) {
+                    failed("for streamer:%d  _C_h[%zu] (%d)  !=  expected(%d)\n", streamerNum, i, _C_h[i], expected);
+                }
+            }
+        }
+    }
+
+    if (!expectPass && (_mismatchCount ==0)) {
+        // the test should run kernels long enough that if we don't correctly wait for them to finish then an error is reported.
+        //failed("for streamer:%d  we expected inavalid synchronization to lead to mismatch but none was detected.  Increase --N to sensitize sync.\n", streamerNum);
+
+    }
+
+    return _mismatchCount;
+}
+
+   
 
 //---
 //Parse arguments specific to this test.
 void parseMyArguments(int argc, char *argv[])
 {
+    N = 64*1024*1024;
+
     int more_argc = HipTest::parseStandardArguments(argc, argv, false);
 
     // parse args for this test:
@@ -108,6 +257,14 @@ void parseMyArguments(int argc, char *argv[])
             if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) {
                failed("Bad streams argument");
             }
+        } else if (!strcmp(arg, "--count")) {
+            if (++i >= argc || !HipTest::parseUInt(argv[i], &p_count)) {
+               failed("Bad count argument");
+            }
+        } else if (!strcmp(arg, "--db")) {
+            if (++i >= argc || !HipTest::parseUInt(argv[i], &p_db)) {
+               failed("Bad db argument");
+            }
         } else {
             failed("Bad argument '%s'", arg);
         }
@@ -115,6 +272,95 @@ void parseMyArguments(int argc, char *argv[])
 };
 
 
+typedef Streamer<int> IntStreamer;
+
+
+
+
+void runStreamerLoop(std::vector<IntStreamer *> &streamers)
+{
+    for (int i=0; i<streamers.size(); i++) {
+        streamers[i]->runAsyncAfter(i ? streamers[i-1] : NULL);
+    }
+}
+
+
+void checkAll(int initValue, std::vector<IntStreamer *> &streamers, std::vector<hipStream_t> &sideStreams, bool expectPass=true)
+{
+    size_t mismatchCount=0;
+
+    // Copy in reverse order to catch anything not yet finished...
+    for (int i=streamers.size()-1; i>=0; i--) {
+        streamers[i]->copyToHost(sideStreams.empty() ? NULL : sideStreams[streamers[i]->deviceId()]);
+    }
+
+
+    int expected = 0;
+    // Check in forward order so we can find first mismatch:
+    for (int i=0; i<streamers.size(); i++) {
+
+        expected += streamers[i]->expectedAdd();
+        
+        mismatchCount += streamers[i]->check(i+1, initValue, expected, expectPass);
+
+    }
+    if (!expectPass && (mismatchCount==0)) {
+        // the test should run kernels long enough that if we don't correctly wait for them to finish then an error is reported.
+        failed("we expected inavalid synchronization to lead to mismatch but none was detected.  Increase --count to sensitize sync.\n");
+    }
+
+}
+
+
+
+#define RUN_SYNC_TEST(_enableBit, _streamers, _sync, _expectPass)\
+    if (p_tests & (_enableBit)) {\
+        printf ("==> Test %02x runAsyncAfter sync=%s\n", (_enableBit), #_sync);\
+        runStreamerLoop(_streamers);\
+        (_sync);\
+        checkAll (initValue, _streamers, sideStreams, _expectPass);\
+    }
+
+
+
+
+//---
+// A family of sync functions which somehow wait for inflight activity to finish:
+
+
+void sync_none(void) {};
+
+void sync_allDevices(int numDevices) 
+{
+    for (int d=0; d<numDevices; d++) {
+        HIPCHECK(hipSetDevice(d));
+        HIPCHECK(hipDeviceSynchronize());
+    }
+}
+
+
+void sync_queryAllUntilComplete(std::vector<IntStreamer *> streamers) 
+{
+    for (int i=streamers.size()-1; i>=0; i--) {
+        streamers[i]->queryUntilComplete();
+    };
+}
+
+
+void sync_streamWaitEvent(hipEvent_t lastEvent, int sideDeviceId, hipStream_t sideStream, bool waitHere) 
+{
+    HIPCHECK(hipSetDevice(sideDeviceId));
+
+    // wait on the last event in the stream of chained streamers:
+    // This plants a marker which the subsquent copy for this device will wait on:
+    HIPCHECK(hipStreamWaitEvent(sideStream, lastEvent, 0));
+
+    if (waitHere) {
+        HIPCHECK(hipStreamSynchronize(sideStream));
+    }
+}
+
+
 
 //---
 int main(int argc, char *argv[])
@@ -122,39 +368,115 @@ int main(int argc, char *argv[])
     HipTest::parseStandardArguments(argc, argv, false);
     parseMyArguments(argc, argv);
 
-    typedef Streamer<float> FloatStreamer;
 
-    std::vector<FloatStreamer *> streamers;
+
 
     size_t numElements = N;
+    size_t sizeElements = numElements * sizeof(int);
 
-    for (int i=0; i<p_streams; i++) {
-        FloatStreamer * s = new FloatStreamer(numElements);
-        streamers.push_back(s);
+    printf("info: sizeof arrays = %zu elements (%6.3f MB)\n", numElements, sizeElements / 1024.0/1024.0);
+    printf("info: streams=%d count=%d\n", p_streams, p_count);
+
+    assert (sizeElements <= std::numeric_limits<int64_t>::max());
+
+
+    int initValue = 1000;
+
+    int * initArray_d, *initArray_h;
+    HIPCHECK(hipMalloc(&initArray_d, sizeElements));
+    HIPCHECK(hipHostMalloc(&initArray_h, sizeElements));
+    for (size_t i=0; i<numElements; i++) {
+        initArray_h[i] = initValue;
     }
+    HIPCHECK(hipMemcpy(initArray_d, initArray_h, sizeElements, hipMemcpyHostToDevice));
+    
 
-    if (p_tests & 0x1) {
-        printf ("==> Test 0x1 runAsnc\n");
+    int numDevices;
+    HIPCHECK(hipGetDeviceCount(&numDevices));
+    numDevices = min(2, numDevices); // multi-GPU to 2 device.
+
+    std::vector<IntStreamer *> streamers;
+    std::vector<IntStreamer *> streamersDev0; // streamers for first device.
+
+    for (int d=0; d<numDevices/*TODO*/; d++) {
         for (int i=0; i<p_streams; i++) {
-            streamers[i]->runAsync();
+            int command = (i%2) ? COMMAND_ADD_FORWARD : COMMAND_ADD_REVERSE;
+            IntStreamer * s = new IntStreamer(d, i ? streamers.back()->C_d() : initArray_d, numElements, command);
+            streamers.push_back(s);
+            if (d==0) {
+                streamersDev0.push_back(s);
+            }
         }
-        HIPCHECK(hipDeviceSynchronize());
     }
 
-    if (p_tests & 0x2) {
-        printf ("==> Test 0x2 queryUntilComplete\n");
-        for (int i=0; i<p_streams; i++) {
-            streamers[i]->runAsync();
-            streamers[i]->queryUntilComplete();
-        }
-        HIPCHECK(hipDeviceSynchronize());
+
+
+
+
+    // A sideband stream channel that is independent from above.
+    // Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is 
+    // asynchronous wrt the other streams.
+    std::vector<hipStream_t> sideStreams;
+    for (int d=0; d<numDevices; d++) {
+        hipStream_t s;
+        HIPCHECK(hipStreamCreate(&s));
+        sideStreams.push_back(s);
     }
 
-    if (p_tests & 0x4) {
+
+    // Tests on first GPU:
+    //
+    // This test has no synchronization - make sure it mismatches so we can ensure the other tests properyl prevent the mismatch:
+    RUN_SYNC_TEST(0x01, streamersDev0, sync_none(), false);
+
+    RUN_SYNC_TEST(0x02, streamersDev0, sync_allDevices(numDevices),  true);
+    RUN_SYNC_TEST(0x04, streamersDev0, sync_queryAllUntilComplete(streamersDev0),  true);
+    RUN_SYNC_TEST(0x08, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false),  true);
+
+    if (numDevices > 1) {
+        // Sync on second device for activity running on device 0:
+        RUN_SYNC_TEST(0x10, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 1, sideStreams[1], true),  true);
+    }
+
+
+    // Tests on all GPUs:
+    // RUN_SYNC_TEST(0x100, streamers, sync_streamWaitEvent(streamers.back()->event(), 0, sideStreams[0], false),  true);
+
+
+
+
+    if (p_tests & 0x1000) {
+        printf ("==> Test 0x1000 try null stream\n"); 
         hipStreamQuery(0/* try null stream*/);
 
     }
 
 
+    // Insert small wrinkle here, insert a wait on event just recorded, all in the same stream.
+    if (p_tests & 0x2000) {
+        printf ("==> Test 0x2000 runAsyncWaitSameStream\n");
+        for (int i=0; i<streamersDev0.size(); i++) {
+            streamersDev0[i]->runAsyncAfter(i ? streamersDev0[i-1] : NULL, true/*waitSameStream*/);
+        }
+
+        sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false);
+        checkAll (initValue, streamersDev0, sideStreams);
+    }
+
+
+    // Change Adds to copies to stimulate different case with event followign copy:
+    for (auto &s : streamers) {
+        if (s->_commandType == COMMAND_ADD_FORWARD)
+            s->_commandType = COMMAND_COPY;
+    }
+
+
+    {
+        printf ("test: alternating memcpy/count-reverse followed by event\n");
+        RUN_SYNC_TEST(0x4000, streamersDev0, sync_queryAllUntilComplete(streamersDev0),  true);
+        RUN_SYNC_TEST(0x8000, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false),  true);
+    }
+
+
     passed();
 }
diff --git a/projects/hip/tests/src/test_common.h b/projects/hip/tests/src/test_common.h
index 1250de4801..f585fb8bca 100644
--- a/projects/hip/tests/src/test_common.h
+++ b/projects/hip/tests/src/test_common.h
@@ -146,6 +146,90 @@ vectorADD(hipLaunchParm lp,
 }
 
 
+template <typename T>
+__global__ void
+vectorADDReverse(hipLaunchParm lp,
+            const T *A_d,
+            const T *B_d,
+            T *C_d,
+            size_t NELEM)
+{
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
+
+    for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
+        C_d[i] = A_d[i] + B_d[i];
+	}
+}
+
+
+template <typename T>
+__global__ void
+addCount( const T *A_d,
+        T *C_d,
+        size_t NELEM,
+        int count)
+{
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
+
+    // Deliberately do this in an inefficient way to increase kernel runtime
+    for (int i=0; i<count; i++) {
+        for (size_t i=offset; i<NELEM; i+=stride) {
+            C_d[i] = A_d[i] + (T)count;
+        }
+    }
+}
+
+
+template <typename T>
+__global__ void
+addCountReverse( const T *A_d,
+        T *C_d,
+        int64_t NELEM,
+        int count)
+{
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
+
+    // Deliberately do this in an inefficient way to increase kernel runtime
+    for (int i=0; i<count; i++) {
+        for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
+            C_d[i] = A_d[i] + (T)count;
+        }
+    }
+}
+
+
+template <typename T>
+__global__ void
+memsetReverse( T *C_d,  T val,
+        int64_t NELEM)
+{
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
+
+    for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
+        C_d[i] = val;
+    }
+}
+
+
+template <typename T>
+void setDefaultData(size_t numElements, T *A_h, T* B_h, T *C_h)
+{
+    // Initialize the host data:
+    for (size_t i=0; i<numElements; i++) {
+        if (A_h) 
+            (A_h)[i] = 3.146f + i; // Pi
+        if (B_h) 
+            (B_h)[i] = 1.618f + i; // Phi
+        if (C_h) 
+            (C_h)[i] = 0.0f + i;  
+    }
+}
+
+
 template <typename T>
 void initArraysForHost(T **A_h, T **B_h, T **C_h,
                 size_t N, bool usePinnedHost=false)
@@ -179,15 +263,10 @@ void initArraysForHost(T **A_h, T **B_h, T **C_h,
         }
     }
 
-    // Initialize the host data:
-    for (size_t i=0; i<N; i++) {
-        if (A_h) 
-            (*A_h)[i] = 3.146f + i; // Pi
-        if (B_h) 
-            (*B_h)[i] = 1.618f + i; // Phi
-    }
+    setDefaultData(N, A_h ? *A_h : NULL, B_h ? *B_h : NULL, C_h ? *C_h : NULL);
 }
 
+
 template <typename T>
 void initArrays(T **A_d, T **B_d, T **C_d,
                 T **A_h, T **B_h, T **C_h, 
@@ -295,7 +374,7 @@ inline void initHIPArrays(hipArray **A_d, hipArray **B_d, hipArray **C_d,
 // Assumes C_h contains vector add of A_h + B_h
 // Calls the test "failed" macro if a mismatch is detected.
 template <typename T>
-void checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true)
+size_t checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true, bool reportMismatch=true)
 {
     size_t  mismatchCount = 0;
     size_t  firstMismatch = 0;
@@ -316,9 +395,50 @@ void checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true
         }
     }
 
+	if (reportMismatch) {
+        if (expectMatch) {
+            if (mismatchCount) {
+                failed("%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch);
+            }
+        } else {
+            if (mismatchCount == 0) {
+                failed("expected mismatches but did not detect any!");
+            }
+        }
+	}
+
+	return mismatchCount;
+
+}
+
+
+// Assumes C_h contains vector add of A_h + B_h
+// Calls the test "failed" macro if a mismatch is detected.
+template <typename T>
+void checkTest(T* expected_H, T* result_H, size_t N, bool expectMatch=true)
+{
+    size_t  mismatchCount = 0;
+    size_t  firstMismatch = 0;
+    size_t  mismatchesToPrint = 10;
+    for (size_t i=0; i<N; i++) {
+        if (result_H[i] != expected_H[i]) {
+            if (mismatchCount == 0) {
+                firstMismatch = i;
+            }
+            mismatchCount++;
+            if ((mismatchCount <= mismatchesToPrint) && expectMatch) {
+                std::cout << std::fixed << std::setprecision(32);
+                std::cout << "At " << i << std::endl;
+                std::cout << "  Computed:" << result_H[i]  << std::endl;
+                std::cout << "  Expected:" << expected_H[i] << std::endl;
+            }
+        }
+    }
+
     if (expectMatch) {
         if (mismatchCount) {
-            failed("%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch);
+            fprintf(stderr, "%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch);
+            //failed("%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch);
         }
     } else {
         if (mismatchCount == 0) {
diff --git a/projects/hip/util/vim/hip.vim b/projects/hip/util/vim/hip.vim
index b73066a786..f28761ad0b 100644
--- a/projects/hip/util/vim/hip.vim
+++ b/projects/hip/util/vim/hip.vim
@@ -185,6 +185,8 @@ syn keyword hipFlags hipHostMallocDefault
 syn keyword hipFlags hipHostMallocPortable
 syn keyword hipFlags hipHostMallocMapped
 syn keyword hipFlags hipHostMallocWriteCombined
+syn keyword hipFlags hipHostMallocCoherent
+syn keyword hipFlags hipHostMallocNonCoherent
 
 syn keyword hipFlags hipHostRegisterDefault
 syn keyword hipFlags hipHostRegisterPortable