Merge branch 'amd-develop' into amd-master

Change-Id: I3802a78577a3979172c0e5fbf39c3db2a044824d [ROCm/hip commit: 75f8840ca9]
2017-06-06 15:35:09 +05:30
@@ -32,6 +32,8 @@ HIP releases are typically of two types. The tag naming convention is different
 - [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP)
 - [HIP Porting Guide](docs/markdown/hip_porting_guide.md)
 - [HIP Porting Driver Guide](docs/markdown/hip_porting_driver_api.md)
+- [HIP Profiling ](docs/markdown/hip_profiling.md)
+- [HIP Debugging](docs/markdown/hip_debugging.md)
 - [HIP Terminology](docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL)
 - [hipify-clang](hipify-clang/README.md)
 - [Developer/CONTRIBUTING Info](CONTRIBUTING.md)
@@ -2,4 +2,4 @@

 SEARCH_DIRS=$@

-find $SEARCH_DIRS -name '*.cpp' -o -name '*.h' -o -name '*.cu' -o -name '*.cuh' -o -name '*.c' -o -name '*.hpp'
+find $SEARCH_DIRS -name '*.cpp' -o -name '*.h' -o -name '*.cu' -o -name '*.cuh' -o -name '*.c' -o -name '*.hpp' -o -name '*.inl'
@@ -1,18 +1,18 @@
 #!/bin/bash

-#usage : hipconvertinplace.sh [DIRNAME] [HIPIFY_OPTIONS]
+#usage : hipconvertinplace-perl.sh DIRNAME [hipify-perl options]

-#hipify "inplace" all code files in specified directory.    
+#hipify "inplace" all code files in specified directory.
 # This can be quite handy when dealing with an existing CUDA code base since the script
 # preserves the existing directory structure.

 #  For each code file, this script will:
-#   - If ".prehip file does not exist, copy the original code to a new file with extension ".prehip".  Then Hipify the code file.
+#   - If ".prehip file does not exist, copy the original code to a new file with extension ".prehip". Then hipify the code file.
 #   - If ".prehip" file exists, this is used as input to hipify.
-# (this is useful for testing improvements to the hipify toolset).
+# (this is useful for testing improvements to the hipify-perl toolset).


 SCRIPT_DIR=`dirname $0`
 SEARCH_DIR=$1
 shift
-$SCRIPT_DIR/hipify -inplace -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR`
+$SCRIPT_DIR/hipify-perl -inplace -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR`
@@ -1,12 +1,12 @@
 #!/bin/bash

-#usage : hipexamine.sh DIRNAME [hipify.pl options]
+#usage : hipexamine-perl.sh DIRNAME [hipify-perl options]

-# Generate HIP stats (LOC, CUDA->API conversions, missing functionality) for all the code files 
+# Generate HIP stats (LOC, CUDA->API conversions, missing functionality) for all the code files
 # in the specified directory.


 SCRIPT_DIR=`dirname $0`
 SEARCH_DIR=$1
 shift
-$SCRIPT_DIR/hipify -no-output -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR`
+$SCRIPT_DIR/hipify-perl -no-output -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR`
@@ -1,6 +1,6 @@
 #!/bin/bash

-#usage : hipexamine2.sh DIRNAME [hipify options] [--] [clang options]
+#usage : hipexamine.sh DIRNAME [hipify options] [--] [clang options]

 # Generate CUDA->HIP conversion statistics for all the code files in the specified directory.

@@ -20,7 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 ##
-#usage hipify [OPTIONS] INPUT_FILE
+#usage hipify-perl [OPTIONS] INPUT_FILE
 use Getopt::Long;

 my $warn_whitelist ="";
@@ -201,7 +201,7 @@ while (@ARGV) {
    my %ft;
    clearStats(\%ft, \@statNames);
    my $countIncludes = 0;
-    my $countKeywords = 0; # keywords like __global__, __shared__ - not converted by hipify but counted here.
+    my $countKeywords = 0; # keywords like __global__, __shared__ - not converted by hipify-perl, but counted here.
    my $warnings = 0;
    my $warningsCublas = 0;
    my $warningsCurand = 0;
@@ -323,6 +323,68 @@
 |          500 |*`CUDA_ERROR_NOT_FOUND`*                                       |*`hipErrorNotFound`*                                        | This indicates that a named symbol was not found. Examples of symbols are global/constant variable names, texture names, and surface names. |
 |          600 |*`CUDA_ERROR_NOT_READY`*                                       |*`hipErrorNotReady`*                                        | This indicates that asynchronous operations issued previously have not completed yet. This result is not actually an error, but must be indicated differently than CUDA_SUCCESS (which indicates completion). Calls that may return this value include cuEventQuery() and cuStreamQuery(). |
 |          700 |*`CUDA_ERROR_ILLEGAL_ADDRESS`*                                 |*`hipErrorIllegalAddress`*                                  | While executing a kernel, the device encountered a load or store instruction on an invalid memory address. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. |
+|          701 |*`CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES`*                         |*`hipErrorLaunchOutOfResources`*                            | This indicates that a launch did not occur because it did not have appropriate resources. This error usually indicates that the user has attempted to pass too many arguments to the device kernel, or the kernel launch specifies too many threads for the kernel's register count. Passing arguments of the wrong size (i.e. a 64-bit pointer when a 32-bit int is expected) is equivalent to passing too many arguments and can also result in this error. |
+|          702 |*`CUDA_ERROR_LAUNCH_TIMEOUT`*                                  |*`hipErrorLaunchTimeOut`*                                   | This indicates that the device kernel took too long to execute. This can only occur if timeouts are enabled - see the device attribute CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The context cannot be used (and must be destroyed similar to CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. |
+|          703 |*`CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING`*                   |                                                            | This error indicates a kernel launch that uses an incompatible texturing mode.                                                 |
+|          704 |*`CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED`*                     |*`hipErrorPeerAccessAlreadyEnabled`*                        | This error indicates that a call to cuCtxEnablePeerAccess() is trying to re-enable peer access to a context which has already had peer access to it enabled. |
+|          705 |*`CUDA_ERROR_PEER_ACCESS_NOT_ENABLED`*                         |*`hipErrorPeerAccessNotEnabled`*                            | This error indicates that cuCtxDisablePeerAccess() is trying to disable peer access which has not been enabled yet via cuCtxEnablePeerAccess(). |
+|          708 |*`CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE`*                          |                                                            | This error indicates that cuCtxDisablePeerAccess() is trying to disable peer access which has not been enabled yet via cuCtxEnablePeerAccess(). |
+|          709 |*`CUDA_ERROR_CONTEXT_IS_DESTROYED`*                            |                                                            | This error indicates that the context current to the calling thread has been destroyed using cuCtxDestroy, or is a primary context which has not yet been initialized. |
+|          710 |*`CUDA_ERROR_ASSERT`*                                          |                                                            | A device-side assert triggered during kernel execution. The context cannot be used anymore, and must be destroyed. All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. |
+|          711 |*`CUDA_ERROR_TOO_MANY_PEERS`*                                  |                                                            | This error indicates that the hardware resources required to enable peer access have been exhausted for one or more of the devices passed to cuCtxEnablePeerAccess(). |
+|          712 |*`CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED`*                  |*`hipErrorHostMemoryAlreadyRegistered`*                     | This error indicates that the memory range passed to cuMemHostRegister() has already been registered.                          |
+|          713 |*`CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED`*                      |*`hipErrorHostMemoryNotRegistered`*                         | This error indicates that the pointer passed to cuMemHostUnregister() does not correspond to any currently registered memory region. |
+|          714 |*`CUDA_ERROR_HARDWARE_STACK_ERROR`*                            |                                                            | While executing a kernel, the device encountered a stack error. This can be due to stack corruption or exceeding the stack size limit. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. |
+|          715 |*`CUDA_ERROR_ILLEGAL_INSTRUCTION`*                             |                                                            | While executing a kernel, the device encountered an illegal instruction. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. |
+|          716 |*`CUDA_ERROR_MISALIGNED_ADDRESS`*                              |                                                            | While executing a kernel, the device encountered a load or store instruction on a memory address which is not aligned. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. |
+|          717 |*`CUDA_ERROR_INVALID_ADDRESS_SPACE`*                           |                                                            | While executing a kernel, the device encountered an instruction which can only operate on memory locations in certain address spaces (global, shared, or local), but was supplied a memory address not belonging to an allowed address space. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. |
+|          718 |*`CUDA_ERROR_INVALID_PC`*                                      |                                                            | While executing a kernel, the device program counter wrapped its address space. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. |
+|          719 |*`CUDA_ERROR_LAUNCH_FAILED`*                                   |                                                            | An exception occurred on the device while executing a kernel. Common causes include dereferencing an invalid device pointer and accessing out of bounds shared memory. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. |
+|          800 |*`CUDA_ERROR_NOT_PERMITTED`*                                   |                                                            | This error indicates that the attempted operation is not permitted.                                                            |
+|          801 |*`CUDA_ERROR_NOT_SUPPORTED`*                                   |                                                            | This error indicates that the attempted operation is not supported on the current system or device.                            |
+|          999 |*`CUDA_ERROR_UNKNOWN`*                                         |                                                            | This indicates that an unknown internal error has occurred.                                                                    |
+| enum         |***`CUstream_flags`***                                         |***`hipStreamFlags`***                                      | Stream creation flags                                                                                                          |
+|          0x0 |*`CU_STREAM_DEFAULT`*                                          |*`hipStreamDefault`*                                        | Default stream flag                                                                                                            |
+|          0x1 |*`CU_STREAM_NON_BLOCKING`*                                     |*`hipStreamNonBlocking`*                                    | Stream does not synchronize with stream 0 (the NULL stream)                                                                    |
+| typedef      | `CUarray`                                                     | `hipArray *`                                               | CUDA array                                                                                                                     |
+| struct       | `CUarray_st`                                                  | `hipArray`                                                 | CUDA array                                                                                                                     |
+| typedef      | `CUcontext`                                                   | `hipCtx_t`                                                 | CUDA context                                                                                                                   |
+| typedef      | `CUdevice`                                                    | `hipDevice_t`                                              | CUDA device                                                                                                                    |
+| typedef      | `CUdeviceptr`                                                 | `hipDeviceptr_t`                                           | CUDA device pointer CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform. |
+| typedef      | `CUevent`                                                     | `hipEvent_t`                                               | CUDA event                                                                                                                     |
+| typedef      | `CUfunction`                                                  | `hipFunction_t`                                            | CUDA function                                                                                                                  |
+| typedef      | `CUgraphicsResource`                                          |                                                            | CUDA graphics interop resource                                                                                                 |
+| typedef      | `CUmipmappedArray`                                            |                                                            | CUDA mipmapped array                                                                                                           |
+| typedef      | `CUmodule`                                                    | `hipModule_t`                                              | CUDA module                                                                                                                    |
+| typedef      | `CUstream`                                                    | `hipStream_t`                                              | CUDA module                                                                                                                    |
+| typedef      | `CUstreamCallback`                                            | `hipStreamCallback_t`                                      | CUDA stream callback                                                                                                           |
+| typedef      | `CUsurfObject`                                                |                                                            | An opaque value that represents a CUDA surface object                                                                          |
+| typedef      | `CUsurfref`                                                   |                                                            | CUDA surface reference                                                                                                         |
+| typedef      | `CUtexObject`                                                 |                                                            | An opaque value that represents a CUDA texture object                                                                          |
+| typedef      | `CUtexref`                                                    |                                                            | CUDA texture reference                                                                                                         |
+| define       |`CU_IPC_HANDLE_SIZE`                                           |                                                            | CUDA IPC handle size.                                                                                                          |
+| define       |`CU_LAUNCH_PARAM_BUFFER_POINTER`                               | `HIP_LAUNCH_PARAM_BUFFER_POINTER`                          | Indicator that the next value in the extra parameter to cuLaunchKernel will be a pointer to a buffer containing all kernel parameters used for launching kernel f. This buffer needs to honor all alignment/padding requirements of the individual parameters. If CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the extra array, then CU_LAUNCH_PARAM_BUFFER_POINTER will have no effect. |
+| define       |`CU_LAUNCH_PARAM_BUFFER_SIZE`                                  | `HIP_LAUNCH_PARAM_BUFFER_SIZE`                             | Indicator that the next value in the extra parameter to cuLaunchKernel will be a pointer to a size_t which contains the size of the buffer specified with CU_LAUNCH_PARAM_BUFFER_POINTER. It is required that CU_LAUNCH_PARAM_BUFFER_POINTER also be specified in the extra array if the value associated with CU_LAUNCH_PARAM_BUFFER_SIZE is not zero. |
+| define       |`CU_LAUNCH_PARAM_END`                                          | `HIP_LAUNCH_PARAM_END`                                     | End of array terminator for the extra parameter to cuLaunchKernel.                                                             |
+| define       |`CU_MEMHOSTALLOC_DEVICEMAP`                                    |                                                            | If set, host memory is mapped into CUDA address space and cuMemHostGetDevicePointer() may be called on the host pointer. Flag for cuMemHostAlloc(). |
+| define       |`CU_MEMHOSTALLOC_PORTABLE`                                     |                                                            | If set, host memory is portable between CUDA contexts. Flag for cuMemHostAlloc().                                              |
+| define       |`CU_MEMHOSTALLOC_WRITECOMBINED`                                |                                                            | If set, host memory is allocated as write-combined - fast to write, faster to DMA, slow to read except via SSE4 streaming load instruction (MOVNTDQA). Flag for cuMemHostAlloc(). |
+| define       |`CU_MEMHOSTREGISTER_DEVICEMAP`                                 |                                                            | If set, host memory is mapped into CUDA address space and cuMemHostGetDevicePointer() may be called on the host pointer. Flag for cuMemHostRegister(). |
+| define       |`CU_MEMHOSTREGISTER_IOMEMORY`                                  |                                                            | If set, the passed memory pointer is treated as pointing to some memory-mapped I/O space, e.g. belonging to a third-party PCIe device. On Windows the flag is a no-op. On Linux that memory is marked as non cache-coherent for the GPU and is expected to be physically contiguous. It may return CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions. On all other platforms, it is not supported and CUDA_ERROR_NOT_SUPPORTED is returned. Flag for cuMemHostRegister(). |
+| define       |`CU_MEMHOSTREGISTER_PORTABLE`                                  |                                                            | If set, host memory is portable between CUDA contexts. Flag for cuMemHostRegister().                                           |
+| define       |`CU_PARAM_TR_DEFAULT`                                          |                                                            | For texture references loaded into the module, use default texunit from texture reference.                                     |
+| define       |`CU_STREAM_LEGACY`                                             |                                                            | Legacy stream handle. Stream handle that can be passed as a CUstream to use an implicit stream with legacy synchronization behavior. See details of the synchronization behavior. |
+| define       |`CU_STREAM_PER_THREAD`                                         |                                                            | Per-thread stream handle. Stream handle that can be passed as a CUstream to use an implicit stream with perthread synchronization behavior. See details of the synchronization behavior. |
+| define       |`CU_TRSA_OVERRIDE_FORMAT`                                      |                                                            | Override the texref format with a format inferred from the array. Flag for cuTexRefSetArray().                                 |
+| define       |`CU_TRSF_NORMALIZED_COORDINATES`                               |                                                            | Use normalized texture coordinates in the range [0,1) instead of [0,dim). Flag for cuTexRefSetFlags().                         |
+| define       |`CU_TRSF_SRGB`                                                 |                                                            | Perform sRGB->linear conversion during texture read. Flag for cuTexRefSetFlags().                                              |
+| define       |`CUDA_ARRAY3D_2DARRAY`                                         |                                                            | Deprecated, use CUDA_ARRAY3D_LAYERED.                                                                                          |
+| define       |`CUDA_ARRAY3D_CUBEMAP`                                         |                                                            | If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The width of such a CUDA array must be equal to its height, and Depth must be six. If CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps and Depth must be a multiple of six. |
+| define       |`CUDA_ARRAY3D_DEPTH_TEXTURE`                                   |                                                            | This flag if set indicates that the CUDA array is a DEPTH_TEXTURE.                                                             |
+| define       |`CUDA_ARRAY3D_LAYERED`                                         |                                                            | If set, the CUDA array is a collection of layers, where each layer is either a 1D or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number of layers, not the depth of a 3D array. |
+| define       |`CUDA_ARRAY3D_SURFACE_LDST`                                    |                                                            | This flag must be set in order to bind a surface reference to the CUDA array.                                                  |
+| define       |`CUDA_ARRAY3D_TEXTURE_GATHER`                                  |                                                            | This flag must be set in order to perform texture gather operations on a CUDA array.                                           |
+| define       |`CUDA_VERSION`                                                 |                                                            | CUDA API version number.                                                                                                       |


 ## **2. Error Handling**
@@ -45,6 +45,8 @@ To correct, add the following flag to hcc or hipcc:
 $ hipcc -Wl,-Bsymbolic ...
 ```

+Ensure there is no space in the "Wl,-Bsymbolic" option.
+

 ### What is the current limitation of HIP Generic Grid Launch method?
 1. __global__ functions cannot be marked as static or put in an unnamed namespace i.e. they cannot be given internal linkage (this would clash with __attribute__((weak)));
@@ -0,0 +1,168 @@
+Table of Contents
+=================
+
+  * [Profiling HIP Code](#profiling-hip-code" aria-hidden="true"><span aria-hidden="true)
+      * [Using HIP_DB](#using-hip_db" aria-hidden="true"><span aria-hidden="true)
+      * [Using ltrace](#using-ltrace" aria-hidden="true"><span aria-hidden="true)
+      * [Chicken bits](#chicken-bits" aria-hidden="true"><span aria-hidden="true)
+      * [Debugging HIP Applications](#debugging-hip-applications" aria-hidden="true"><span aria-hidden="true)
+      * [General Debugging Tips](#general-debugging-tips" aria-hidden="true"><span aria-hidden="true)
+        * [Print env var state](#print-env-var-state" aria-hidden="true"><span aria-hidden="true)
+
+### Using HIP_DB
+
+This flag is primarily targeted to assist HIP development team in the development of the HIP runtime, but in some situations may be useful to HIP application developers as well.
+The HIP debug information is designed to print important information during the execution of a HIP API.  HIP provides
+different color-coded levels of debug information:
+  - api  : Print the beginning and end of each HIP API, including the arguments and return codes.  This is equivalent to setting HIP_TRACE_API=1.
+  - sync : Print multi-thread and other synchronization debug information.
+  - copy : Print which engine is doing the copy, which copy flavor is selected, information on source and destination memory.
+  - mem  : Print information about memory allocation - which pointers are allocated, where they are allocated, peer mappings, and more.
+
+HIP_DB format is flags separated by '+' sign, or a hex code for the bitmask.  Generally the + format is preferred.  
+For example:
+```
+$ HIP_DB=api+copy+mem  my-application
+$ HIP_DB=0xF  my-application
+```
+
+### Using ltrace
+ltrace is a standard linux tool which provides a message to stderr on every dynamic library call.  Since ROCr and the ROCt (the ROC thunk, which is the thin user-space interface to the ROC kernel driver) are both dynamic libraries, this provides an easy way to trace the activity in these libraries.  Tracing can be a powerful way to quickly observe the flow of the application before diving into the details with a command-line debugger.
+The trace can also show performance issues related to accidental calls to expensive API calls on the critical path.
+
+ltrace can be easily combined with the HIP_DB switches to visualize the runtime behavior of the entire ROCm software stack.  Here's a sample command-line and output:
+
+```
+$ HIP_DB=api ltrace -C -e 'hsa*'   <applicationName> <applicationArguments>
+
+...
+
+<<hip-api tid:1.17 hipMemcpy (0x7f7776d3e010, 0x503d1d000, 4194304, hipMemcpyDeviceToHost)
+libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1804000, 0, 0, 0x400000) = 0
+libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1816000, 0, 0x7f777f85f2a0, 0x400000) = 0
+libmcwamp_hsa.so->hsa_amd_memory_lock(0x7f7776d3e010, 0x400000, 0x1213b70, 1 <unfinished ...>
+libhsa-runtime64.so.1->hsaKmtRegisterMemoryToNodes(0x7f7776d3e010, 0x400000, 1, 0x1220c10) = 0
+libhsa-runtime64.so.1->hsaKmtMapMemoryToGPUNodes(0x7f7776d3e010, 0x400000, 0x7ffc32865400, 64) = 0
+<... hsa_amd_memory_lock resumed> )              = 0
+libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1804000, 1, 0x7f777e95a770, 0x12205b0) = 0
+libmcwamp_hsa.so->hsa_amd_memory_async_copy(0x50411d010, 0x11e70d0, 0x503d1d000, 0x11e70d0) = 0
+libmcwamp_hsa.so->hsa_signal_wait_acquire(0x1804000, 2, 1, -1) = 0
+libmcwamp_hsa.so->hsa_amd_memory_unlock(0x7f7776d3e010, 0x1213c6c, 0x12c3c600000000, 0x1804000 <unfinished ...>
+libhsa-runtime64.so.1->hsaKmtUnmapMemoryToGPU(0x7f7776d3e010, 0x7f7776d3e010, 0x12c3c600000000, 0x1804000) = 0
+libhsa-runtime64.so.1->hsaKmtDeregisterMemory(0x7f7776d3e010, 0x7f7776d3e010, 0x7f777f60f9e8, 0x1220580) = 0
+<... hsa_amd_memory_unlock resumed> )            = 0
+  hip-api tid:1.17 hipMemcpy                      ret= 0 (hipSuccess)>>
+```
+
+Some key information from the trace above.
+  - Thy trace snippet shows the execution of a hipMemcpy API, bracketed by the first and last message in the trace output.  The messages show the thread id and API sequence number (`1.17`).  ltrace output intermixes messages from all threads, so the HIP debug information can be useful to determine which threads are executing.
+  - The code flows through HIP APIs into ROCr (HSA) APIs (hsa*) and into the thunk (hsaKmt*) calls.
+  - The HCC runtime is "libmcwamp_hsa.so" and the HSA/ROCr runtime is "libhsa-runtime64.so".
+  - In this particular case, the memory copy is for unpinned memory, and the selected copy algorithm is to pin the host memory "in-place" before performing the copy.  The signaling APIs and calls to pin ("lock", "register") the memory are readily apparent in the trace output.
+
+
+### Chicken bits
+Chicken bits are environment variables which cause the HIP, HCC, or HSA driver to disable some feature or optimization.
+These are not intended for production but can be useful diagnose synchronization problems in the application (or driver).
+
+Some of the most useful chicken bits are described here. These bits are supported on the ROCm path:
+
+HIP provides 3 environment variables in the HIP_*_BLOCKING family.  These introduce additional synchronization and can be useful to isolate synchronization problems. Specifically, if the code works with this flag set, then it indicates the kernels are executing correctly, and any failures likely are causes by improper or missing synchronization.  These flags will have performance impact and are not intended for production use.
+
+- HIP_LAUNCH_BLOCKING=1 : Waits on the host after each kernel launch.  Equivalent to setting CUDA_LAUNCH_BLOCKING.
+- HIP_LAUNCH_BLOCKING_KERNELS: A comma-separated list of kernel names.  The HIP runtime will wait on the host after one of the named kernels executes.  This provides a more targeted version of HIP_LAUNCH_BLOCKING and may be useful to isolate exactly which kernel needs further analysis if HIP_LAUNCH_BLOCKING=1 improves functionality.  There is no indication if kernel names are spelled incorrectly.  One mechanism to verify that the blocking is working is to run with HIP_DB=api+sync and search for debug messages with "LAUNCH_BLOCKING".
+- HIP_API_BLOCKING : Forces hipMemcpyAsync and hipMemsetAsync to be host-synchronous, meaning they will wait for the requested operation to complete before returning to the caller.
+
+These options cause HCC to serialize.  Useful if you have libraries or code which is calling HCC kernels directly rather than using HIP.  
+- HCC_SERIALZIE_KERNELS : 0x1=pre-serialize before each kernel launch, 0x2=post-serialize after each kernel launch., 0x3= pre- and post- serialize.
+- HCC_SERIALIZE_COPY    : 0x1=pre-serialize before each async copy, 0x2=post-serialize after each async copy., 0x3= pre- and post- serialize.
+
+- HSA_ENABLE_SDMA=0     : Causes host-to-device and device-to-host copies to use compute shader blit kernels rather than the dedicated DMA copy engines.  Compute shader copies have low latency (typically < 5us) and can achieve approximately 80% of the bandwidth of the DMA copy engine.  This flag is useful to isolate issues with the hardware copy engines.
+- HSA_ENABLE_INTERRUPT=0 : Causes completion signals to be detected with memory-based polling rather than interrupts.  Can be useful to diagnose interrupt storm issues in the driver.
+- HSA_DISABLE_CACHE=1  : Disables the GPU L2 data cache.
+
+### Debugging HIP Applications
+
+- The variable "tls_tidInfo" contains the API sequence number (_apiSeqNum)- a monotonically increasing count of the HIP APIs called from this thread.  This can be useful for setting conditional breakpoints.  Also, each new HIP thread is mapped to monotically increasing shortTid ID.  Both of these fields are displayed in the HIP debug info. 
+```
+(gdb) p tls_tidInfo
+$32 = {_shortTid = 1, _apiSeqNum = 803}
+```
+
+- HCC tracks all of the application memory allocations, including those from HIP and HC's "am_alloc".
+If the HCC runtime is built with debug information (HCC_RUNTIME_DEBUG=ON when building HCC), then calling the function 'hc::am_memtracker_print()' will show all memory allocations. 
+An optional argument specifies a void * targetPointer - the print routine will mark the allocation which contains the specified pointer with "-->" in the printed output.
+This example shows a sample GDB session where we print the memory allocated by this process and mark a specified address by using the gdb "call" function..
+The gdb syntax also supports using the variable name (in this case 'dst'):
+```
+(gdb) p dst
+$33 = (void *) 0x5ec7e9000
+(gdb) call hc::am_memtracker_print(dst)
+TargetAddress:0x5ec7e9000
+   0x504cfc000-0x504cfc00f::  allocSeqNum:1 hostPointer:0x504cfc000 devicePointer:0x504cfc000 sizeBytes:16 isInDeviceMem:0 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil)
+...
+-->0x5ec7e9000-0x5f7e28fff::  allocSeqNum:488 hostPointer:(nil) devicePointer:0x5ec7e9000 sizeBytes:191102976 isInDeviceMem:1 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil)
+
+```
+
+To debug an explicit address, cast the address to (void*) :
+```
+(gdb) call hc::am_memtracker_print((void*)0x508c7f000)
+```
+- Debugging GPUVM fault.
+For example:
+```
+Memory access fault by GPU node-1 on address 0x5924000. Reason: Page not present or supervisor privilege.
+
+Program received signal SIGABRT, Aborted.
+[Switching to Thread 0x7fffdffb5700 (LWP 14893)]
+0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
+56      ../nptl/sysdeps/unix/sysv/linux/raise.c: No such file or directory.
+(gdb) bt
+#0  0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
+#1  0x00007ffff205b028 in __GI_abort () at abort.c:89
+#2  0x00007ffff6f960eb in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
+#3  0x00007ffff6f99ea5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
+#4  0x00007ffff6f78107 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
+#5  0x00007ffff744f184 in start_thread (arg=0x7fffdffb5700) at pthread_create.c:312
+#6  0x00007ffff211b37d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
+(gdb) info threads
+  Id   Target Id         Frame
+  4    Thread 0x7fffdd521700 (LWP 14895) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
+  3    Thread 0x7fffddd22700 (LWP 14894) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
+* 2    Thread 0x7fffdffb5700 (LWP 14893) "caffe" 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
+  1    Thread 0x7ffff7fa6ac0 (LWP 14892) "caffe" 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
+(gdb) thread 1
+[Switching to thread 1 (Thread 0x7ffff7fa6ac0 (LWP 14892))]
+#0  0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
+(gdb) bt
+#0  0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
+#1  0x00007ffff6f929ba in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
+#2  0x00007fffe080beca in HSADispatch::waitComplete() () from /opt/rocm/hcc/lib/libmcwamp_hsa.so
+#3  0x00007fffe080415f in HSADispatch::dispatchKernelAsync(Kalmar::HSAQueue*, void const*, int, bool) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so
+#4  0x00007fffe080238e in Kalmar::HSAQueue::dispatch_hsa_kernel(hsa_kernel_dispatch_packet_s const*, void const*, unsigned long, hc::completion_future*) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so
+#5  0x00007ffff7bb7559 in hipModuleLaunchKernel () from /opt/rocm/hip/lib/libhip_hcc.so
+#6  0x00007ffff2e6cd2c in mlopen::HIPOCKernel::run (this=0x7fffffffb5a8, args=0x7fffffffb2a8, size=80) at /root/MIOpen/src/hipoc/hipoc_kernel.cpp:15  
+...
+```
+
+### General Debugging Tips
+- The fault will be caught by the runtime but was actually generated by an asynchronous command running on the GPU.    So, the GDB backtrace will show a path in the runtime, ie inside "GI_Raise" as shown in the example above.
+- To determine the true location of the fault, force the kernels to execute synchronously by seeing the environment variables HCC_SERIALIZE_KERNEL=3 HCC_SERIALIZE_COPY=3.  This will force HCC to wait for the kernel to finish executing before retuning.  If the fault occurs during the execution of a kernel, you can see the code which launched the kernel inside the backtrace.  A bit of guesswork is required to determine which thread is actually causing the issue - typically it will the thread which is waiting inside the libhsa-runtime64.so.
+- VM faults inside kernels can be caused byi:
+   - incorrect code (ie a for loop which extends past array boundaries), i
+   - memory issues  - kernel arguments which are invalid (null pointers, unregistered host pointers, bad pointers).
+   - synchronization issues
+   - compiler issues (incorrect code generation from the compiler)
+   - runtime issues 
+
+-- General debug tips:
+- 'gdb --args' can be used to conviently pass the executable and arguments to gdb.
+- From inside GDB, you can set environment variables "set env".  Note the command does not use an '=' sign:
+```
+(gdb) set env HIP_DB 1
+```
+
+#### Print env var state
+Setting HIP_PRINT_ENV=1 and then running a HIP application will print the HIP environment variables, their current values, and usage info.
+Setting HCC_PRINT_ENV=1 and then running a HCC application will print the HCC environment variables, their current values, and usage info.
@@ -53,7 +53,7 @@ At a high-level, the following features are not supported:
 - Dynamic parallelism (CUDA 5.0)
 - Managed memory (CUDA 6.5)
 - Graphics interoperability with OpenGL or Direct3D
- CUDA Driver API (Under Development)
+- CUDA Driver API
 - CUDA IPC Functions (Under Development)
 - CUDA array, mipmappedArray and pitched memory
 - MemcpyToSymbol functions
@@ -102,7 +102,7 @@ However, we can provide a rough summary of the features included in each CUDA SD
    - Per-thread-streams (under development)
    - C++11 (HCC supports all of C++11, all of C++14 and some C++17 features)
 - CUDA 7.5
-    - float16 (under development)
+    - float16
 - CUDA 8.0
    - TBD.

@@ -98,48 +98,57 @@ HIP/HCC will push primary context to context stack when it is empty. This can ha
 #### Interoperation between HIP and CUDA Driver
 CUDA applications may want to mix CUDA driver code with HIP code (see example below). This table shows the type equivalence to enable this interaction.

-|**HIP Type** |**CU Driver Type**|**CUDA Runtime Type**|
-| ----        | ----             | ----                |
-| hipModule   |  CUmodule        |                     |
-| hipFunction |  CUfunction      |                     |
-| hipCtx_t    |  CUcontext       |                     |
-| hipDevice_t |  CUdevice        |                     |
-| hipStream_t |  CUstream        | cudaStream_t        |
-| hipEvent_t  |  CUevent         | cudaEvent_t         |
-| hipArray    |  CUarray         | cudaArray           |
-
-#### Compilation Flags
-The hipModule interface does not support the `cuModuleLoadDataEx` function, which is used to control PTX compilation options.
-HCC does not use PTX and does not support the same compilation options.
-In fact, HCC code objects always contain fully compiled ISA and do not require additional compilation as part of the load step.
-Code which requires this functionally should use platform-specific coding, calling `cuModuleLoadDataEx` on the NVCC path and `hipModuleLoadData` on the hcc path.
-For example:
+|**HIP Type**   |**CU Driver Type**|**CUDA Runtime Type**|
+| ----          | ----             | ----                |
+| hipModule_t   |  CUmodule        |                     |
+| hipFunction_t |  CUfunction      |                     |
+| hipCtx_t      |  CUcontext       |                     |
+| hipDevice_t   |  CUdevice        |                     |
+| hipStream_t   |  CUstream        | cudaStream_t        |
+| hipEvent_t    |  CUevent         | cudaEvent_t         |
+| hipArray      |  CUarray         | cudaArray           |

+#### Compilation Options
+The `hipModule_t` interface does not support `cuModuleLoadDataEx` function, which is used to control PTX compilation options.
+HCC does not use PTX and does not support these compilation options.
+In fact, HCC code objects always contain fully compiled ISA and do not require additional compilation as a part of the load step.
+The corresponding HIP function `hipModuleLoadDataEx` behaves as `hipModuleLoadData` on HCC path (compilation options are not used) and as `cuModuleLoadDataEx` on NVCC path.
+For example (CUDA):
 ```
-hipModule module;
-void *imagePtr = ... ;  // Somehow populate data pointer with code object
+CUmodule module;
+void *imagePtr = ...;  // Somehow populate data pointer with code object

-#ifdef __HIP_PLATFORM_NVCC__
-// Use CUDA driver API but write to hipModule since they are same type:
 const int numOptions = 1;
 CUJit_option options[numOptions];
 void * optionValues[numOptions];

 options[0] = CU_JIT_MAX_REGISTERS;
-unsigned maxRegs=15;
-optionValues[0] = (void*) (&maxRegs);
+unsigned maxRegs = 15;
+optionValues[0] = (void*)(&maxRegs);

 cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues);

-#else // __HIP_PLATFORM_HCC__
+CUfunction k;
+cuModuleGetFunction(&k, module, "myKernel");
+```
+HIP:
+```
+hipModule_t module;
+void *imagePtr = ...;  // Somehow populate data pointer with code object

-// HCC path does not support or require JIT options, so just load the module.
-hipModuleLoadData(&module, imagePtr);
+const int numOptions = 1;
+hipJitOption options[numOptions];
+void * optionValues[numOptions];

-#endif
+options[0] = hipJitOptionMaxRegisters;
+unsigned maxRegs = 15;
+optionValues[0] = (void*)(&maxRegs);

-// Back to unified code - both paths above loaded the "module" variable.
-hipFunction k;
+// hipModuleLoadData(module, imagePtr) will be called on HCC path, JIT options will not be used, and
+// cupModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues) will be called on NVCC path
+hipModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues);
+
+hipFunction_t k;
 hipModuleGetFunction(&k, module, "myKernel");
 ```

@@ -569,7 +569,6 @@ HIP_TRACE_API                  =  0 : Trace each HIP API call.  Print function n
 HIP_TRACE_API_COLOR            = green : Color to use for HIP_API.  None/Red/Green/Yellow/Blue/Magenta/Cyan/White
 HIP_PROFILE_API                 =  0 : Add HIP function begin/end to ATP file generated with CodeXL
 HIP_VISIBLE_DEVICES            =  0 : Only devices whose index is present in the secquence are visible to HIP applications and they are enumerated in the order of secquence
-HIP_NUM_KERNELS_INFLIGHT       = 128 : Number of kernels per stream

 ```

@@ -1,4 +1,4 @@
-# Profiling and Debugging HIP Code
+# Profiling HIP Code

 This section describes the profiling and debugging capabilities that HIP provides.  
 Profiling information can viewed in the CodeXL visualization tool or printed directly to stderr as the application runs.
@@ -267,6 +267,11 @@ info: check result
 PASSED!
 ```

+HIP_TRACE_API supports multiple levels of debug information:
+   - 0x1 = print all HIP APIs
+   - 0x2 = print HIP APIs which initiate GPU kernels, copies, or memsets.  Includes hipLaunchKernel, hipMemcpy*, hipMemset*.
+   - 0x4 = print HIP APIs which allocate or free memory.  Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree.
+

 #### Color
 Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors.
@@ -275,161 +280,3 @@ None will disable use of color control codes for both the opening and closing an



-### Using HIP_DB
-
-This flag is primarily targeted to assist HIP development team in the development of the HIP runtime, but in some situations may be useful to HIP application developers as well.
-The HIP debug information is designed to print important information during the execution of a HIP API.  HIP provides
-different color-coded levels of debug information:
-  - api  : Print the beginning and end of each HIP API, including the arguments and return codes.  This is equivalent to setting HIP_TRACE_API=1.
-  - sync : Print multi-thread and other synchronization debug information.
-  - copy : Print which engine is doing the copy, which copy flavor is selected, information on source and destination memory.
-  - mem  : Print information about memory allocation - which pointers are allocated, where they are allocated, peer mappings, and more.
-
-DB_MEM format is flags separated by '+' sign, or a hex code for the bitmask.  Generally the + format is preferred.  
-For example:
-```
-$ HIP_DB=api+copy+mem  my-application
-$ HIP_DB=0xF  my-application
-```
-
-### Using ltrace
-ltrace is a standard linux tool which provides a message to stderr on every dynamic library call.  Since ROCr and the ROCt (the ROC thunk, which is the thin user-space interface to the ROC kernel driver) are both dynamic libraries, this provides an easy way to trace the activity in these libraries.  Tracing can be a powerful way to quickly observe the flow of the application before diving into the details with a command-line debugger.
-The trace can also show performance issues related to accidental calls to expensive API calls on the critical path.
-
-ltrace can be easily combined with the HIP_DB switches to visualize the runtime behavior of the entire ROCm software stack.  Here's a sample command-line and output:
-
-```
-$ HIP_DB=api ltrace -C -e 'hsa*'   <applicationName> <applicationArguments>
-
-...
-
-<<hip-api tid:1.17 hipMemcpy (0x7f7776d3e010, 0x503d1d000, 4194304, hipMemcpyDeviceToHost)
-libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1804000, 0, 0, 0x400000) = 0
-libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1816000, 0, 0x7f777f85f2a0, 0x400000) = 0
-libmcwamp_hsa.so->hsa_amd_memory_lock(0x7f7776d3e010, 0x400000, 0x1213b70, 1 <unfinished ...>
-libhsa-runtime64.so.1->hsaKmtRegisterMemoryToNodes(0x7f7776d3e010, 0x400000, 1, 0x1220c10) = 0
-libhsa-runtime64.so.1->hsaKmtMapMemoryToGPUNodes(0x7f7776d3e010, 0x400000, 0x7ffc32865400, 64) = 0
-<... hsa_amd_memory_lock resumed> )              = 0
-libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1804000, 1, 0x7f777e95a770, 0x12205b0) = 0
-libmcwamp_hsa.so->hsa_amd_memory_async_copy(0x50411d010, 0x11e70d0, 0x503d1d000, 0x11e70d0) = 0
-libmcwamp_hsa.so->hsa_signal_wait_acquire(0x1804000, 2, 1, -1) = 0
-libmcwamp_hsa.so->hsa_amd_memory_unlock(0x7f7776d3e010, 0x1213c6c, 0x12c3c600000000, 0x1804000 <unfinished ...>
-libhsa-runtime64.so.1->hsaKmtUnmapMemoryToGPU(0x7f7776d3e010, 0x7f7776d3e010, 0x12c3c600000000, 0x1804000) = 0
-libhsa-runtime64.so.1->hsaKmtDeregisterMemory(0x7f7776d3e010, 0x7f7776d3e010, 0x7f777f60f9e8, 0x1220580) = 0
-<... hsa_amd_memory_unlock resumed> )            = 0
-  hip-api tid:1.17 hipMemcpy                      ret= 0 (hipSuccess)>>
-```
-
-Some key information from the trace above.
-  - Thy trace snippet shows the execution of a hipMemcpy API, bracketed by the first and last message in the trace output.  The messages show the thread id and API sequence number (`1.17`).  ltrace output intermixes messages from all threads, so the HIP debug information can be useful to determine which threads are executing.
-  - The code flows through HIP APIs into ROCr (HSA) APIs (hsa*) and into the thunk (hsaKmt*) calls.
-  - The HCC runtime is "libmcwamp_hsa.so" and the HSA/ROCr runtime is "libhsa-runtime64.so".
-  - In this particular case, the memory copy is for unpinned memory, and the selected copy algorithm is to pin the host memory "in-place" before performing the copy.  The signaling APIs and calls to pin ("lock", "register") the memory are readily apparent in the trace output.
-
-
-### Chicken bits
-Chicken bits are environment variables which cause the HIP, HCC, or HSA driver to disable some feature or optimization.
-These are not intended for production but can be useful diagnose synchronization problems in the application (or driver).
-
-Some of the most useful chicken bits are described here. These bits are supported on the ROCm path:
-
-HIP provides 3 environment variables in the HIP_*_BLOCKING family.  These introduce additional synchronization and can be useful to isolate synchronization problems. Specifically, if the code works with this flag set, then it indicates the kernels are executing correctly, and any failures likely are causes by improper or missing synchronization.  These flags will have performance impact and are not intended for production use.
-
- HIP_LAUNCH_BLOCKING=1 : Waits on the host after each kernel launch.  Equivalent to setting CUDA_LAUNCH_BLOCKING.
- HIP_LAUNCH_BLOCKING_KERNELS: A comma-separated list of kernel names.  The HIP runtime will wait on the host after one of the named kernels executes.  This provides a more targeted version of HIP_LAUNCH_BLOCKING and may be useful to isolate exactly which kernel needs further analysis if HIP_LAUNCH_BLOCKING=1 improves functionality.  There is no indication if kernel names are spelled incorrectly.  One mechanism to verify that the blocking is working is to run with HIP_DB=api+sync and search for debug messages with "LAUNCH_BLOCKING".
- HIP_API_BLOCKING : Forces hipMemcpyAsync and hipMemsetAsync to be host-synchronous, meaning they will wait for the requested operation to complete before returning to the caller.
-
-These options cause HCC to serialize.  Useful if you have libraries or code which is calling HCC kernels directly rather than using HIP.  
- HCC_SERIALZIE_KERNELS : 0x1=pre-serialize before each kernel launch, 0x2=post-serialize after each kernel launch., 0x3= pre- and post- serialize.
- HCC_SERIALIZE_COPY    : 0x1=pre-serialize before each async copy, 0x2=post-serialize after each async copy., 0x3= pre- and post- serialize.
-
- HSA_ENABLE_SDMA=0     : Causes host-to-device and device-to-host copies to use compute shader blit kernels rather than the dedicated DMA copy engines.  Compute shader copies have low latency (typically < 5us) and can achieve approximately 80% of the bandwidth of the DMA copy engine.  This flag is useful to isolate issues with the hardware copy engines.
- HSA_ENABLE_INTERRUPT=0 : Causes completion signals to be detected with memory-based polling rather than interrupts.  Can be useful to diagnose interrupt storm issues in the driver.
- HSA_DISABLE_CACHE=1  : Disables the GPU L2 data cache.
-
-### Debugging HIP Applications
-
- The variable "tls_tidInfo" contains the API sequence number (_apiSeqNum)- a monotonically increasing count of the HIP APIs called from this thread.  This can be useful for setting conditional breakpoints.  Also, each new HIP thread is mapped to monotically increasing shortTid ID.  Both of these fields are displayed in the HIP debug info. 
-```
-(gdb) p tls_tidInfo
-$32 = {_shortTid = 1, _apiSeqNum = 803}
-```
-
- HCC tracks all of the application memory allocations, including those from HIP and HC's "am_alloc".
-If the HCC runtime is built with debug information (HCC_RUNTIME_DEBUG=ON when building HCC), then calling the function 'hc::am_memtracker_print()' will show all memory allocations. 
-An optional argument specifies a void * targetPointer - the print routine will mark the allocation which contains the specified pointer with "-->" in the printed output.
-This example shows a sample GDB session where we print the memory allocated by this process and mark a specified address by using the gdb "call" function..
-The gdb syntax also supports using the variable name (in this case 'dst'):
-```
-(gdb) p dst
-$33 = (void *) 0x5ec7e9000
-(gdb) call hc::am_memtracker_print(dst)
-TargetAddress:0x5ec7e9000
-   0x504cfc000-0x504cfc00f::  allocSeqNum:1 hostPointer:0x504cfc000 devicePointer:0x504cfc000 sizeBytes:16 isInDeviceMem:0 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil)
-...
-->0x5ec7e9000-0x5f7e28fff::  allocSeqNum:488 hostPointer:(nil) devicePointer:0x5ec7e9000 sizeBytes:191102976 isInDeviceMem:1 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil)
-
-```
-
-To debug an explicit address, cast the address to (void*) :
-```
-(gdb) call hc::am_memtracker_print((void*)0x508c7f000)
-```
- Debugging GPUVM fault.
-For example:
-```
-Memory access fault by GPU node-1 on address 0x5924000. Reason: Page not present or supervisor privilege.
-
-Program received signal SIGABRT, Aborted.
-[Switching to Thread 0x7fffdffb5700 (LWP 14893)]
-0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
-56      ../nptl/sysdeps/unix/sysv/linux/raise.c: No such file or directory.
-(gdb) bt
-#0  0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
-#1  0x00007ffff205b028 in __GI_abort () at abort.c:89
-#2  0x00007ffff6f960eb in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-#3  0x00007ffff6f99ea5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-#4  0x00007ffff6f78107 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-#5  0x00007ffff744f184 in start_thread (arg=0x7fffdffb5700) at pthread_create.c:312
-#6  0x00007ffff211b37d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
-(gdb) info threads
-  Id   Target Id         Frame
-  4    Thread 0x7fffdd521700 (LWP 14895) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
-  3    Thread 0x7fffddd22700 (LWP 14894) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
-* 2    Thread 0x7fffdffb5700 (LWP 14893) "caffe" 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
-  1    Thread 0x7ffff7fa6ac0 (LWP 14892) "caffe" 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-(gdb) thread 1
-[Switching to thread 1 (Thread 0x7ffff7fa6ac0 (LWP 14892))]
-#0  0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-(gdb) bt
-#0  0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-#1  0x00007ffff6f929ba in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-#2  0x00007fffe080beca in HSADispatch::waitComplete() () from /opt/rocm/hcc/lib/libmcwamp_hsa.so
-#3  0x00007fffe080415f in HSADispatch::dispatchKernelAsync(Kalmar::HSAQueue*, void const*, int, bool) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so
-#4  0x00007fffe080238e in Kalmar::HSAQueue::dispatch_hsa_kernel(hsa_kernel_dispatch_packet_s const*, void const*, unsigned long, hc::completion_future*) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so
-#5  0x00007ffff7bb7559 in hipModuleLaunchKernel () from /opt/rocm/hip/lib/libhip_hcc.so
-#6  0x00007ffff2e6cd2c in mlopen::HIPOCKernel::run (this=0x7fffffffb5a8, args=0x7fffffffb2a8, size=80) at /root/MIOpen/src/hipoc/hipoc_kernel.cpp:15  
-...
-```
-
-### General Debugging Tips
- The fault will be caught by the runtime but was actually generated by an asynchronous command running on the GPU.    So, the GDB backtrace will show a path in the runtime, ie inside "GI_Raise" as shown in the example above.
- To determine the true location of the fault, force the kernels to execute synchronously by seeing the environment variables HCC_SERIALIZE_KERNEL=3 HCC_SERIALIZE_COPY=3.  This will force HCC to wait for the kernel to finish executing before retuning.  If the fault occurs during the execution of a kernel, you can see the code which launched the kernel inside the backtrace.  A bit of guesswork is required to determine which thread is actually causing the issue - typically it will the thread which is waiting inside the libhsa-runtime64.so.
- VM faults inside kernels can be caused byi:
-   - incorrect code (ie a for loop which extends past array boundaries), i
-   - memory issues  - kernel arguments which are invalid (null pointers, unregistered host pointers, bad pointers).
-   - synchronization issues
-   - compiler issues (incorrect code generation from the compiler)
-   - runtime issues 
-
-- General debug tips:
- 'gdb --args' can be used to conviently pass the executable and arguments to gdb.
- From inside GDB, you can set environment variables "set env".  Note the command does not use an '=' sign:
-```
-(gdb) set env HIP_DB 1
-```
-Setting HIP_PRINT_ENV=1 and then running a HIP application will print the HIP environment variables, their current values, and usage info.
-Setting HCC_PRINT_ENV=1 and then running a HCC application will print the HCC environment variables, their current values, and usage info.
-
-
-
@@ -6,8 +6,12 @@ set(BUILD_HIPIFY_CLANG 0 PARENT_SCOPE)
 # Find LLVM package
 find_package(LLVM 3.8 QUIET PATHS ${HIPIFY_CLANG_LLVM_DIR} NO_DEFAULT_PATH)
 if (NOT ${LLVM_FOUND})
-    message(STATUS "hipify-clang will not be built. To build it please specify absolute path to LLVM (v3.8) package using -DHIPIFY_CLANG_LLVM_DIR")
-else()
+    find_package(LLVM 3.9 QUIET PATHS ${HIPIFY_CLANG_LLVM_DIR} NO_DEFAULT_PATH)
+    if (NOT ${LLVM_FOUND})
+        message(STATUS "hipify-clang will not be built. To build it please specify absolute path to LLVM 3.8 or LLVM 3.9 package using -DHIPIFY_CLANG_LLVM_DIR")
+    endif()
+endif()
+if (${LLVM_FOUND})
    list(APPEND CMAKE_MODULE_PATH ${LLVM_CMAKE_DIR})
    include(AddLLVM)

@@ -31,6 +35,7 @@ else()
        clangSerialization
        clangSema
        clangEdit
+        clangFormat
        clangLex
        clangAnalysis
        clangDriver
@@ -102,9 +102,6 @@ namespace hip_impl
        // Not callable.
        template<FunctionalProcedure F>
        struct is_callable_impl<F, 5u> : std::false_type {};
-
-        template<typename Call>
-        struct is_callable : is_callable_impl<Call> {};
    #else
        template<typename, typename = void>
        struct is_callable_impl : std::false_type {};
@@ -114,6 +111,8 @@ namespace hip_impl
            F(Ts...),
            void_t_<std::result_of_t<F(Ts...)>>> : std::true_type {};
    #endif
+        template<typename Call>
+        struct is_callable : is_callable_impl<Call> {};

    #define count_macro_args_impl_hip_(\
         _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15,\
@@ -25,19 +25,10 @@ THE SOFTWARE.

 #include "hip/hcc_detail/hip_vector_types.h"

-#if __clang_major__ > 3
-
 typedef __fp16 __half;
-
-typedef struct __attribute__((aligned(4))){
-  union {
-    __half p[2];
-    unsigned int q;
-  };
-} __half2;
-
-typedef __half half;
-typedef __half2 half2;
+typedef __fp16 __half1 __attribute__((ext_vector_type(1)));
+typedef __fp16 __half2 __attribute__((ext_vector_type(2)));
+typedef __fp16 half;

 /*
 Half Arithmetic Functions
@@ -214,10 +205,10 @@ __device__  __half __ushort2half_ru(unsigned short int i);
 __device__  __half __ushort2half_rz(unsigned short int i);
 __device__  __half __ushort_as_half(const unsigned short int i);

-extern "C" int __hip_hc_ir_hadd2_int(int, int);
-extern "C" int __hip_hc_ir_hfma2_int(int, int, int);
-extern "C" int __hip_hc_ir_hmul2_int(int, int);
-extern "C" int __hip_hc_ir_hsub2_int(int, int);
+extern "C" __half2 __hip_hc_ir_hadd2_int(__half2, __half2);
+extern "C" __half2 __hip_hc_ir_hfma2_int(__half2, __half2, __half2);
+extern "C" __half2 __hip_hc_ir_hmul2_int(__half2, __half2);
+extern "C" __half2 __hip_hc_ir_hsub2_int(__half2, __half2);

 extern "C" __half __hip_hc_ir_hceil_half(__half) __asm("llvm.ceil.f16");
 extern "C" __half __hip_hc_ir_hcos_half(__half) __asm("llvm.cos.f16");
@@ -231,16 +222,16 @@ extern "C" __half __hip_hc_ir_hsin_half(__half) __asm("llvm.sin.f16");
 extern "C" __half __hip_hc_ir_hsqrt_half(__half) __asm("llvm.sqrt.f16");
 extern "C" __half __hip_hc_ir_htrunc_half(__half) __asm("llvm.trunc.f16");

-extern "C" int __hip_hc_ir_h2ceil_int(int);
-extern "C" int __hip_hc_ir_h2cos_int(int);
-extern "C" int __hip_hc_ir_h2exp2_int(int);
-extern "C" int __hip_hc_ir_h2floor_int(int);
-extern "C" int __hip_hc_ir_h2log2_int(int);
-extern "C" int __hip_hc_ir_h2rcp_int(int);
-extern "C" int __hip_hc_ir_h2rsqrt_int(int);
-extern "C" int __hip_hc_ir_h2sin_int(int);
-extern "C" int __hip_hc_ir_h2sqrt_int(int);
-extern "C" int __hip_hc_ir_h2trunc_int(int);
+extern "C" __half2 __hip_hc_ir_h2ceil_int(__half2);
+extern "C" __half2 __hip_hc_ir_h2cos_int(__half2);
+extern "C" __half2 __hip_hc_ir_h2exp2_int(__half2);
+extern "C" __half2 __hip_hc_ir_h2floor_int(__half2);
+extern "C" __half2 __hip_hc_ir_h2log2_int(__half2);
+extern "C" __half2 __hip_hc_ir_h2rcp_int(__half2);
+extern "C" __half2 __hip_hc_ir_h2rsqrt_int(__half2);
+extern "C" __half2 __hip_hc_ir_h2sin_int(__half2);
+extern "C" __half2 __hip_hc_ir_h2sqrt_int(__half2);
+extern "C" __half2 __hip_hc_ir_h2trunc_int(__half2);

 /*
  Half2 Arithmetic Functions
@@ -248,63 +239,63 @@ extern "C" int __hip_hc_ir_h2trunc_int(int);

 __device__ static inline __half2 __hadd2(__half2 a, __half2 b) {
  __half2 c;
-  c.q = __hip_hc_ir_hadd2_int(a.q, b.q);
+  c.xy = __hip_hc_ir_hadd2_int(a.xy, b.xy);
  return c;
 }

 __device__ static inline __half2 __hadd2_sat(__half2 a, __half2 b) {
  __half2 c;
-  c.q = __hip_hc_ir_hadd2_int(a.q, b.q);
+  c.xy = __hip_hc_ir_hadd2_int(a.xy, b.xy);
  return c;
 }

 __device__ static inline __half2 __hfma2(__half2 a, __half2 b, __half2 c) {
  __half2 d;
-  d.q = __hip_hc_ir_hfma2_int(a.q, b.q, c.q);
+  d.xy = __hip_hc_ir_hfma2_int(a.xy, b.xy, c.xy);
  return d;
 }

 __device__ static inline __half2 __hfma2_sat(__half2 a, __half2 b, __half2 c) {
  __half2 d;
-  d.q = __hip_hc_ir_hfma2_int(a.q, b.q, c.q);
+  d.xy = __hip_hc_ir_hfma2_int(a.xy, b.xy, c.xy);
  return d;
 }

 __device__ static inline __half2 __hmul2(__half2 a, __half2 b) {
  __half2 c;
-  c.q = __hip_hc_ir_hmul2_int(a.q, b.q);
+  c.xy = __hip_hc_ir_hmul2_int(a.xy, b.xy);
  return c;
 }

 __device__ static inline __half2 __hmul2_sat(__half2 a, __half2 b) {
  __half2 c;
-  c.q = __hip_hc_ir_hmul2_int(a.q, b.q);
+  c.xy = __hip_hc_ir_hmul2_int(a.xy, b.xy);
  return c;
 }

 __device__ static inline __half2 __hsub2(__half2 a, __half2 b) {
  __half2 c;
-  c.q = __hip_hc_ir_hsub2_int(a.q, b.q);
+  c.xy = __hip_hc_ir_hsub2_int(a.xy, b.xy);
  return c;
 }

 __device__ static inline __half2 __hneg2(__half2 a) {
  __half2 c;
-  c.p[0] = - a.p[0];
-  c.p[1] = - a.p[1];
+  c.x = - a.x;
+  c.y = - a.y;
  return c;
 }

 __device__ static inline __half2 __hsub2_sat(__half2 a, __half2 b) {
  __half2 c;
-  c.q = __hip_hc_ir_hsub2_int(a.q, b.q);
+  c.xy = __hip_hc_ir_hsub2_int(a.xy, b.xy);
  return c;
 }

 __device__ static inline __half2 h2div(__half2 a, __half2 b) {
  __half2 c;
-  c.p[0] = a.p[0] / b.p[0];
-  c.p[1] = a.p[1] / b.p[1];
+  c.x = a.x / b.x;
+  c.y = a.y / b.y;
  return c;
 }

@@ -375,112 +366,94 @@ Half2 Math Operations

 __device__ static inline __half2 h2ceil(const __half2 h) {
  __half2 a;
-  a.q = __hip_hc_ir_h2ceil_int(h.q);
+  a.xy = __hip_hc_ir_h2ceil_int(h.xy);
  return a;
 }

 __device__ static inline __half2 h2cos(const __half2 h) {
  __half2 a;
-  a.q = __hip_hc_ir_h2cos_int(h.q);
+  a.xy = __hip_hc_ir_h2cos_int(h.xy);
  return a;
 }

 __device__ static inline __half2 h2exp(const __half2 h) {
  __half2 factor;
-  factor.p[0] = 1.442694;
-  factor.p[1] = 1.442694;
-  factor.q = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.q, factor.q));
+  factor.x = 1.442694;
+  factor.y = 1.442694;
+  factor.xy = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.xy, factor.xy));
  return factor;
 }

 __device__ static inline __half2 h2exp10(const __half2 h) {
  __half2 factor;
-  factor.p[0] = 3.3219281;
-  factor.p[1] = 3.3219281;
-  factor.q = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.q, factor.q));
+  factor.x = 3.3219281;
+  factor.y = 3.3219281;
+  factor.xy = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.xy, factor.xy));
  return factor;
 }

 __device__ static inline __half2 h2exp2(const __half2 h) {
  __half2 a;
-  a.q = __hip_hc_ir_h2exp2_int(h.q);
+  a.xy = __hip_hc_ir_h2exp2_int(h.xy);
  return a;
 }

 __device__ static inline __half2 h2floor(const __half2 h) {
  __half2 a;
-  a.q = __hip_hc_ir_h2floor_int(h.q);
+  a.xy = __hip_hc_ir_h2floor_int(h.xy);
  return a;
 }

 __device__ static inline __half2 h2log(const __half2 h) {
  __half2 factor;
-  factor.p[0] = 0.693147;
-  factor.p[1] = 0.693147;
-  factor. q = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.q), factor.q);
+  factor.x = 0.693147;
+  factor.y = 0.693147;
+  factor.xy = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.xy), factor.xy);
  return factor;
 }

 __device__ static inline __half2 h2log10(const __half2 h) {
  __half2 factor;
-  factor.p[0] = 0.301029;
-  factor.p[1] = 0.301029;
-  factor.q = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.q),  factor.q);
+  factor.x = 0.301029;
+  factor.y = 0.301029;
+  factor.xy = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.xy),  factor.xy);
  return factor;
 }
 __device__ static inline __half2 h2log2(const __half2 h) {
  __half2 a;
-  a.q = __hip_hc_ir_h2log2_int(h.q);
+  a.xy = __hip_hc_ir_h2log2_int(h.xy);
  return a;
 }

 __device__ static inline __half2 h2rcp(const __half2 h) {
  __half2 a;
-  a.q = __hip_hc_ir_h2rcp_int(h.q);
+  a.xy = __hip_hc_ir_h2rcp_int(h.xy);
  return a;
 }

 __device__ static inline __half2 h2rsqrt(const __half2 h) {
  __half2 a;
-  a.q = __hip_hc_ir_h2rsqrt_int(h.q);
+  a.xy = __hip_hc_ir_h2rsqrt_int(h.xy);
  return a;
 }

 __device__ static inline __half2 h2sin(const __half2 h) {
  __half2 a;
-  a.q = __hip_hc_ir_h2sin_int(h.q);
+  a.xy = __hip_hc_ir_h2sin_int(h.xy);
  return a;
 }

 __device__ static inline __half2 h2sqrt(const __half2 h) {
  __half2 a;
-  a.q = __hip_hc_ir_h2sqrt_int(h.q);
+  a.xy = __hip_hc_ir_h2sqrt_int(h.xy);
  return a;
 }

 __device__ static inline __half2 h2trunc(const __half2 h) {
  __half2 a;
-  a.q = __hip_hc_ir_h2trunc_int(h.q);
+  a.xy = __hip_hc_ir_h2trunc_int(h.xy);
  return a;
 }

-#endif
-
-#if __clang_major__ == 3
-
-typedef struct {
-  unsigned x: 16;
-} __half;
-
-typedef struct __attribute__((aligned(4))){
-  union {
-    __half p[2];
-    unsigned int q;
-  };
-} __half2;
-
-
-#endif
-

 #endif
@@ -41,6 +41,8 @@ THE SOFTWARE.
 #include <stddef.h>
 #endif//__cplusplus

+#if __HCC__
+
 // Define NVCC_COMPAT for CUDA compatibility
 #define NVCC_COMPAT
 #define CUDA_SUCCESS hipSuccess
@@ -147,8 +149,15 @@ extern int HIP_TRACE_API;
 #endif /* Device feature flags */


-//TODO-HCC  this is currently ignored by HCC target of HIP
-#define __launch_bounds__(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)
+#define launch_bounds_impl0(requiredMaxThreadsPerBlock)\
+    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
+#define launch_bounds_impl1(\
+    requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)\
+    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),\
+        amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
+#define select_impl_(_1, _2, impl_, ...) impl_
+#define __launch_bounds__(...) select_impl_(\
+    __VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)

 // Detect if we are compiling C++ mode or C mode
 #if defined(__cplusplus)
@@ -481,6 +490,6 @@ do {\
 */


-
+#endif

 #endif//HIP_HCC_DETAIL_RUNTIME_H
@@ -106,21 +106,27 @@ enum hipLimit_t
 #define hipEventBlockingSync        0x1  ///< Waiting will yield CPU.  Power-friendly and usage-friendly but may increase latency.
 #define hipEventDisableTiming       0x2  ///< Disable event's capability to record timing information.  May improve performance.
 #define hipEventInterprocess        0x4  ///< Event can support IPC.  @warning - not supported in HIP.
+#define hipEventReleaseToDevice     0x40000000  /// < Use a device-scope release when recording this event.  This flag is useful to obtain more precise timings of commands between events.  The flag is a no-op on CUDA platforms.
+#define hipEventReleaseToSystem     0x80000000  /// < Use a system-scope release that when recording this event.  This flag is useful to make non-coherent host memory visible to the host.  The flag is a no-op on CUDA platforms.


 //! Flags that can be used with hipHostMalloc
 #define hipHostMallocDefault        0x0
-#define hipHostMallocPortable       0x1
-#define hipHostMallocMapped         0x2
+#define hipHostMallocPortable       0x1  ///< Memory is considered allocated by all contexts.
+#define hipHostMallocMapped         0x2  ///< Map the allocation into the address space for the current device.  The device pointer can be obtained with #hipHostGetDevicePointer.
 #define hipHostMallocWriteCombined  0x4
+#define hipHostMallocCoherent       0x40000000 ///< Allocate coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation.
+#define hipHostMallocNonCoherent    0x80000000 ///< Allocate non-coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation.
+

 //! Flags that can be used with hipHostRegister
 #define hipHostRegisterDefault      0x0  ///< Memory is Mapped and Portable
-#define hipHostRegisterPortable     0x1  ///< Memory is considered registered by all contexts.  HIP only supports one context so this is always assumed true.
+#define hipHostRegisterPortable     0x1  ///< Memory is considered registered by all contexts.
 #define hipHostRegisterMapped       0x2  ///< Map the allocation into the address space for the current device.  The device pointer can be obtained with #hipHostGetDevicePointer.
 #define hipHostRegisterIoMemory     0x4  ///< Not supported.


+
 #define hipDeviceScheduleAuto       0x0  ///< Automatically select between Spin and Yield
 #define hipDeviceScheduleSpin       0x1  ///< Dedicate a CPU core to spin-wait.  Provides lowest latency, but burns a CPU core and may consume more power.
 #define hipDeviceScheduleYield      0x2  ///< Yield the CPU to the operating system when waiting.  May increase latency, but lowers power and is friendlier to other threads in the system.
@@ -131,6 +137,33 @@ enum hipLimit_t
 #define hipDeviceLmemResizeToMax    0x16


+/*
+* @brief hipJitOption
+* @enum
+* @ingroup Enumerations
+*/
+typedef enum hipJitOption {
+  hipJitOptionMaxRegisters = 0,
+  hipJitOptionThreadsPerBlock,
+  hipJitOptionWallTime,
+  hipJitOptionInfoLogBuffer,
+  hipJitOptionInfoLogBufferSizeBytes,
+  hipJitOptionErrorLogBuffer,
+  hipJitOptionErrorLogBufferSizeBytes,
+  hipJitOptionOptimizationLevel,
+  hipJitOptionTargetFromContext,
+  hipJitOptionTarget,
+  hipJitOptionFallbackStrategy,
+  hipJitOptionGenerateDebugInfo,
+  hipJitOptionLogVerbose,
+  hipJitOptionGenerateLineInfo,
+  hipJitOptionCacheMode,
+  hipJitOptionSm3xOpt,
+  hipJitOptionFastCompile,
+  hipJitOptionNumOptions
+} hipJitOption;
+
+
 /**
 * @warning On AMD devices and recent Nvidia devices, these hints and controls are ignored.
 */
@@ -385,7 +418,7 @@ hipError_t hipDeviceGetLimit(size_t *pValue, enum hipLimit_t limit);
 * Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache.  This hint is ignored on those architectures.
 *
 */
-hipError_t hipFuncSetCacheConfig ( hipFuncCache_t config );
+hipError_t hipFuncSetCacheConfig (const void* func, hipFuncCache_t config );

 /**
 * @brief Returns bank width of shared memory for current device
@@ -601,9 +634,12 @@ hipError_t hipStreamQuery(hipStream_t stream);
 *
 * @return #hipSuccess, #hipErrorInvalidResourceHandle
 *
- * If the null stream is specified, this command blocks until all
+ * This command is host-synchronous : the host will block until the specified stream is empty.
+ *
+ * This command follows standard null-stream semantics.  Specifically, specifying the null stream will cause the 
+ * command to wait for other streams on the same device to complete all pending operations.
+ *
 * This command honors the hipDeviceLaunchBlocking flag, which controls whether the wait is active or blocking.
- * This command is host-synchronous : the host will block until the stream is empty.
 *
 * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamWaitEvent, hipStreamDestroy
 *
@@ -622,10 +658,12 @@ hipError_t hipStreamSynchronize(hipStream_t stream);
 *
 * This function inserts a wait operation into the specified stream.
 * All future work submitted to @p stream will wait until @p event reports completion before beginning execution.
- * This function is host-asynchronous and the function may return before the wait has completed.
+ *
+ * This function only waits for commands in the current stream to complete.  Notably,, this function does 
+ * not impliciy wait for commands in the default stream to complete, even if the specified stream is 
+ * created with hipStreamNonBlocking = 0.  
 *
 * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamSynchronize, hipStreamDestroy
- *
 */
 hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags);

@@ -730,10 +768,10 @@ hipError_t hipEventCreate(hipEvent_t* event);
 * the specified stream, after all previous
 * commands in that stream have completed executing.
 *
- * If hipEventRecord() has been previously called aon event, then this call will overwrite any existing state in event.
+ * If hipEventRecord() has been previously called on this event, then this call will overwrite any existing state in event.
 *
 * If this function is called on a an event that is currently being recorded, results are undefined - either
- * outstanding recording may save state into the event, and the order is not guaranteed.  This shoul be avoided.
+ * outstanding recording may save state into the event, and the order is not guaranteed.  
 *
 * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventSynchronize, hipEventDestroy, hipEventElapsedTime
 *
@@ -1308,6 +1346,27 @@ hipError_t hipFreeArray(hipArray* array);
 */
 hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind);

+/**
+ *  @brief Copies data between host and device.
+ *
+ *  @param[in]   dst    Destination memory address
+ *  @param[in]   dpitch Pitch of destination memory
+ *  @param[in]   src    Source memory address
+ *  @param[in]   spitch Pitch of source memory
+ *  @param[in]   width  Width of matrix transfer (columns in bytes)
+ *  @param[in]   height Height of matrix transfer (rows)
+ *  @param[in]   kind   Type of transfer
+ *  @param[in]   stream Stream to use
+ *  @return      #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ *  @see hipMemcpy, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync
+ */
+#if __cplusplus
+hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream = 0);
+#else
+hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream);
+#endif
+
 /**
 *  @brief Copies data between host and device.
 *
@@ -1890,7 +1949,7 @@ hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, con
 * @brief returns device memory pointer and size of the kernel present in the module with symbol @p name
 *
 * @param [out] dptr
- * @param [out[ bytes
+ * @param [out] bytes
 * @param [in] hmod
 * @param [in] name
 *
@@ -1898,7 +1957,6 @@ hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, con
 */
 hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name);

-
 /**
 * @brief builds module from code object which resides in host memory. Image is pointer to that location.
 *
@@ -1909,11 +1967,23 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t h
 */
 hipError_t hipModuleLoadData(hipModule_t *module, const void *image);

+/**
+* @brief builds module from code object which resides in host memory. Image is pointer to that location. Options are not used. hipModuleLoadData is called.
+*
+* @param [in] image
+* @param [out] module
+* @param [in] number of options
+* @param [in] options for JIT
+* @param [in] option values for JIT
+*
+* @returns hipSuccess, hipErrorNotInitialized, hipErrorOutOfMemory, hipErrorNotInitialized
+*/
+hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues);

 /**
 * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra
 *
- * @param [in[ f	 Kernel to launch.
+ * @param [in] f         Kernel to launch.
 * @param [in] gridDimX  X grid dimension specified as multiple of blockDimX.
 * @param [in] gridDimY  Y grid dimension specified as multiple of blockDimY.
 * @param [in] gridDimZ  Z grid dimension specified as multiple of blockDimZ.
@@ -1921,7 +1991,7 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
 * @param [in] blockDimY Y grid dimension specified in work-items
 * @param [in] blockDimZ Z grid dimension specified in work-items
 * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel.  The kernel can access this with HIP_DYNAMIC_SHARED.
- * @param [in] stream Stream where the kernel should be dispatched.  May be 0, in which case th default stream is used with associated synchronization rules.
+ * @param [in] stream    Stream where the kernel should be dispatched.  May be 0, in which case th default stream is used with associated synchronization rules.
 * @param [in] kernelParams
 * @param [in] extra     Pointer to kernel arguments.   These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel.
 *
@@ -37,38 +37,41 @@ THE SOFTWARE.
 #define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \
 __device__ __host__ type() {} \
 __device__ __host__ type(type& val) : x(val.x) { } \
-__device__ __host__ type(const type& val) : x(val.x) { }
+__device__ __host__ type(const type& val) : x(val.x) { } \
+__device__ __host__ ~type() {} 

 #define MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(type) \
 __device__ __host__ type() {} \
 __device__ __host__ type(type& val) : x(val.x), y(val.y) { } \
-__device__ __host__ type(const type& val) : x(val.x), y(val.y) { }
+__device__ __host__ type(const type& val) : x(val.x), y(val.y) { } \
+__device__ __host__ ~type() {}

 #define MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(type) \
 __device__ __host__ type() {} \
 __device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z) { } \
-__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { }
+__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { } \
+__device__ __host__ ~type() {} 

 #define MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(type) \
 __device__ __host__ type() {} \
 __device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \
-__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { }
-
+__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \
+__device__ __host__ ~type() {}

 #define MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(type, type1) \
 __device__ __host__ type(type1 val) : x(val) {} \

 #define MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(type, type1) \
 __device__ __host__ type(type1 val) : x(val), y(val) {} \
-__device__ __host__ type(type1 val1, type1 val2) : x(val1), y(val2) {}
+__device__ __host__ type(type1 val1, type1 val2) : x(val1), y(val2) {} \

 #define MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(type, type1) \
 __device__ __host__ type(type1 val) : x(val), y(val), z(val) {} \
-__device__ __host__ type(type1 val1, type1 val2, type1 val3) : x(val1), y(val2), z(val3) {}
+__device__ __host__ type(type1 val1, type1 val2, type1 val3) : x(val1), y(val2), z(val3) {} \

 #define MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(type, type1) \
 __device__ __host__ type(type1 val) : x(val), y(val), z(val), w(val) {} \
-__device__ __host__ type(type1 val1, type1 val2, type1 val3, type1 val4) : x(val1), y(val2), z(val3), w(val4) {}
+__device__ __host__ type(type1 val1, type1 val2, type1 val3, type1 val4) : x(val1), y(val2), z(val3), w(val4) {} \

 struct uchar1 {
  #ifdef __cplusplus
@@ -4115,4 +4118,5 @@ DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed long long)

 #endif

+
 #endif
@@ -48,7 +48,7 @@ THE SOFTWARE.
 #define __global__  __attribute__((hc_grid_launch)) __attribute__((used))
 #else
 //#warning "GGL global define reached"
-#define __global__ __attribute__((hc, weak))
+#define __global__ __attribute__((annotate("hip__global__"), hc, used))
 #endif //GENERIC_GRID_LAUNCH

 #define __noinline__      __attribute__((noinline))
@@ -250,6 +250,7 @@ typedef enum hipDeviceAttribute_t {
    hipDeviceAttributeIsMultiGpuBoard,                      ///< Multiple GPU devices.
 } hipDeviceAttribute_t;

+
 /**
 *     @}
 */
@@ -54,24 +54,49 @@ hipMemcpyHostToHost
 #define hipFilterModePoint cudaFilterModePoint

 //! Flags that can be used with hipEventCreateWithFlags:
-#define hipEventDefault             cudaEventDefault
-#define hipEventBlockingSync        cudaEventBlockingSync
-#define hipEventDisableTiming       cudaEventDisableTiming
-#define hipEventInterprocess        cudaEventInterprocess
+#define hipEventDefault              cudaEventDefault
+#define hipEventBlockingSync         cudaEventBlockingSync
+#define hipEventDisableTiming        cudaEventDisableTiming
+#define hipEventInterprocess         cudaEventInterprocess
+#define hipEventReleaseToDevice      0  /* no-op on CUDA platform */
+#define hipEventReleaseToSystem      0  /* no-op on CUDA platform */

-#define hipHostMallocDefault cudaHostAllocDefault
-#define hipHostMallocPortable cudaHostAllocPortable
-#define hipHostMallocMapped cudaHostAllocMapped
+
+#define hipHostMallocDefault       cudaHostAllocDefault
+#define hipHostMallocPortable      cudaHostAllocPortable
+#define hipHostMallocMapped        cudaHostAllocMapped
 #define hipHostMallocWriteCombined cudaHostAllocWriteCombined
+#define hipHostMallocCoherent      0x0 
+#define hipHostMallocNonCoherent   0x0

 #define hipHostRegisterPortable cudaHostRegisterPortable
-#define hipHostRegisterMapped cudaHostRegisterMapped
+#define hipHostRegisterMapped   cudaHostRegisterMapped

 #define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER
-#define HIP_LAUNCH_PARAM_BUFFER_SIZE     CU_LAUNCH_PARAM_BUFFER_SIZE
+#define HIP_LAUNCH_PARAM_BUFFER_SIZE    CU_LAUNCH_PARAM_BUFFER_SIZE
 #define HIP_LAUNCH_PARAM_END            CU_LAUNCH_PARAM_END
 #define hipLimitMallocHeapSize          cudaLimitMallocHeapSize
-#define hipIpcMemLazyEnablePeerAccess          cudaIpcMemLazyEnablePeerAccess
+#define hipIpcMemLazyEnablePeerAccess   cudaIpcMemLazyEnablePeerAccess
+
+// enum CUjit_option redefines
+#define hipJitOptionMaxRegisters            CU_JIT_MAX_REGISTERS
+#define hipJitOptionThreadsPerBlock         CU_JIT_THREADS_PER_BLOCK
+#define hipJitOptionWallTime                CU_JIT_WALL_TIME
+#define hipJitOptionInfoLogBuffer           CU_JIT_INFO_LOG_BUFFER
+#define hipJitOptionInfoLogBufferSizeBytes  CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
+#define hipJitOptionErrorLogBuffer          CU_JIT_ERROR_LOG_BUFFER
+#define hipJitOptionErrorLogBufferSizeBytes CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
+#define hipJitOptionOptimizationLevel       CU_JIT_OPTIMIZATION_LEVEL
+#define hipJitOptionTargetFromContext       CU_JIT_TARGET_FROM_CUCONTEXT
+#define hipJitOptionTarget                  CU_JIT_TARGET
+#define hipJitOptionFallbackStrategy        CU_JIT_FALLBACK_STRATEGY
+#define hipJitOptionGenerateDebugInfo       CU_JIT_GENERATE_DEBUG_INFO
+#define hipJitOptionLogVerbose              CU_JIT_LOG_VERBOSE
+#define hipJitOptionGenerateLineInfo        CU_JIT_GENERATE_LINE_INFO
+#define hipJitOptionCacheMode               CU_JIT_CACHE_MODE
+#define hipJitOptionSm3xOpt                 CU_JIT_NEW_SM3X_OPT
+#define hipJitOptionFastCompile             CU_JIT_FAST_COMPILE
+#define hipJitOptionNumOptions              CU_JIT_NUM_OPTIONS

 typedef cudaEvent_t hipEvent_t;
 typedef cudaStream_t hipStream_t;
@@ -82,6 +107,7 @@ typedef cudaFuncCache hipFuncCache_t;
 typedef CUcontext hipCtx_t;
 typedef CUsharedconfig hipSharedMemConfig;
 typedef CUfunc_cache hipFuncCache;
+typedef CUjit_option hipJitOption;
 typedef CUdevice hipDevice_t;
 typedef CUmodule hipModule_t;
 typedef CUfunction hipFunction_t;
@@ -202,6 +228,10 @@ inline static hipError_t hipMalloc(void** ptr, size_t size) {
    return hipCUDAErrorTohipError(cudaMalloc(ptr, size));
 }

+inline static hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height) {
+    return hipCUDAErrorTohipError(cudaMallocPitch(ptr, pitch, width, height));
+}
+
 inline static hipError_t hipFree(void* ptr) {
    return hipCUDAErrorTohipError(cudaFree(ptr));
 }
@@ -345,7 +375,11 @@ inline static hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolN
 }

 inline static hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind){
-  return hipCUDAErrorTohipError(cudaMemcpy2D(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind)));
+    return hipCUDAErrorTohipError(cudaMemcpy2D(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind)));
+}
+
+inline static hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) {
+    return hipCUDAErrorTohipError(cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind),stream));
 }

 inline static hipError_t hipMemcpy2DToArray(hipArray *dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind){
@@ -884,6 +918,11 @@ inline static hipError_t hipModuleLoadData(hipModule_t *module, const void *imag
    return hipCUResultTohipError(cuModuleLoadData(module, image));
 }

+inline static hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues)
+{
+  return hipCUResultTohipError(cuModuleLoadDataEx(module, image, numOptions, options, optionValues));
+}
+
 inline static hipError_t hipModuleLaunchKernel(hipFunction_t f,
      unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
      unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
@@ -897,6 +936,10 @@ inline static hipError_t hipModuleLaunchKernel(hipFunction_t f,
 }


+inline static hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t cacheConfig)
+{
+    return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig));
+}

 #ifdef __cplusplus
 }
@@ -0,0 +1,35 @@
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+	HIP_PATH=../../..
+endif
+
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+TARGET=hcc
+
+SOURCES = inline_asm.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+EXECUTABLE=./inline_asm
+
+.PHONY: test
+
+
+all: $(EXECUTABLE) test
+
+CXXFLAGS =-g
+CXX=$(HIPCC)
+
+
+$(EXECUTABLE): $(OBJECTS)
+	$(HIPCC) $(OBJECTS) -o $@
+
+
+test: $(EXECUTABLE)
+	$(EXECUTABLE)
+
+
+clean:
+	rm -f $(EXECUTABLE)
+	rm -f $(OBJECTS)
+	rm -f $(HIP_PATH)/src/*.o
@@ -0,0 +1,47 @@
+## inline asm  ###
+
+This tutorial is about how to use inline GCN asm in kernel. In this tutorial, we'll explain how to by using the simple Matrix Transpose.
+
+## Introduction:
+
+If you want to take advantage of the extra performance benefits of writing in assembly as well as take advantage of special GPU hardware features that were only available through assemby, then this tutorial is for you. In this tutorial we'll be explaining how to start writing inline asm in kernel.
+
+For more insight Please read the following blogs by Ben Sander 
+[The Art of AMDGCN Assembly: How to Bend the Machine to Your Will](gpuopen.com/amdgcn-assembly)
+[AMD GCN Assembly: Cross-Lane Operations](http://gpuopen.com/amd-gcn-assembly-cross-lane-operations/)
+
+For more information:
+[AMD GCN3 ISA Architecture Manual](http://gpuopen.com/compute-product/amd-gcn3-isa-architecture-manual/)
+[User Guide for AMDGPU Back-end](llvm.org/docs/AMDGPUUsage.html)
+
+## Requirement:
+For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) 
+
+## prerequiste knowledge:
+
+Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming.
+
+## Simple Matrix Transpose 
+
+We will be using the Simple Matrix Transpose application from the our very first tutorial.
+
+## asm() Assembler statement
+
+We insert the GCN isa into the kernel using asm() Assembler statement. In the same sourcecode, we used for MatrixTranspose. We'll add the following:
+
+`  asm volatile ("v_mov_b32_e32 %0, %1" : "=v" (out[x*width + y]) : "v" (in[y*width + x]));                    `
+
+## How to build and run:
+Use the make command and execute it using ./exe
+Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia.
+
+
+## More Info:
+- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md)
+- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md)
+- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP)
+- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md)
+- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL)
+- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md)
+- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md)
+- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md)
@@ -0,0 +1,174 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+#define WIDTH     1024
+
+#define NUM       (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X  4
+#define THREADS_PER_BLOCK_Y  4
+#define THREADS_PER_BLOCK_Z  1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width)
+{
+ 
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+
+    asm volatile ("v_mov_b32_e32 %0, %1" : "=v" (out[x*width + y]) : "v" (in[y*width + x]));
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+    float * output,
+    float * input,
+    const unsigned int width)
+{
+    for(unsigned int j=0; j < width; j++)
+    {
+        for(unsigned int i=0; i < width; i++)
+        {
+            output[i*width + j] = input[j*width + i];
+        }
+    }
+}
+
+int main() {
+
+  float* Matrix;
+  float* TransposeMatrix;
+  float* cpuTransposeMatrix;
+
+  float* gpuMatrix;
+  float* gpuTransposeMatrix;
+
+  hipDeviceProp_t devProp;
+  hipGetDeviceProperties(&devProp, 0);
+
+  std::cout << "Device name " << devProp.name << std::endl;
+
+  hipEvent_t start, stop;
+  hipEventCreate(&start);
+  hipEventCreate(&stop);
+  float eventMs = 1.0f;
+
+  int i;
+  int errors;
+
+  Matrix = (float*)malloc(NUM * sizeof(float));
+  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    Matrix[i] = (float)i*10.0f;
+  }
+
+  // allocate the memory on the device side
+  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+  // Record the start event
+  hipEventRecord(start, NULL);
+
+  // Memory transfer from host to device
+  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+  // Record the stop event
+  hipEventRecord(stop, NULL);
+  hipEventSynchronize(stop);
+
+  hipEventElapsedTime(&eventMs, start, stop);
+
+  printf ("hipMemcpyHostToDevice time taken  = %6.3fms\n", eventMs);
+
+  // Record the start event
+  hipEventRecord(start, NULL);
+
+  // Lauching kernel from host
+  hipLaunchKernel(matrixTranspose,
+                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
+                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                  0, 0,
+                  gpuTransposeMatrix , gpuMatrix, WIDTH);
+
+  // Record the stop event
+  hipEventRecord(stop, NULL);
+  hipEventSynchronize(stop);
+
+  hipEventElapsedTime(&eventMs, start, stop);
+
+  printf ("kernel Execution time             = %6.3fms\n", eventMs);
+
+  // Record the start event
+  hipEventRecord(start, NULL);
+
+  // Memory transfer from device to host
+  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+  // Record the stop event
+  hipEventRecord(stop, NULL);
+  hipEventSynchronize(stop);
+
+  hipEventElapsedTime(&eventMs, start, stop);
+
+  printf ("hipMemcpyDeviceToHost time taken  = %6.3fms\n", eventMs);
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+  // verify the results
+  errors = 0;
+  double eps = 1.0E-6;
+  for (i = 0; i < NUM; i++) {
+    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("gpu%f cpu %f \n",TransposeMatrix[i],cpuTransposeMatrix[i]);
+      errors++;
+    }
+  }
+  if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+  } else {
+    printf ("PASSED!\n");
+  }
+
+  //free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);
+
+  //free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+
+  return errors;
+}
@@ -55,13 +55,9 @@ void checkPeer2PeerSupport()
 {
    int gpuCount;
    int canAccessPeer;
-    int p2pCapableDeviceCount=0;

    HIPCHECK(hipGetDeviceCount(&gpuCount));

-    if (gpuCount < 2)
-        printf("Peer2Peer application requires atleast 2 gpu devices");
-
    for (int currentGpu=0; currentGpu<gpuCount; currentGpu++)
    {
        HIPCHECK(hipSetDevice(currentGpu));
@@ -161,6 +157,12 @@ int main(){

    HIPCHECK(hipGetDeviceCount(&gpuCount));

+    if (gpuCount < 2)
+    {
+        printf("Peer2Peer application requires atleast 2 gpu devices");
+        return 0;
+    }
+
    currentGpu = 0;
    peerGpu = (currentGpu + 1);

@@ -0,0 +1,39 @@
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+	HIP_PATH=../../..
+endif
+
+ifeq (gfx701, $(findstring gfx701,$(HCC_AMDGPU_TARGET)))
+	$(error gfx701 is not a supported device for this sample)
+endif
+
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+TARGET=hcc
+
+SOURCES = unroll.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+EXECUTABLE=./unroll
+
+.PHONY: test
+
+
+all: $(EXECUTABLE) test
+
+CXXFLAGS =-g
+CXX=$(HIPCC)
+
+
+$(EXECUTABLE): $(OBJECTS)
+	$(HIPCC) $(OBJECTS) -o $@
+
+
+test: $(EXECUTABLE)
+	$(EXECUTABLE)
+
+
+clean:
+	rm -f $(EXECUTABLE)
+	rm -f $(OBJECTS)
+	rm -f $(HIP_PATH)/src/*.o
@@ -0,0 +1,48 @@
+## Using Pragma unroll ###
+
+In this tutorial, we'll explain how to use #pragma unroll to improve the performance. 
+
+## Introduction:
+
+Loop unrolling optimization hints can be specified with #pragma unroll and #pragma nounroll. The pragma is placed immediately before a for loop.
+Specifying #pragma unroll without a parameter directs the loop unroller to attempt to fully unroll the loop if the trip count is known at compile time and attempt to partially unroll the loop if the trip count is not known at compile time.
+
+## Requirement:
+For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) 
+
+## prerequiste knowledge:
+
+Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming.
+
+## Simple Matrix Transpose 
+
+For this tutorial we will be using MatrixTranspose with shfl operation i.e., our 4_shfl tutorial since it is the only examples where we used loops inside the kernel.
+
+In this tutorial, we'll use `#pragma unroll`. In the same sourcecode, we used for MatrixTranspose. We'll add it just before the for loop as following:
+
+`#pragma unroll                                                        `
+`    for(int i=0;i<width;i++)                                          `
+`    {                                                                 `
+`        for(int j=0;j<width;j++)                                      `
+`            out[i*width + j] = __shfl(val,j*width + i);               `
+`    }                                                                 `
+
+Specifying the optional parameter, #pragma unroll value, directs the unroller to unroll the loop value times. Be careful while using it.
+Specifying #pragma nounroll indicates that the loop should not be unroll. #pragma unroll 1 will show the same behaviour.
+
+## How to build and run:
+Use the make command and execute it using ./exe
+Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia.
+
+## requirement for nvidia
+please make sure you have a 3.0 or higher compute capable device in order to use warp shfl operations and add `-gencode arch=compute=30, code=sm_30` nvcc flag in the Makefile while using this application.
+
+## More Info:
+- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md)
+- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md)
+- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP)
+- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md)
+- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL)
+- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md)
+- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md)
+- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md)
@@ -0,0 +1,141 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+
+#define WIDTH     4
+
+#define NUM       (WIDTH*WIDTH)
+
+#define THREADS_PER_BLOCK_X  4
+#define THREADS_PER_BLOCK_Y  4
+#define THREADS_PER_BLOCK_Z  1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width)
+{
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    float val = in[x];
+
+#pragma unroll
+    for(int i=0;i<width;i++)
+    {
+        for(int j=0;j<width;j++)
+            out[i*width + j] = __shfl(val,j*width + i);
+    }
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+    float * output,
+    float * input,
+    const unsigned int width)
+{
+    for(unsigned int j=0; j < width; j++)
+    {
+        for(unsigned int i=0; i < width; i++)
+        {
+            output[i*width + j] = input[j*width + i];
+        }
+    }
+}
+
+int main() {
+
+  float* Matrix;
+  float* TransposeMatrix;
+  float* cpuTransposeMatrix;
+
+  float* gpuMatrix;
+  float* gpuTransposeMatrix;
+
+  hipDeviceProp_t devProp;
+  hipGetDeviceProperties(&devProp, 0);
+
+  std::cout << "Device name " << devProp.name << std::endl;
+
+  int i;
+  int errors;
+
+  Matrix = (float*)malloc(NUM * sizeof(float));
+  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    Matrix[i] = (float)i*10.0f;
+  }
+
+  // allocate the memory on the device side
+  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+  // Memory transfer from host to device
+  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+  // Lauching kernel from host
+  hipLaunchKernel(matrixTranspose,
+                  dim3(1),
+                  dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
+                  0, 0,
+                  gpuTransposeMatrix , gpuMatrix, WIDTH);
+
+  // Memory transfer from device to host
+  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+  // verify the results
+  errors = 0;
+  double eps = 1.0E-6;
+  for (i = 0; i < NUM; i++) {
+    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+      errors++;
+    }
+  }
+  if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+  } else {
+    printf ("PASSED!\n");
+  }
+
+  //free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);
+
+  //free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+
+  return errors;
+}
@@ -26,6 +26,7 @@ THE SOFTWARE.
 #include "device_util.h"
 #include "hip/hcc_detail/device_functions.h"
 #include "hip/hip_runtime.h"
+#include <atomic>

 //=================================================================================================
 /*
@@ -923,24 +924,45 @@ __device__  unsigned long long int atomicMax(unsigned long long int* address,
 }

 //atomicCAS()
+template<typename T>
+__device__ T atomicCAS_impl(T* address, T compare, T val)
+{
+  // the implementation assumes the atomic is lock-free and 
+  // has the same size as the non-atmoic equivalent type
+  static_assert(sizeof(T) == sizeof(std::atomic<T>)
+                , "size mismatch between atomic and non-atomic types");
+
+  union {
+    T*              address;
+    std::atomic<T>* atomic_address;
+  } u;
+  u.address = address;
+
+  T expected = compare;
+
+  // hcc should generate a system scope atomic CAS 
+  std::atomic_compare_exchange_weak_explicit(u.atomic_address
+                                            , &expected, val
+                                            , std::memory_order_acq_rel
+                                            , std::memory_order_relaxed);
+  return expected;
+}
+
 __device__  int atomicCAS(int* address, int compare, int val)
 {
-	hc::atomic_compare_exchange(address,&compare,val);
-	return *address;
+  return atomicCAS_impl(address, compare, val);
 }
 __device__  unsigned int atomicCAS(unsigned int* address,
                       unsigned int compare,
                       unsigned int val)
 {
-	hc::atomic_compare_exchange(address,&compare,val);
-	return *address;
+  return atomicCAS_impl(address, compare, val);
 }
 __device__  unsigned long long int atomicCAS(unsigned long long int* address,
                                 unsigned long long int compare,
                                 unsigned long long int val)
 {
-	hc::atomic_compare_exchange((uint64_t*)address,(uint64_t*)&compare,(uint64_t)val);
-	return *address;
+  return atomicCAS_impl(address, compare, val);
 }

 //atomicAnd()
@@ -1163,18 +1185,18 @@ __device__ double __hip_precise_dsqrt_rz(double x) {
  return hc::precise_math::sqrt(x);
 }

-#define LOG_BASE2_E_DIV_2 0.4426950408894701
-#define LOG_BASE2_5 2.321928094887362
+#define LOG_BASE2_E 1.4426950408889634
+#define LOG_BASE2_10 3.32192809488736
 #define ONE_DIV_LOG_BASE2_E 0.69314718056
 #define ONE_DIV_LOG_BASE2_10 0.30102999566

 // Fast Math Intrinsics
 __device__ float __hip_fast_exp10f(float x) {
-  return __hip_fast_exp2f(x*LOG_BASE2_E_DIV_2);
+  return __hip_fast_exp2f(x*LOG_BASE2_E);
 }

 __device__ float __hip_fast_expf(float x) {
-  return __hip_fast_expf(x*LOG_BASE2_5);
+  return __hip_fast_exp2f(x*LOG_BASE2_10);
 }

 __device__ float __hip_fast_frsqrt_rn(float x) {
@@ -1215,20 +1237,23 @@ __device__ float __hip_fast_tanf(float x) {
 }

 // Double Precision Math
+// FIXME - HCC doesn't have a fast_math version double FP sqrt
+// Another issue is that these intrinsics call for a specific rounding mode;
+// however, their implementation all map to the same sqrt builtin
 __device__ double __hip_fast_dsqrt_rd(double x) {
-  return hc::fast_math::sqrt(x);
+  return hc::precise_math::sqrt(x);
 }

 __device__ double __hip_fast_dsqrt_rn(double x) {
-  return hc::fast_math::sqrt(x);
+  return hc::precise_math::sqrt(x);
 }

 __device__ double __hip_fast_dsqrt_ru(double x) {
-  return hc::fast_math::sqrt(x);
+  return hc::precise_math::sqrt(x);
 }

 __device__ double __hip_fast_dsqrt_rz(double x) {
-  return hc::fast_math::sqrt(x);
+  return hc::precise_math::sqrt(x);
 }

 __device__ void  __threadfence_system(void){
@@ -52,9 +52,9 @@ namespace hip_impl
        int group_mem_bytes,
        hipStream_t stream)
    {
-        if ((HIP_TRACE_API & (1 << TRACE_CMD)) ||
+        if ((HIP_TRACE_API & (1 << TRACE_KCMD)) ||
            HIP_PROFILE_API ||
-            (COMPILE_HIP_DB && HIP_TRACE_API)) {
+            (COMPILE_HIP_DB && (HIP_TRACE_API & (1<<TRACE_ALL)))) {
            std::stringstream os;
            os  << tls_tidInfo.tid() << "." << tls_tidInfo.apiSeqNum()
                << " hipLaunchKernel '" << kernel_name << "'"
@@ -112,7 +112,7 @@ hipError_t hipDeviceGetLimit (size_t *pValue, hipLimit_t limit)
    }
 }

-hipError_t hipFuncSetCacheConfig (hipFuncCache_t cacheConfig)
+hipError_t hipFuncSetCacheConfig (const void* func, hipFuncCache_t cacheConfig)
 {
    HIP_INIT_API(cacheConfig);

@@ -298,7 +298,7 @@ hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, int device)
 hipError_t hipGetDeviceProperties(hipDeviceProp_t* props, int device)
 {
    HIP_INIT_API(props, device);
-    return ihipGetDeviceProperties(props, device);
+    return ihipLogStatus(ihipGetDeviceProperties(props, device));
 }

 hipError_t hipSetDeviceFlags( unsigned int flags)
@@ -42,25 +42,29 @@ ihipEvent_t::ihipEvent_t(unsigned flags)


 // Attach to an existing completion future:
-void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType)
+void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, 
+                                           hipStream_t stream, ihipEventType_t eventType)
 {
    _state  = hipEventStatusRecording;
    _marker = *cf;
    _type   = eventType;
+    _stream = stream;
 }



-void ihipEvent_t::setTimestamp()
+void ihipEvent_t::refereshEventStatus()
 {
-    if (_state == hipEventStatusRecorded) {
-        // already recorded, done:
-        return;
-    } else {
+    bool isReady0 = _marker.is_ready();
+    bool isReady1;
+    int val = 0;
+    if (_state == hipEventStatusRecording) {
        // TODO - use completion-future functions to obtain ticks and timestamps:
        hsa_signal_t *sig  = static_cast<hsa_signal_t*> (_marker.get_native_handle());
+        isReady1 = _marker.is_ready();
        if (sig) {
-            if (hsa_signal_load_acquire(*sig) == 0) {
+            val = hsa_signal_load_acquire(*sig);
+            if (val == 0) {

                if ((_type == hipEventTypeIndependent) || (_type == hipEventTypeStopCommand)) {
                    _timestamp =  _marker.get_end_tick();
@@ -71,10 +75,14 @@ void ihipEvent_t::setTimestamp()
                    _timestamp =  0;
                }

-                _state = hipEventStatusRecorded;
+                _state = hipEventStatusComplete;
            }
        }
    }
+
+    if (_state != hipEventStatusComplete) {
+        //printf (" not ready isReady0=%d val=%d isReady1=%d\n", isReady0, val, isReady1);
+    }
 }


@@ -83,11 +91,19 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags)
    hipError_t e = hipSuccess;

    // TODO-IPC - support hipEventInterprocess.
-    unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming;
-    if ((flags & ~supportedFlags) == 0) {
-        ihipEvent_t *eh = new ihipEvent_t(flags);
+    unsigned supportedFlags =   hipEventDefault 
+                              | hipEventBlockingSync 
+                              | hipEventDisableTiming 
+                              | hipEventReleaseToDevice 
+                              | hipEventReleaseToSystem 
+                              ;
+    const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem);

-        *event = eh;
+    const bool illegalFlags = (flags & ~supportedFlags) ||            // can't set any unsupported flags.
+                              (flags & releaseFlags) == releaseFlags; // can't set both release flags
+
+    if (!illegalFlags) {
+        *event = new ihipEvent_t(flags);
    } else {
        e = hipErrorInvalidValue;
    }
@@ -114,17 +130,20 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
    HIP_INIT_API(event, stream);

    if (event && event->_state != hipEventStatusUnitialized)   {
+        stream = ihipSyncAndResolveStream(stream);
+
        event->_stream = stream;

-        if (stream == NULL) {
-            // If stream == NULL, wait on all queues.
-            // TODO-HCC fix this - is this conservative or still uses device timestamps?
-            // TODO-HCC can we use barrier or event marker to implement better solution?
+        if (HIP_SYNC_NULL_STREAM && stream->isDefaultStream()) {
+
+            // TODO-HIP_SYNC_NULL_STREAM : can remove this code when HIP_SYNC_NULL_STREAM = 0
+            
+            // If default stream , then wait on all queues.
            ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
-            ctx->locked_syncDefaultStream(true);
+            ctx->locked_syncDefaultStream(true, true);

            event->_timestamp = hc::get_system_ticks();
-            event->_state = hipEventStatusRecorded;
+            event->_state = hipEventStatusComplete;
            return ihipLogStatus(hipSuccess);
        } else {
            event->_state  = hipEventStatusRecording;
@@ -164,13 +183,16 @@ hipError_t hipEventSynchronize(hipEvent_t event)
        } else if (event->_state == hipEventStatusCreated ) {
            // Created but not actually recorded on any device:
            return ihipLogStatus(hipSuccess);
-        } else if (event->_stream == NULL) {
+        } else if (HIP_SYNC_NULL_STREAM && (event->_stream->isDefaultStream() )) {
            auto *ctx = ihipGetTlsDefaultCtx();
-            ctx->locked_syncDefaultStream(true);
+            // TODO-HIP_SYNC_NULL_STREAM - can remove this code
+            ctx->locked_syncDefaultStream(true, true);
            return ihipLogStatus(hipSuccess);
        } else {
            event->_marker.wait((event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked : hc::hcWaitModeActive);

+            assert (event->_marker.is_ready());
+
            return ihipLogStatus(hipSuccess);
        }
    } else {
@@ -182,40 +204,50 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop)
 {
    HIP_INIT_API(ms, start, stop);

-    ihipEvent_t *start_eh = start;
-    ihipEvent_t *stop_eh = stop;
-
-    start->setTimestamp();
-    stop->setTimestamp();
-
    hipError_t status = hipSuccess;
+
    *ms = 0.0f;

-    if (start_eh && stop_eh) {
-        if ((start_eh->_state == hipEventStatusRecorded) && (stop_eh->_state == hipEventStatusRecorded)) {
-            // Common case, we have good information for both events.
+    if ((start == nullptr) ||
+        (start->_flags & hipEventDisableTiming) ||
+        (start->_state == hipEventStatusUnitialized) || (start->_state == hipEventStatusCreated) ||
+        (stop == nullptr) ||
+        (stop->_flags & hipEventDisableTiming) ||
+        ( stop->_state == hipEventStatusUnitialized) || ( stop->_state == hipEventStatusCreated)) {

-            int64_t tickDiff = (stop_eh->timestamp() - start_eh->timestamp());
+        // Both events must be at least recorded else return hipErrorInvalidResourceHandle

-            uint64_t freqHz;
-            hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz);
-            if (freqHz) {
-                *ms = ((double)(tickDiff) /  (double)(freqHz)) * 1000.0f;
-                status = hipSuccess;
-            } else {
-                * ms = 0.0f;
-                status = hipErrorInvalidValue;
-            }
+        status = hipErrorInvalidResourceHandle;

+    } else {
+        // Refresh status, if still recording...
+        start->refereshEventStatus();
+        stop->refereshEventStatus();

-        } else if ((start_eh->_state == hipEventStatusRecording) ||
-                   (stop_eh->_state  == hipEventStatusRecording)) {
-            status = hipErrorNotReady;
-        } else if ((start_eh->_state == hipEventStatusUnitialized) ||
-                   (stop_eh->_state  == hipEventStatusUnitialized)) {
-            status = hipErrorInvalidResourceHandle;
+        if ((start->_state == hipEventStatusComplete) && (stop->_state == hipEventStatusComplete)) {
+        // Common case, we have good information for both events.
+
+        int64_t tickDiff = (stop->timestamp() - start->timestamp());
+
+        uint64_t freqHz;
+        hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz);
+        if (freqHz) {
+            *ms = ((double)(tickDiff) /  (double)(freqHz)) * 1000.0f;
+            status = hipSuccess;
+        } else {
+            * ms = 0.0f;
+            status = hipErrorInvalidValue;
        }
-    }
+
+
+        } else if ((start->_state == hipEventStatusRecording) ||
+                   (stop->_state  == hipEventStatusRecording)) {
+
+            status = hipErrorNotReady;
+        } else {
+                assert(0);
+        }
+    } 

    return ihipLogStatus(status);
 }
@@ -90,11 +90,11 @@ __device__  bool __hgt(__half a, __half b) {
 }

 __device__  bool __hisinf(__half a) {
-  return a == __hInfValue.h ? true : false;
+  return a == HINF ? true : false;
 }

 __device__  bool __hisnan(__half a) {
-  return a > __hInfValue.h ? true : false;
+  return a > HINF ? true : false;
 }

 __device__  bool __hle(__half a, __half b) {
@@ -114,75 +114,75 @@ Half2 Comparision Functions
 */

 __device__  bool __hbeq2(__half2 a, __half2 b) {
-  return (a.p[0] == b.p[0] ? true : false) && (a.p[1] == b.p[1] ? true : false);
+  return (a.x == b.x ? true : false) && (a.y == b.y ? true : false);
 }

 __device__  bool __hbge2(__half2 a, __half2 b) {
-  return (a.p[0] >= b.p[0] ? true : false) && (a.p[1] >= b.p[1] ? true : false);
+  return (a.x >= b.x ? true : false) && (a.y >= b.y ? true : false);
 }

 __device__  bool __hbgt2(__half2 a, __half2 b) {
-  return (a.p[0] > b.p[0] ? true : false) && (a.p[1] > b.p[1] ? true : false);
+  return (a.x > b.x ? true : false) && (a.y > b.y ? true : false);
 }

 __device__  bool __hble2(__half2 a, __half2 b) {
-  return (a.p[0] <= b.p[0] ? true : false) && (a.p[1] <= b.p[1] ? true : false);
+  return (a.x <= b.x ? true : false) && (a.y <= b.y ? true : false);
 }

 __device__  bool __hblt2(__half2 a, __half2 b) {
-  return (a.p[0] < b.p[0] ? true : false) && (a.p[1] < b.p[1] ? true : false);
+  return (a.x < b.x ? true : false) && (a.y < b.y ? true : false);
 }

 __device__  bool __hbne2(__half2 a, __half2 b) {
-  return (a.p[0] != b.p[0] ? true : false) && (a.p[1] != b.p[1] ? true : false);
+  return (a.x != b.x ? true : false) && (a.y != b.y ? true : false);
 }

 __device__  __half2 __heq2(__half2 a, __half2 b) {
  __half2 c;
-  c.p[0] = (a.p[0] == b.p[0]) ? (__half)1 : (__half)0;
-  c.p[1] = (a.p[1] == b.p[1]) ? (__half)1 : (__half)0;
+  c.x = (a.x == b.x) ? (__half)1 : (__half)0;
+  c.y = (a.y == b.y) ? (__half)1 : (__half)0;
  return c;
 }

 __device__  __half2 __hge2(__half2 a, __half2 b) {
  __half2 c;
-  c.p[0] = (a.p[0] >= b.p[0]) ? (__half)1 : (__half)0;
-  c.p[1] = (a.p[1] >= b.p[1]) ? (__half)1 : (__half)0;
+  c.x = (a.x >= b.x) ? (__half)1 : (__half)0;
+  c.y = (a.y >= b.y) ? (__half)1 : (__half)0;
  return c;
 }

 __device__  __half2 __hgt2(__half2 a, __half2 b) {
  __half2 c;
-  c.p[0] = (a.p[0] > b.p[0]) ? (__half)1 : (__half)0;
-  c.p[1] = (a.p[1] > b.p[1]) ? (__half)1 : (__half)0;
+  c.x = (a.x > b.x) ? (__half)1 : (__half)0;
+  c.y = (a.y > b.y) ? (__half)1 : (__half)0;
  return c;
 }

 __device__  __half2 __hisnan2(__half2 a) {
  __half2 c;
-  c.p[0] = (a.p[0] > __hInfValue.h) ? (__half)1 : (__half)0;
-  c.p[1] = (a.p[1] > __hInfValue.h) ? (__half)1 : (__half)0;
+  c.x = (a.x > HINF) ? (__half)1 : (__half)0;
+  c.y = (a.y > HINF) ? (__half)1 : (__half)0;
  return c;
 }

 __device__  __half2 __hle2(__half2 a, __half2 b) {
  __half2 c;
-  c.p[0] = (a.p[0] <= b.p[0]) ? (__half)1 : (__half)0;
-  c.p[1] = (a.p[1] <= b.p[1]) ? (__half)1 : (__half)0;
+  c.x = (a.x <= b.x) ? (__half)1 : (__half)0;
+  c.y = (a.y <= b.y) ? (__half)1 : (__half)0;
  return c;
 }

 __device__  __half2 __hlt2(__half2 a, __half2 b) {
  __half2 c;
-  c.p[0] = (a.p[0] < b.p[0]) ? (__half)1 : (__half)0;
-  c.p[1] = (a.p[1] < b.p[1]) ? (__half)1 : (__half)0;
+  c.x = (a.x < b.x) ? (__half)1 : (__half)0;
+  c.y = (a.y < b.y) ? (__half)1 : (__half)0;
  return c;
 }

 __device__  __half2 __hne2(__half2 a, __half2 b) {
  __half2 c;
-  c.p[0] = (a.p[0] != b.p[0]) ? (__half)1 : (__half)0;
-  c.p[1] = (a.p[1] != b.p[1]) ? (__half)1 : (__half)0;
+  c.x = (a.x != b.x) ? (__half)1 : (__half)0;
+  c.y = (a.y != b.y) ? (__half)1 : (__half)0;
  return c;
 }

@@ -191,8 +191,8 @@ Conversion instructions
 */
 __device__  __half2 __float22half2_rn(const float2 a) {
  __half2 b;
-  b.p[0] = (__half)a.x;
-  b.p[1] = (__half)a.y;
+  b.x = (__half)a.x;
+  b.y = (__half)a.y;
  return b;
 }

@@ -202,8 +202,8 @@ __device__  __half __float2half(const float a) {

 __device__  __half2 __float2half2_rn(const float a) {
  __half2 b;
-  b.p[0] = (__half)a;
-  b.p[1] = (__half)a;
+  b.x = (__half)a;
+  b.y = (__half)a;
  return b;
 }

@@ -225,15 +225,15 @@ __device__  __half __float2half_rz(const float a) {

 __device__  __half2 __floats2half2_rn(const float a, const float b) {
  __half2 c;
-  c.p[0] = (__half)a;
-  c.p[1] = (__half)b;
+  c.x = (__half)a;
+  c.y = (__half)b;
  return c;
 }

 __device__  float2 __half22float2(const __half2 a) {
  float2 b;
-  b.x = (float)a.p[0];
-  b.y = (float)a.p[1];
+  b.x = (float)a.x;
+  b.y = (float)a.y;
  return b;
 }

@@ -243,8 +243,8 @@ __device__  float __half2float(const __half a) {

 __device__  __half2 half2half2(const __half a) {
  __half2 b;
-  b.p[0] = a;
-  b.p[1] = a;
+  b.x = a;
+  b.y = a;
  return b;
 }

@@ -358,30 +358,30 @@ __device__  unsigned short int __half_as_ushort(const __half h) {

 __device__  __half2 __halves2half2(const __half a, const __half b) {
  __half2 c;
-  c.p[0] = a;
-  c.p[1] = b;
+  c.x = a;
+  c.y = b;
  return c;
 }

 __device__  float __high2float(const __half2 a) {
-  return (float)a.p[1];
+  return (float)a.y;
 }

 __device__  __half __high2half(const __half2 a) {
-  return a.p[1];
+  return a.y;
 }

 __device__  __half2 __high2half2(const __half2 a) {
  __half2 b;
-  b.p[0] = a.p[1];
-  b.p[1] = a.p[1];
+  b.x = a.y;
+  b.y = a.y;
  return b;
 }

 __device__  __half2 __highs2half2(const __half2 a, const __half2 b) {
  __half2 c;
-  c.p[0] = a.p[1];
-  c.p[1] = b.p[1];
+  c.x = a.y;
+  c.y = b.y;
  return c;
 }

@@ -418,38 +418,38 @@ __device__  __half __ll2half_rz(long long int i){
 }

 __device__  float __low2float(const __half2 a) {
-  return (float)a.p[0];
+  return (float)a.x;
 }

 __device__  __half __low2half(const __half2 a) {
-  return a.p[0];
+  return a.x;
 }

 __device__  __half2 __low2half2(const __half2 a, const __half2 b) {
  __half2 c;
-  c.p[0] = a.p[0];
-  c.p[1] = b.p[0];
+  c.x = a.x;
+  c.y = b.x;
  return c;
 }

 __device__  __half2 __low2half2(const __half2 a) {
  __half2 b;
-  b.p[0] = a.p[0];
-  b.p[1] = a.p[0];
+  b.x = a.x;
+  b.y = a.x;
  return b;
 }

 __device__  __half2 __lowhigh2highlow(const __half2 a) {
  __half2 b;
-  b.p[0] = a.p[1];
-  b.p[1] = a.p[0];
+  b.x = a.y;
+  b.y = a.x;
  return b;
 }

 __device__  __half2 __lows2half2(const __half2 a, const __half2 b) {
  __half2 c;
-  c.p[0] = a.p[0];
-  c.p[1] = b.p[0];
+  c.y = a.x;
+  c.y = b.x;
  return c;
 }

@@ -542,346 +542,4 @@ typedef struct{
  };
 } struct_float;

-#if __clang_major__ == 3

-static __device__ float cvt_half_to_float(__half a){
-  struct_float ret = {0};
-  if(a.x == 0){
-    return 0.0f;
-  }
-  if(a.x == 0x8000){
-    return -0.0f;
-  }
-  ret.u = ((a.x&0x8000)<<16) | (((a.x&0x7c00)+0x1C000)<<13) | ((a.x&0x03FF)<<13);
-  return ret.f;
-}
-
-static __device__ __half cvt_float_to_half(float b){
-  struct_float f = {0};
-  __half ret = {0};
-  f.f = b;
-  if(f.f == 0.0f){
-    ret.x = 0;
-    return ret;
-  }
-  if(f.f == -0.0f){
-    ret.x = 0x8000;
-    return ret;
-  }
-  ret.x = ((f.u>>16)&0x8000)|((((f.u&0x7f800000)-0x38000000)>>13)&0x7c00)|((f.u>>13)&0x03ff);
-  return ret;
-}
-
-
-__device__ __half __soft_hadd(const __half a, const __half b){
-  return cvt_float_to_half(cvt_half_to_float(a)+cvt_half_to_float(b));
-}
-
-__device__ __half __soft_hadd_sat(const __half a, const __half b){
-  float f = cvt_half_to_float(a) + cvt_half_to_float(b);
-  return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f)));
-}
-
-__device__ __half __soft_hfma(const __half a, const __half b, const __half c){
-  return cvt_float_to_half(fmaf(cvt_half_to_float(a), cvt_half_to_float(b), cvt_half_to_float(c)));
-}
-
-__device__ __half __soft_hfma_sat(const __half a, const __half b, const __half c){
-  float f = fmaf(cvt_half_to_float(a), cvt_half_to_float(b), cvt_half_to_float(c));
-  return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f)));
-}
-
-__device__ __half __soft_hmul(const __half a, const __half b){
-  return cvt_float_to_half(cvt_half_to_float(a)*cvt_half_to_float(b));
-}
-
-__device__ __half __soft_hmul_sat(const __half a, const __half b){
-  float f = cvt_half_to_float(a) * cvt_half_to_float(b);
-  return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f)));
-}
-
-__device__ __half __soft_hneq(const __half a){
-  __half ret = {a.x};
-  ret.x ^= 1 << 15;
-  return ret;
-}
-
-__device__ __half __soft_hsub(const __half a, const __half b){
-  return cvt_float_to_half(cvt_half_to_float(a)-cvt_half_to_float(b));
-}
-
-__device__ __half __soft_hsub_sat(const __half a, const __half b){
-  float f = cvt_half_to_float(a) - cvt_half_to_float(b);
-  return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f)));
-}
-
-
-/*
-Half2 Arithmetic Instructions
-*/
-
-__device__ __half2 __soft_hadd2(const __half2 a, const __half2 b){
-  __half2 ret;
-  ret.p[1] = __soft_hadd(a.p[1], b.p[1]);
-  ret.p[0] = __soft_hadd(a.p[0], b.p[0]);
-  return ret;
-}
-
-__device__ __half2 __soft_hadd2_sat(const __half2 a, const __half2 b){
-  __half2 ret;
-  ret.p[1] = __soft_hadd_sat(a.p[1], b.p[1]);
-  ret.p[0] = __soft_hadd_sat(a.p[0], b.p[0]);
-  return ret;
-}
-
-__device__ __half2 __soft_hfma2(const __half2 a, const __half2 b, const __half2 c){
-  __half2 ret;
-  ret.p[1] = __soft_hfma(a.p[1], b.p[1], c.p[1]);
-  ret.p[0] = __soft_hfma(a.p[0], b.p[0], c.p[0]);
-  return ret;
-}
-
-__device__ __half2 __soft_hfma2_sat(const __half2 a, const __half2 b, const __half2 c){
-  __half2 ret;
-  ret.p[1] = __soft_hfma_sat(a.p[1], b.p[1], c.p[1]);
-  ret.p[0] = __soft_hfma_sat(a.p[0], b.p[0], c.p[0]);
-  return ret;
-}
-
-__device__ __half2 __soft_hmul2(const __half2 a, const __half2 b){
-  __half2 ret;
-  ret.p[1] = __soft_hmul(a.p[1], b.p[1]);
-  ret.p[0] = __soft_hmul(a.p[0], b.p[0]);
-  return ret;
-}
-
-__device__ __half2 __soft_hmul2_sat(const __half2 a, const __half2 b){
-  __half2 ret;
-  ret.p[1] = __soft_hmul_sat(a.p[1], b.p[1]);
-  ret.p[0] = __soft_hmul_sat(a.p[0], b.p[0]);
-  return ret;
-}
-
-__device__ __half2 __soft_hneq2(const __half2 a){
-  __half2 ret;
-  ret.p[1] = __soft_hneq(a.p[1]);
-  ret.p[0] = __soft_hneq(a.p[0]);
-  return ret;
-}
-
-__device__ __half2 __soft_hsub2(const __half2 a, const __half2 b){
-  __half2 ret;
-  ret.p[1] = __soft_hsub(a.p[1], b.p[1]);
-  ret.p[0] = __soft_hsub(a.p[0], b.p[0]);
-  return ret;
-}
-
-__device__ __half2 __soft_hsub2_sat(const __half2 a, const __half2 b){
-  __half2 ret;
-  ret.p[1] = __soft_hsub_sat(a.p[1], b.p[1]);
-  ret.p[0] = __soft_hsub_sat(a.p[0], b.p[0]);
-  return ret;
-}
-
-/*
-Half Cmps
-*/
-
-__device__  bool __soft_heq(const __half a, const __half b){
-  return (a.x == b.x ? true:false);
-}
-
-__device__ bool __soft_hge(const __half a, const __half b){
-  return (cvt_half_to_float(a) >= cvt_half_to_float(b));
-}
-
-__device__ bool __soft_hgt(const __half a, const __half b){
-  return (cvt_half_to_float(a) > cvt_half_to_float(b));
-}
-
-__device__ bool __soft_hisinf(const __half a){
-  return ((a.x == __half_neg_inf) ? -1 : (a.x == __half_pos_inf) ? 1 : 0);
-}
-
-__device__ bool __soft_hisnan(const __half a){
-  if(((a.x & __half_pos_inf) == a.x) || ((a.x & __half_neg_inf) == a.x)){
-    return true;
-  }else{
-    return false;
-  }
-}
-
-__device__ bool __soft_hle(const __half a, const __half b){
-  return (cvt_half_to_float(a) <= cvt_half_to_float(b));
-}
-
-__device__ bool __soft_hlt(const __half a, const __half b){
-  return (cvt_half_to_float(a) < cvt_half_to_float(b));
-}
-
-__device__ bool __soft_hne(const __half a, const __half b){
-  return a.x == b.x ? false : true;
-}
-
-/*
-Half2 Cmps
-*/
-
-__device__ bool __soft_hbeq2(const __half2 a, const __half2 b){
-  return __soft_heq(a.p[1], b.p[1]) && __soft_heq(a.p[0], b.p[0]);
-}
-
-__device__ bool __soft_hbge2(const __half2 a, const __half2 b){
-  return __soft_hge(a.p[1], b.p[1]) && __soft_hge(a.p[0], b.p[0]);
-}
-
-__device__ bool __soft_hbgt2(const __half2 a, const __half2 b){
-  return __soft_hgt(a.p[1], b.p[1]) && __soft_hgt(a.p[0], b.p[0]);
-}
-
-__device__ bool __soft_hble2(const __half2 a, const __half2 b){
-  return __soft_hle(a.p[1], b.p[1]) && __soft_hle(a.p[0], b.p[0]);
-}
-
-__device__ bool __soft_hblt2(const __half2 a, const __half2 b){
-  return __soft_hlt(a.p[1], b.p[1]) && __soft_hlt(a.p[0], b.p[0]);
-}
-
-__device__ bool __soft_hbne2(const __half2 a, const __half2 b){
-  return __soft_hne(a.p[1], b.p[1]) && __soft_hne(a.p[0], b.p[0]);
-}
-
-
-
-__device__ __half2 __soft_heq2(const __half2 a, const __half2 b){
-  __half2 ret = {0};
-  ret.p[1] = (__soft_heq(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
-  ret.p[0] = (__soft_heq(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
-  return ret;
-}
-
-__device__ __half2 __soft_hge2(const __half2 a, const __half2 b){
-  __half2 ret = {0};
-  ret.p[1] = (__soft_hge(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
-  ret.p[0] = (__soft_hge(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
-  return ret;
-}
-
-__device__ __half2 __soft_hgt2(const __half2 a, const __half2 b){
-  __half2 ret = {0};
-  ret.p[1] = (__soft_hgt(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
-  ret.p[0] = (__soft_hgt(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
-  return ret;
-}
-
-__device__ __half2 __soft_hisnan2(const __half2 a){
-  __half2 ret = {0};
-  ret.p[1] = __soft_hisnan(a.p[1]) ? __half_value_one_float : __half_value_zero_float;
-  ret.p[0] = __soft_hisnan(a.p[0]) ? __half_value_one_float : __half_value_zero_float;
-  return ret;
-}
-
-__device__ __half2 __soft_hle2(const __half2 a, const __half2 b){
-  __half2 ret = {0};
-  ret.p[1] = (__soft_hle(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
-  ret.p[0] = (__soft_hle(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
-  return ret;
-}
-
-__device__ __half2 __soft_hlt2(const __half2 a, const __half2 b){
-  __half2 ret = {0};
-  ret.p[1] = (__soft_hlt(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
-  ret.p[0] = (__soft_hlt(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
-  return ret;
-}
-
-__device__ __half2 __soft_hne2(const __half2 a, const __half2 b){
-  __half2 ret = {0};
-  ret.p[1] = (__soft_hne(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
-  ret.p[0] = (__soft_hne(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
-  return ret;
-}
-
-/*
-Half Cnvs and Data Mvmnt
-*/
-
-__device__ __half2 __soft_float22half2_rn(const float2 a){
-  __half2 ret = {0};
-  ret.p[1] = cvt_float_to_half(a.x);
-  ret.p[0] = cvt_float_to_half(a.y);
-  return ret;
-}
-
-__device__ __half __soft_float2half(const float a){
-  return cvt_float_to_half(a);
-}
-
-__device__ __half2 __soft_float2half2_rn(const float a){
-  __half ret = cvt_float_to_half(a);
-  return {ret, ret};
-}
-
-__device__ __half2 __soft_floats2half2_rn(const float a, const float b){
-  return {cvt_float_to_half(a), cvt_float_to_half(b)};
-}
-
-__device__ float2 __soft_half22float2(const __half2 a){
-  return {cvt_half_to_float(a.p[1]), cvt_half_to_float(a.p[0])};
-}
-
-__device__ float __soft_half2float(const __half a){
-  return cvt_half_to_float(a);
-}
-
-__device__ __half2 __soft_half2half2(const __half a){
-  return {a,a};
-}
-
-__device__ __half2 __soft_halves2half2(const __half a, const __half b){
-  return {a,b};
-}
-
-__device__ float __soft_high2float(const __half2 a){
-  return cvt_half_to_float(a.p[1]);
-}
-
-__device__ __half __soft_high2half(const __half2 a){
-  return a.p[1];
-}
-
-__device__ __half2 __soft_high2half2(const __half2 a){
-  return {a.p[1], a.p[1]};
-}
-
-__device__ __half2 __soft_highs2half2(const __half2 a, const __half2 b){
-  return {a.p[1], b.p[1]};
-}
-
-__device__ float __soft_low2float(const __half2 a){
-  return cvt_half_to_float(a.p[0]);
-}
-
-__device__ __half __soft_low2half(const __half2 a){
-  return a.p[0];
-}
-
-__device__ __half2 __soft_low2half2(const __half2 a){
-  return {a.p[0], a.p[0]};
-}
-
-__device__ __half2 __soft_lows2half2(const __half2 a, const __half2 b){
-  return {a.p[0], b.p[0]};
-}
-
-__device__ __half2 __soft_lowhigh2highlow(const __half2 a){
-  return {a.p[0], a.p[1]};
-}
-
-__device__ __half2 __soft_low2half2(const __half2 a, const __half2 b){
-  return {a.p[0], b.p[0]};
-}
-
-
-
-#endif
@@ -2,89 +2,122 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:
 target triple = "amdgcn--amdhsa"


-define i32 @__hip_hc_ir_hadd2_int(i32 %a, i32 %b) #1 {
-  %1 = tail call i32 asm sideeffect "v_add_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
-  tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_hadd2_int(<2 x half> %a, <2 x half> %b) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = bitcast <2 x half> %b to i32
+  %3 = tail call i32 asm sideeffect "v_add_f16 $0, $1, $2","=v,v,v"(i32 %1, i32 %2)
+  tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %3, i32 %1, i32 %2)
+  %4 = bitcast i32 %3 to <2 x half>
+  ret <2 x half> %4
 }

-define i32 @__hip_hc_ir_hfma2_int(i32 %a, i32 %b, i32 %c) #1 {
-  %1 = tail call i32 asm sideeffect "v_mad_f16 $0, $1, $2, $3","=v,v,v,v"(i32 %a, i32 %b, i32 %c)
-  tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b)
-  tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %1, i32 %c)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_hfma2_int(<2 x half> %a, <2 x half> %b, <2 x half> %c) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = bitcast <2 x half> %b to i32
+  %3 = bitcast <2 x half> %c to i32
+  %4 = tail call i32 asm sideeffect "v_mad_f16 $0, $1, $2, $3","=v,v,v,v"(i32 %1, i32 %2, i32 %3)
+  tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %4, i32 %1, i32 %2)
+  tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %4, i32 %4, i32 %3)
+  %5 = bitcast i32 %4 to <2 x half>
+  ret <2 x half> %5
 }

-define i32 @__hip_hc_ir_hmul2_int(i32 %a, i32 %b) #1 {
-  %1 = tail call i32 asm sideeffect "v_mul_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
-  tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_hmul2_int(<2 x half> %a, <2 x half> %b) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = bitcast <2 x half> %b to i32
+  %3 = tail call i32 asm sideeffect "v_mul_f16 $0, $1, $2","=v,v,v"(i32 %1, i32 %2)
+  tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %3, i32 %1, i32 %2)
+  %4 = bitcast i32 %3 to <2 x half>
+  ret <2 x half> %4
 }

-define i32 @__hip_hc_ir_hsub2_int(i32 %a, i32 %b) #1 {
-  %1 = tail call i32 asm sideeffect "v_sub_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
-  tail call void asm sideeffect "v_sub_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_hsub2_int(<2 x half> %a, <2 x half> %b) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = bitcast <2 x half> %b to i32
+  %3 = tail call i32 asm sideeffect "v_sub_f16 $0, $1, $2","=v,v,v"(i32 %1, i32 %2)
+  tail call void asm sideeffect "v_sub_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %3, i32 %1, i32 %2)
+  %4 = bitcast i32 %3 to <2 x half>
+  ret <2 x half> %4
 }

-define i32 @__hip_hc_ir_h2ceil_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_ceil_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_ceil_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2ceil_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_ceil_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_ceil_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }

-define i32 @__hip_hc_ir_h2cos_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_cos_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_cos_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2cos_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_cos_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_cos_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }

-define i32 @__hip_hc_ir_h2exp2_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_exp_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_exp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2exp2_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_exp_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_exp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }

-define i32 @__hip_hc_ir_h2floor_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_floor_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_floor_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2floor_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_floor_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_floor_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }

-define i32 @__hip_hc_ir_h2log2_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_log_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_log_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2log2_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_log_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_log_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }

-define i32 @__hip_hc_ir_h2rcp_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_rcp_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_rcp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2rcp_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_rcp_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_rcp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }

-define i32 @__hip_hc_ir_h2rsqrt_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_rsq_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_rsq_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2rsqrt_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_rsq_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_rsq_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }

-define i32 @__hip_hc_ir_h2sin_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_sin_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_sin_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2sin_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_sin_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_sin_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }

-define i32 @__hip_hc_ir_h2sqrt_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_sqrt_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_sqrt_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2sqrt_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_sqrt_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_sqrt_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }

-define i32 @__hip_hc_ir_h2trunc_int(i32 %a) #1 {
-  %1 = tail call i32 asm sideeffect "v_trunc_f16 $0, $1","=v,v"(i32 %a)
-  tail call void asm sideeffect "v_trunc_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a)
-  ret i32 %1
+define <2 x half> @__hip_hc_ir_h2trunc_int(<2 x half> %a) #1 {
+  %1 = bitcast <2 x half> %a to i32
+  %2 = tail call i32 asm sideeffect "v_trunc_f16 $0, $1","=v,v"(i32 %1)
+  tail call void asm sideeffect "v_trunc_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1)
+  %3 = bitcast i32 %2 to <2 x half>
+  ret <2 x half> %3
 }

 attributes #1 = { alwaysinline nounwind }
@@ -48,13 +48,8 @@ THE SOFTWARE.
 #include "env.h"


-#ifndef USE_COPY_EXT_V2
-#define USE_COPY_EXT_V2 1
-#endif
-
-#ifndef USE_ROCR_1_4
-#define USE_ROCR_1_4 1
-#endif
+// needs HCC change for hc::no_scope
+#define USE_NO_SCOPE 1

 //=================================================================================================
 //Global variables:
@@ -69,7 +64,6 @@ std::string HIP_LAUNCH_BLOCKING_KERNELS;
 std::vector<std::string> g_hipLaunchBlockingKernels;
 int HIP_API_BLOCKING = 0;

-int HIP_MAX_QUEUES = 0;

 int HIP_PRINT_ENV = 0;
 int HIP_TRACE_API= 0;
@@ -80,8 +74,7 @@ int HIP_PROFILE_API= 0;
 std::string HIP_DB_START_API;
 std::string HIP_DB_STOP_API;
 int HIP_DB= 0;
-int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU identifiers */
-int HIP_NUM_KERNELS_INFLIGHT = 128;
+int HIP_VISIBLE_DEVICES = 0; 
 int HIP_WAIT_MODE = 0;

 int HIP_FORCE_P2P_HOST = 0;
@@ -91,12 +84,20 @@ int HIP_DENY_PEER_ACCESS = 0;
 // Force async copies to actually use the synchronous copy interface.
 int HIP_FORCE_SYNC_COPY = 0;

+// TODO - set these to 0 and 1
+int HIP_EVENT_SYS_RELEASE=1;
 int HIP_COHERENT_HOST_ALLOC = 0;

 // TODO - set to 0 once we resolve stability.
 // USE_ HIP_SYNC_HOST_ALLOC
 int HIP_SYNC_HOST_ALLOC = 1;

+// Chicken bit to sync on host to implement null stream.
+// If 0, null stream synchronization is performed on the GPU
+int HIP_SYNC_NULL_STREAM = 0;
+
+// HIP needs to change some behavior based on HCC_OPT_FLUSH :
+// TODO - set this to 1
 int HCC_OPT_FLUSH = 0;


@@ -104,9 +105,6 @@ int HCC_OPT_FLUSH = 0;



-#define HIP_USE_PRODUCT_NAME 1
-//#define DISABLE_COPY_EXT 1
-

 std::once_flag hip_initialized;

@@ -118,6 +116,7 @@ bool g_visible_device = false;
 unsigned g_deviceCnt;
 std::vector<int> g_hip_visible_devices;
 hsa_agent_t g_cpu_agent;
+hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents.
 unsigned g_numLogicalThreads;

 std::atomic<int> g_lastShortTid(1);
@@ -272,64 +271,40 @@ ihipStream_t::~ihipStream_t()
 }


-inline void ihipStream_t::ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCrit)
+hc::hcWaitMode ihipStream_t::waitMode() const
 {
-    if (HIP_MAX_QUEUES && !streamCrit->_hasQueue)  {
+    hc::hcWaitMode waitMode = hc::hcWaitModeActive;

-        // To avoid deadlock, we have to release the stream lock before acquiring context lock.
-        // Else we can get hung if another thread has the context lock is trying to get lock for this stream.
-        // We lock it again below.
-        streamCrit->munlock();
-
-        // Obtain mutex access to the device critical data, release by destructor
-        LockedAccessor_CtxCrit_t  ctxCrit(this->_ctx->criticalData());
-        // TODO
-        auto needyCritPtr = this->_criticalData.mlock();
-
-        // Second test to ensure we still need to steal the queue - another thread may have
-        // snuck in here and already solved the issue.
-        if (!needyCritPtr->_hasQueue) {
-            needyCritPtr->_av = this->_ctx->stealActiveQueue(ctxCrit, this);
+    if (_scheduleMode == Auto) {
+        if (g_deviceCnt > g_numLogicalThreads) {
+            waitMode = hc::hcWaitModeActive;
+        } else {
+            waitMode = hc::hcWaitModeBlocked;
        }
-
-        streamCrit->_hasQueue = true;
+    } else if (_scheduleMode == Spin) {
+        waitMode = hc::hcWaitModeActive;
+    } else if (_scheduleMode == Yield) {
+        waitMode = hc::hcWaitModeBlocked;
+    } else {
+        assert(0); // bad wait mode.
    }
-    assert(streamCrit->_hasQueue);
-}

+    if (HIP_WAIT_MODE == 1) {
+        waitMode = hc::hcWaitModeBlocked;
+    } else if (HIP_WAIT_MODE == 2) {
+        waitMode = hc::hcWaitModeActive;
+    }
+
+    return waitMode;
+}

 //Wait for all kernel and data copy commands in this stream to complete.
 //This signature should be used in routines that already have locked the stream mutex
 void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit)
 {
-    if (crit->_hasQueue) {
-        tprintf (DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str());
-        hc::hcWaitMode waitMode = hc::hcWaitModeActive;
+    tprintf (DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str());

-        if (_scheduleMode == Auto) {
-            if (g_deviceCnt > g_numLogicalThreads) {
-                waitMode = hc::hcWaitModeActive;
-            } else {
-                waitMode = hc::hcWaitModeBlocked;
-            }
-        } else if (_scheduleMode == Spin) {
-            waitMode = hc::hcWaitModeActive;
-        } else if (_scheduleMode == Yield) {
-            waitMode = hc::hcWaitModeBlocked;
-        } else {
-            assert(0); // bad wait mode.
-        }
-
-        if (HIP_WAIT_MODE == 1) {
-            waitMode = hc::hcWaitModeBlocked;
-        } else if (HIP_WAIT_MODE == 2) {
-            waitMode = hc::hcWaitModeActive;
-        }
-
-        crit->_av.wait(waitMode);
-    } else {
-        tprintf (DB_SYNC, "%s wait for queue empty (done since stream has no physical queue).\n", ToString(this).c_str());
-    }
+    crit->_av.wait(waitMode());

    crit->_kernelCnt = 0;
 }
@@ -345,14 +320,13 @@ void ihipStream_t::locked_wait()
 };

 // Causes current stream to wait for specified event to complete:
-// Note this does not require any kind of host serialization.
+// Note this does not provide any kind of host serialization.
 void ihipStream_t::locked_waitEvent(hipEvent_t event)
 {
    LockedAccessor_StreamCrit_t crit(_criticalData);

-    this->ensureHaveQueue(crit);

-    crit->_av.create_blocking_marker(event->_marker);
+    crit->_av.create_blocking_marker(event->_marker, hc::accelerator_scope);
 }

 // Create a marker in this stream.
@@ -362,9 +336,19 @@ void ihipStream_t::locked_recordEvent(hipEvent_t event)
    // Lock the stream to prevent simultaneous access
    LockedAccessor_StreamCrit_t crit(_criticalData);

-    this->ensureHaveQueue(crit);
-    event->_marker = crit->_av.create_marker();
-}
+    auto scopeFlag = hc::accelerator_scope;
+    // The env var HIP_EVENT_SYS_RELEASE sets the default,
+    // The explicit flags override the env var (if specified)
+    if (event->_flags & hipEventReleaseToSystem) {
+        scopeFlag = hc::system_scope;
+    } else if (event->_flags & hipEventReleaseToDevice) {
+        scopeFlag = hc::accelerator_scope;
+    } else {
+        scopeFlag = HIP_EVENT_SYS_RELEASE ? hc::system_scope : hc::accelerator_scope;
+    }
+
+    event->_marker = crit->_av.create_marker(scopeFlag);
+};

 //=============================================================================

@@ -396,14 +380,6 @@ LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand()

    LockedAccessor_StreamCrit_t crit(_criticalData, false/*no unlock at destruction*/);

-    if(crit->_kernelCnt > HIP_NUM_KERNELS_INFLIGHT){
-       this->wait(crit);
-       crit->_kernelCnt = 0;
-    }
-
-    this->ensureHaveQueue(crit);
-
-

    return crit;
 }
@@ -775,6 +751,9 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop)
    if(strcmp(archName,"gfx803")==0){
      prop->gcnArch = 803;
    }
+    if(strcmp(archName,"gfx900")==0){
+      prop->gcnArch = 900;
+    }

    DeviceErrorCheck(err);

@@ -848,11 +827,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop)

    // Get Max Threads Per Multiprocessor
    uint32_t max_waves_per_cu;
-#if USE_ROCR_1_4
    err = hsa_agent_get_info(_hsaAgent,(hsa_agent_info_t) HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, &max_waves_per_cu);
-#else
-    max_waves_per_cu = 10;
-#endif
    DeviceErrorCheck(err);
    prop-> maxThreadsPerMultiProcessor = prop->warpSize*max_waves_per_cu;

@@ -997,55 +972,6 @@ std::string ihipCtx_t::toString() const
 };


-hc::accelerator_view
-ihipCtx_t::stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream)
-{
-
-    // TODO - review handling if queue can't be found.
-    while (1) {
-
-        for (auto iter=ctxCrit->streams().begin(); iter != ctxCrit->streams().end(); iter++) {
-            if (*iter != needyStream) {
-                auto victimCritPtr = (*iter)->_criticalData.mtry_lock();
-                if (victimCritPtr)   {
-                    // try-lock succeeded:
-                    if (victimCritPtr->_hasQueue && (victimCritPtr->_kernelCnt == 0)) {
-
-                        victimCritPtr->_hasQueue = false;
-
-                        tprintf(DB_SYNC, " stealActiveQueue from victim:%s to needy:%s\n",
-                                ToString(*iter).c_str(), ToString(needyStream).c_str());
-
-                        hc::accelerator_view av = victimCritPtr->_av;
-
-                        // TODO - cleanup to remove forced setting to N
-                        uint64_t *p = (uint64_t*)(&victimCritPtr->_av);
-                        *p = 0; // damage the victim av so attempt to use it will fault.
-
-                        (*iter)->_criticalData.munlock();
-                        return av;
-                    }
-                    (*iter)->_criticalData.munlock();
-                }
-            }
-        }
-    }
-}
-
-
-hc::accelerator_view
-ihipCtx_t::createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit)
-{
-    if (HIP_MAX_QUEUES && (ctxCrit->streams().size() >= HIP_MAX_QUEUES)) {
-        // Steal a queue from an existing stream:
-        hc::accelerator_view av = this->stealActiveQueue (ctxCrit, nullptr);
-        return av;
-    } else {
-        // Create a new view
-        return getWriteableDevice()->_acc.create_view();
-    }
-}
-
 //----


@@ -1062,29 +988,68 @@ ihipCtx_t::createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit)



-// Implement "default" stream syncronization
-//   This waits for all other streams to drain before continuing.
+//   This called for submissions that are sent to the null/default stream.  This routine ensures
+//   that this new command waits for activity in the other streams to complete before proceeding.
+//
+//   HIP_SYNC_NULL_STREAM=0 does all dependency resolutiokn on the GPU
+//   HIP_SYNC_NULL_STREAM=1 s legacy non-optimal mode which conservatively waits on host.
+//
 //   If waitOnSelf is set, this additionally waits for the default stream to empty.
-void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf)
+//   In new HIP_SYNC_NULL_STREAM=0 mode, this enqueues a marker which causes the default stream to wait for other
+//   activity, but doesn't actually block the host.  If host blocking is desired, the caller should set syncHost.
+//
+//   syncToHost causes host to wait for the stream to finish.
+//   Note HIP_SYNC_NULL_STREAM=1 path always sync to Host. 
+void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf, bool syncHost)
 {
    LockedAccessor_CtxCrit_t  crit(_criticalData);

-    tprintf(DB_SYNC, "syncDefaultStream\n");
+    tprintf(DB_SYNC, "syncDefaultStream \n");
+
+    // Vector of ops sent to each stream that will complete before ops sent to null stream:
+    std::vector<hc::completion_future> depOps;

    for (auto streamI=crit->const_streams().begin(); streamI!=crit->const_streams().end(); streamI++) {
        ihipStream_t *stream = *streamI;

        // Don't wait for streams that have "opted-out" of syncing with NULL stream.
-        // And - don't wait for the NULL stream
-        if (!(stream->_flags & hipStreamNonBlocking)) {
+        // And - don't wait for the NULL stream, unless waitOnSelf specified.
+        bool waitThisStream = (!(stream->_flags & hipStreamNonBlocking)) && 
+                              (waitOnSelf || (stream != _defaultStream));

-            if (waitOnSelf || (stream != _defaultStream)) {
-                // TODO-hcc - use blocking or active wait here?
-                // TODO-sync - cudaDeviceBlockingSync
+        if (HIP_SYNC_NULL_STREAM) {
+
+            if (waitThisStream) {
                stream->locked_wait();
            }
+        } else {
+            if (waitThisStream) {
+                LockedAccessor_StreamCrit_t streamCrit(stream->_criticalData);
+
+                // The last marker will provide appropriate visibility:
+                if (!streamCrit->_av.get_is_empty()) {
+                    depOps.push_back(streamCrit->_av.create_marker(hc::accelerator_scope));
+                    tprintf(DB_SYNC, "  push marker to wait for stream=%s\n", ToString(stream).c_str());
+                } else {
+                    tprintf(DB_SYNC, "  skipped stream=%s since it is empty\n", ToString(stream).c_str());
+                }
+            }
        }
    }
+
+    
+    // Enqueue a barrier to wait on all the barriers we sent above:
+    if (!HIP_SYNC_NULL_STREAM && !depOps.empty()) {
+        LockedAccessor_StreamCrit_t defaultStreamCrit(_defaultStream->_criticalData);
+        tprintf(DB_SYNC, "  null-stream wait on %zu non-empty streams. sync_host=%d\n", depOps.size(), syncHost);
+        hc::completion_future defaultCf = defaultStreamCrit->_av.create_blocking_marker(depOps.begin(), depOps.end(), hc::accelerator_scope);
+        if (syncHost) {
+            defaultCf.wait(); // TODO - account for active or blocking here.
+        }
+    }
+
+    tprintf(DB_SYNC, "  syncDefaultStream depOps=%zu\n", depOps.size());
+
 }


@@ -1244,7 +1209,6 @@ void HipReadEnv()
    READ_ENV_I(release, HIP_API_BLOCKING, 0, "Make HIP APIs 'host-synchronous', so they block until completed.  Impacts hipMemcpyAsync, hipMemsetAsync." );


-    READ_ENV_I(release, HIP_MAX_QUEUES, 0, "Maximum number of queues that this app will use per-device.  Additional streams will share the specified number of queues.  0=no limit.");

    READ_ENV_C(release, HIP_DB, 0,  "Print debug info.  Bitmask (HIP_DB=0xff) or flags separated by '+' (HIP_DB=api+sync+mem+copy)", HIP_DB_callback);
    if ((HIP_DB & (1<<DB_API))  && (HIP_TRACE_API == 0)) {
@@ -1271,14 +1235,14 @@ void HipReadEnv()
    READ_ENV_I(release, HIP_FAIL_SOC, 0, "Fault on Sub-Optimal-Copy, rather than use a slower but functional implementation.  Bit 0x1=Fail on async copy with unpinned memory.  Bit 0x2=Fail peer copy rather than use staging buffer copy");

    READ_ENV_I(release, HIP_SYNC_HOST_ALLOC, 0, "Sync before and after all host memory allocations.  May help stability");
+    READ_ENV_I(release, HIP_SYNC_NULL_STREAM, 0, "Synchronize on host for null stream submissions");

-    // TODO - review, can we remove this?
-    READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Max number of inflight kernels per stream before active synchronization is forced.");

    READ_ENV_I(release, HIP_COHERENT_HOST_ALLOC, 0, "If set, all host memory will be allocated as fine-grained system memory.  This allows threadfence_system to work but prevents host memory from being cached on GPU which may have performance impact.");


-    READ_ENV_I(release, HCC_OPT_FLUSH, 0, "Note this flag also impact HCC.  When set, use agent-scope flush rather than system-scope flush when possible.");
+    READ_ENV_I(release, HCC_OPT_FLUSH, 0, "When set, use agent-scope fence operations rather than system-scope fence operationsflush when possible. This flag controls both HIP and HCC behavior.");
+    READ_ENV_I(release, HIP_EVENT_SYS_RELEASE, 0, "If set, event are created with hipEventReleaseToSystem by default.  If 0, events are created with hipEventReleaseToDevice by default.  The defaults can be overridden by specifying hipEventReleaseToSystem or hipEventReleaseToDevice flag when creating the event.");

    // Some flags have both compile-time and runtime flags - generate a warning if user enables the runtime flag but the compile-time flag is disabled.
    if (HIP_DB && !COMPILE_HIP_DB) {
@@ -1389,6 +1353,14 @@ void ihipInit()
            g_deviceCnt++;
        }
    }
+
+    g_allAgents = static_cast<hsa_agent_t*> (malloc((g_deviceCnt+1) * sizeof(hsa_agent_t)));
+    g_allAgents[0] = g_cpu_agent;
+    for (int i=0; i<g_deviceCnt; i++) {
+        g_allAgents[i+1] = g_deviceArray[i]->_hsaAgent;
+    }
+
+
    g_numLogicalThreads = std::thread::hardware_concurrency();

    // If HIP_VISIBLE_DEVICES is not set, make sure all devices are initialized
@@ -1411,17 +1383,49 @@ void ihipInit()
 hipStream_t ihipSyncAndResolveStream(hipStream_t stream)
 {
    if (stream == hipStreamNull ) {
-        ihipCtx_t *device = ihipGetTlsDefaultCtx();
+        // Submitting to NULL stream, call locked_syncDefaultStream to wait for all other streams:
+        ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
+        tprintf(DB_SYNC, "ihipSyncAndResolveStream %s wait on default stream\n", ToString(stream).c_str());

 #ifndef HIP_API_PER_THREAD_DEFAULT_STREAM
-        device->locked_syncDefaultStream(false);
+        ctx->locked_syncDefaultStream(false, false);
 #endif
-        return device->_defaultStream;
+        return ctx->_defaultStream;
    } else {
-        // ALl streams have to wait for legacy default stream to be empty:
+        // Submitting to a "normal" stream, just wait for null stream:
        if (!(stream->_flags & hipStreamNonBlocking))  {
-            tprintf(DB_SYNC, "%s wait default stream\n", ToString(stream).c_str());
-            stream->getCtx()->_defaultStream->locked_wait();
+            if (HIP_SYNC_NULL_STREAM) {
+                tprintf(DB_SYNC, "ihipSyncAndResolveStream %s host-wait on default stream\n", ToString(stream).c_str());
+                stream->getCtx()->_defaultStream->locked_wait();
+            } else {
+                ihipStream_t *defaultStream = stream->getCtx()->_defaultStream;
+
+
+                bool needGatherMarker = false; // used to gather together other markers.
+                hc::completion_future dcf;
+                {
+                    LockedAccessor_StreamCrit_t defaultStreamCrit(defaultStream->criticalData());
+                    // TODO - could call create_blocking_marker(queue) or uses existing marker.
+                    if (!defaultStreamCrit->_av.get_is_empty()) {
+                        needGatherMarker = true;
+
+                        tprintf(DB_SYNC, "  %s adding marker to default %s for dependency\n", 
+                                ToString(stream).c_str(), ToString(defaultStream).c_str());
+                        dcf = defaultStreamCrit->_av.create_marker(hc::accelerator_scope);
+                    } else {
+                        tprintf(DB_SYNC, "  %s skipping marker since default stream is empty\n", ToString(stream).c_str());
+                    }
+                }
+
+                if (needGatherMarker) {
+                    // ensure any commands sent to this stream wait on the NULL stream before continuing
+                    LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData());
+                    // TODO - could be "noret" version of create_blocking_marker
+                    thisStreamCrit->_av.create_blocking_marker(dcf, hc::accelerator_scope);
+                    tprintf(DB_SYNC, "  %s adding marker to wait for freshly recorded default-stream marker \n", 
+                            ToString(stream).c_str());
+                }
+            }
        }

        return stream;
@@ -1431,7 +1435,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream)
 void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream)
 {

-    if ((HIP_TRACE_API & (1<<TRACE_CMD)) || HIP_PROFILE_API || (COMPILE_HIP_DB && HIP_TRACE_API)) {
+    if ((HIP_TRACE_API & (1<<TRACE_KCMD)) || HIP_PROFILE_API || (COMPILE_HIP_DB & HIP_TRACE_API)) {
        std::stringstream os_pre;
        std::stringstream os;
        os_pre  << "<<hip-api tid:";
@@ -1900,13 +1904,8 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes,
        printPointerInfo(DB_COPY, "  dst", dst, dstPtrInfo);
        printPointerInfo(DB_COPY, "  src", src, srcPtrInfo);

-        this->ensureHaveQueue(crit);

-#if USE_COPY_EXT_V2
        crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? &copyDevice->getDevice()->_acc : nullptr, forceUnpinnedCopy);
-#else
-        crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceUnpinnedCopy);
-#endif
    }
 }

@@ -2011,21 +2010,12 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes

            // Perform fast asynchronous copy - we know copyDevice != NULL based on check above
            try {
-                this->ensureHaveQueue(crit);

                if (HIP_FORCE_SYNC_COPY) {
-#if USE_COPY_EXT_V2
                    crit->_av.copy_ext      (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, &copyDevice->getDevice()->_acc, forceUnpinnedCopy);
-#else
-                    crit->_av.copy_ext      (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceUnpinnedCopy);
-#endif

                } else {
-#if USE_COPY_EXT_V2
                    crit->_av.copy_async_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, &copyDevice->getDevice()->_acc);
-#else
-                    crit->_av.copy_async(src, dst, sizeBytes);
-#endif
                }
            } catch (Kalmar::runtime_exception) {
                throw ihipException(hipErrorRuntimeOther);
@@ -2056,13 +2046,8 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes
            // Perform slow synchronous copy:
            LockedAccessor_StreamCrit_t crit(_criticalData);

-            this->ensureHaveQueue(crit);

-#if USE_COPY_EXT_V2
            crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? &copyDevice->getDevice()->_acc : nullptr, forceUnpinnedCopy);
-#else
-            crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceUnpinnedCopy);
-#endif
        }
    }
 }
@@ -2115,7 +2100,6 @@ hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc)


 //---
-// Warning - with HIP_MAX_QUEUES!=0 there is no mechanism to prevent accelerator_view from being re-assigned...
 hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av)
 {
    HIP_INIT_API(stream, av);
@@ -66,6 +66,8 @@ extern int HIP_COHERENT_HOST_ALLOC;
 // Chicken bits for disabling functionality to work around potential issues:
 extern int HIP_SYNC_HOST_ALLOC;

+extern int HIP_SYNC_NULL_STREAM;
+
 // TODO - remove when this is standard behavior.
 extern int HCC_OPT_FLUSH;

@@ -187,10 +189,11 @@ extern const char *API_COLOR_END;


 //---
-//HIP Trace modes
-#define TRACE_ALL 0 // 0x1
-#define TRACE_CMD 1 // 0x2
-#define TRACE_MEM 2 // 0x4
+//HIP Trace modes - use with HIP_TRACE_API=...
+#define TRACE_ALL  0 // 0x1
+#define TRACE_KCMD 1 // 0x2, kernel command
+#define TRACE_MCMD 2 // 0x4, memory command
+#define TRACE_MEM  3 // 0x8, memory allocation or deallocation.


 //---
@@ -275,12 +278,13 @@ extern void recordApiTrace(std::string *fullStr, const std::string &apiStr);
    API_TRACE(0, __VA_ARGS__);


-// Like above, but will trace with DB_CMD.
-// Replace HIP_INIT_API with this call inside important APIs that launch work on the GPU:
+// Like above, but will trace with a specified "special" bit.
+// Replace HIP_INIT_API with this call inside HIP APIs that launch work on the GPU:
 // kernel launches, copy commands, memory sets, etc.
-#define HIP_INIT_CMD_API(...) \
+#define HIP_INIT_SPECIAL_API(tbit, ...) \
    HIP_INIT()\
-    API_TRACE((HIP_TRACE_API&(1<<TRACE_CMD)), __VA_ARGS__);
+    API_TRACE((HIP_TRACE_API&(1<<tbit)), __VA_ARGS__);
+

 // This macro should be called at the end of every HIP API, and only at the end of top-level hip APIS (not internal hip)
 // It has dual function: logs the last error returned for use by hipGetLastError,
@@ -443,7 +447,6 @@ public:
    ihipStreamCriticalBase_t(ihipStream_t *parentStream, hc::accelerator_view av) :
        _kernelCnt(0),
        _av(av),
-        _hasQueue(true),
        _parent(parentStream)
    {
    };
@@ -469,11 +472,6 @@ public:
    uint32_t                    _kernelCnt;    // Count of inflight kernels in this stream.  Reset at ::wait().

    hc::accelerator_view        _av;
-
-    // True if the stream has an allocated queue (accelerato_view) for its use:
-    // Always true at ihipStream creation but queue may later be stolen.
-    // This acts as a valid bit for the _av.
-    bool                        _hasQueue;
 private:
 };

@@ -519,8 +517,10 @@ public:
    void                 locked_waitEvent(hipEvent_t event);
    void                 locked_recordEvent(hipEvent_t event);

+    ihipStreamCritical_t  &criticalData() { return _criticalData; };

    //---
+    hc::hcWaitMode waitMode() const;

    // Use this if we already have the stream critical data mutex:
    void                 wait(LockedAccessor_StreamCrit_t &crit);
@@ -538,12 +538,13 @@ public:
    const ihipDevice_t *     getDevice() const;
    ihipCtx_t *              getCtx() const;

-    void ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCrit);
+    // Before calling this function, stream must be resolved from "0" to the actual stream:
+    bool isDefaultStream() const { return _id == 0; };

 public:
    //---
    //Public member vars - these are set at initialization and never change:
-    SeqNum_t                    _id;   // monotonic sequence ID
+    SeqNum_t                    _id;   // monotonic sequence ID.  0 is the default stream.
    unsigned                    _flags;


@@ -562,6 +563,7 @@ private:

    void addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes);

+
 public: // TODO - move private
    // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t
    ihipStreamCritical_t        _criticalData;
@@ -584,10 +586,10 @@ private: // Data
 //----
 // Internal event structure:
 enum hipEventStatus_t {
-    hipEventStatusUnitialized = 0, // event is unutilized, must be "Created" before use.
-    hipEventStatusCreated     = 1,
-    hipEventStatusRecording   = 2, // event has been enqueued to record something.
-    hipEventStatusRecorded    = 3, // event has been recorded - timestamps are valid.
+    hipEventStatusUnitialized = 0, // event is uninitialized, must be "Created" before use.
+    hipEventStatusCreated     = 1, // event created, but not yet Recorded
+    hipEventStatusRecording   = 2, // event has been recorded into a stream but not completed yet.
+    hipEventStatusComplete    = 3, // event has been recorded - timestamps are valid.
 } ;

 // TODO - rename to ihip type of some kind
@@ -601,8 +603,8 @@ enum ihipEventType_t {
 class ihipEvent_t {
 public:
    ihipEvent_t(unsigned flags);
-    void attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType);
-    void setTimestamp();
+    void attachToCompletionFuture(const hc::completion_future *cf, hipStream_t stream, ihipEventType_t eventType);
+    void refereshEventStatus();
    uint64_t timestamp() const { return _timestamp; } ;
    ihipEventType_t type() const { return _type; };

@@ -784,11 +786,7 @@ public: // Functions:
    void locked_removeStream(ihipStream_t *s);
    void locked_reset();
    void locked_waitAllStreams();
-    void locked_syncDefaultStream(bool waitOnSelf);
-
-    // Will allocate a queue and assign it to the needyStream:
-    hc::accelerator_view  stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream);
-    hc::accelerator_view createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit);
+    void locked_syncDefaultStream(bool waitOnSelf, bool syncHost);

    ihipCtxCritical_t  &criticalData() { return _criticalData; };

@@ -826,6 +824,7 @@ private:  // Critical data, protected with locked access:
 extern std::once_flag hip_initialized;
 extern unsigned g_deviceCnt;
 extern hsa_agent_t g_cpu_agent ;   // the CPU agent.
+extern hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents.

 //=================================================================================================
 // Extern functions:
@@ -59,31 +59,40 @@ hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyK
 }

 // return 0 on success or -1 on error:
-int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags)
+int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags)
 {
    int ret = 0;

    auto device = ctx->getWriteableDevice();

    hc::am_memtracker_update(ptr, device->_deviceId, hipFlags);
-    int peerCnt=0;
-    {
-        LockedAccessor_CtxCrit_t crit(ctx->criticalData());
-        // the peerCnt always stores self so make sure the trace actually
-        peerCnt = crit->peerCnt();
-        tprintf(DB_MEM, "  allow access to %d other peer(s)\n", peerCnt-1);
-        if (peerCnt > 1) {

-            //printf ("peer self access\n");
+    if (shareWithAll) {
+        hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr);
+        tprintf (DB_MEM, "    allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); 
+        if (s != HSA_STATUS_SUCCESS) {
+            ret = -1;
+        }
+    } else {
+        int peerCnt=0;
+        {
+            LockedAccessor_CtxCrit_t crit(ctx->criticalData());
+            // the peerCnt always stores self so make sure the trace actually
+            peerCnt = crit->peerCnt();
+            tprintf(DB_MEM, "  allow access to %d other peer(s)\n", peerCnt-1);
+            if (peerCnt > 1) {

-            // TODOD - remove me:
-            for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) {
-                tprintf (DB_MEM, "    allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":"");
-            };
+                //printf ("peer self access\n");

-            hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr);
-            if (s != HSA_STATUS_SUCCESS) {
-                ret = -1;
+                // TODOD - remove me:
+                for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) {
+                    tprintf (DB_MEM, "    allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":"");
+                };
+
+                hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr);
+                if (s != HSA_STATUS_SUCCESS) {
+                    ret = -1;
+                }
            }
        }
    }
@@ -96,7 +105,7 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags)

 // Allocate a new pointer with am_alloc and share with all valid peers.
 // Returns null-ptr if a memory error occurs (either allocation or sharing)
-void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsigned amFlags, unsigned hipFlags)
+void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, bool shareWithAll, unsigned amFlags, unsigned hipFlags)
 {

    void *ptr = nullptr;
@@ -104,11 +113,11 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsig
    auto device = ctx->getWriteableDevice();

    ptr = hc::am_alloc(sizeBytes, device->_acc, amFlags);
-    tprintf(DB_MEM, " alloc %s ptr:%p size:%zu on dev:%d\n",
-            msg, ptr, sizeBytes, device->_deviceId);
+    tprintf(DB_MEM, " alloc %s ptr:%p-%p size:%zu on dev:%d\n",
+            msg, ptr, static_cast<char*>(ptr)+sizeBytes, sizeBytes, device->_deviceId);

    if (ptr != nullptr) {
-        int r = sharePtr(ptr, ctx, hipFlags);
+        int r = sharePtr(ptr, ctx, shareWithAll, hipFlags);
        if (r != 0) {
            ptr = nullptr;
        }
@@ -193,7 +202,8 @@ hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsi
        hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0);
        am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, hostPointer);
        if (status == AM_SUCCESS) {
-            *devicePointer = amPointerInfo._devicePointer;
+            *devicePointer = static_cast<char*>(amPointerInfo._devicePointer) + (static_cast<char*>(hostPointer) - static_cast<char*>(amPointerInfo._hostPointer)) ;
+            tprintf(DB_MEM, " host_ptr=%p returned device_pointer=%p\n", hostPointer, *devicePointer);
        } else {
            e = hipErrorMemoryAllocation;
        }
@@ -204,7 +214,7 @@ hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsi

 hipError_t hipMalloc(void** ptr, size_t sizeBytes)
 {
-    HIP_INIT_API(ptr, sizeBytes);
+    HIP_INIT_SPECIAL_API((TRACE_MEM), ptr, sizeBytes);
    HIP_SET_DEVICE();
    hipError_t  hip_status = hipSuccess;

@@ -220,7 +230,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)

    } else {
        auto device = ctx->getWriteableDevice();
-        *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx,  0/*amFlags*/, 0/*hipFlags*/);
+        *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx,  false/*shareWithAll*/, 0/*amFlags*/, 0/*hipFlags*/);

        if(sizeBytes  && (*ptr == NULL)){
            hip_status = hipErrorMemoryAllocation;
@@ -235,7 +245,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)

 hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
 {
-    HIP_INIT_CMD_API(ptr, sizeBytes, flags);
+    HIP_INIT_SPECIAL_API((TRACE_MEM), ptr, sizeBytes, flags);
    HIP_SET_DEVICE();
    hipError_t hip_status = hipSuccess;

@@ -253,20 +263,42 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
    } else {
        unsigned trueFlags = flags;
        if (flags == hipHostMallocDefault) {
-            trueFlags = hipHostMallocMapped | hipHostMallocWriteCombined;
+            // HCC/ROCM provide a modern system with unified memory and should set both of these flags by default:
+            trueFlags = hipHostMallocMapped | hipHostMallocPortable;
        }

-        const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | hipHostMallocWriteCombined;

-        if (flags & ~supportedFlags) {
+        const unsigned supportedFlags = hipHostMallocPortable 
+                                      | hipHostMallocMapped 
+                                      | hipHostMallocWriteCombined 
+                                      | hipHostMallocCoherent 
+                                      | hipHostMallocNonCoherent;
+
+
+        const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent;
+
+        if ((flags & ~supportedFlags) ||
+            ((flags & coherencyFlags) == coherencyFlags)) {
+            *ptr = nullptr;
+             // can't specify unsupported flags, can't specify both Coherent + NonCoherent
            hip_status = hipErrorInvalidValue;
-        }
-        else {
+        } else {
            auto device = ctx->getWriteableDevice();
-            unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned;
+            
+            unsigned amFlags = 0;
+            if (flags & hipHostMallocCoherent) {
+                amFlags = amHostCoherent;
+            } else if (flags & hipHostMallocNonCoherent) {
+                amFlags = amHostPinned;
+            } else {
+                // depends on env variables:
+                amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned;
+            }
+
+
+            *ptr = hip_internal::allocAndSharePtr((amFlags & amHostCoherent) ? "finegrained_host":"pinned_host",
+                                                  sizeBytes, ctx, (trueFlags & hipHostMallocPortable) /*shareWithAll*/, amFlags, flags);

-            *ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? "finegrained_host":"pinned_host",
-                                                  sizeBytes, ctx, amFlags, flags);
            if(sizeBytes  && (*ptr == NULL)){
                hip_status = hipErrorMemoryAllocation;
            }
@@ -296,7 +328,7 @@ hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags)
 // width in bytes
 hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height)
 {
-    HIP_INIT_CMD_API(ptr, pitch, width, height);
+    HIP_INIT_SPECIAL_API((TRACE_MEM), ptr, pitch, width, height);
    HIP_SET_DEVICE();
    hipError_t  hip_status = hipSuccess;

@@ -314,7 +346,7 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height
        auto device = ctx->getWriteableDevice();

        const unsigned am_flags = 0;
-        *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, am_flags, 0);
+        *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, false/*shareWithAll*/, am_flags, 0);

        if (sizeBytes && (*ptr == NULL)) {
            hip_status = hipErrorMemoryAllocation;
@@ -337,7 +369,7 @@ hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannel
 hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
        size_t width, size_t height, unsigned int flags)
 {
-    HIP_INIT_CMD_API(array, desc, width, height, flags);
+    HIP_INIT_SPECIAL_API((TRACE_MEM), array, desc, width, height, flags);
    HIP_SET_DEVICE();
    hipError_t  hip_status = hipSuccess;

@@ -373,7 +405,7 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
                hip_status = hipErrorUnknown;
                break;
        }
-        *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, am_flags, 0);
+        *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false/*shareWithAll*/, am_flags, 0);
        if (size && (*ptr == NULL)) {
            hip_status = hipErrorMemoryAllocation;
        }
@@ -478,7 +510,7 @@ hipError_t hipHostUnregister(void *hostPtr)

 hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind)
 {
-    HIP_INIT_CMD_API(symbolName, src, count, offset, kind);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, src, count, offset, kind);

    if(symbolName == nullptr)
    {
@@ -513,7 +545,7 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t cou

 hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count, size_t offset, hipMemcpyKind kind)
 {
-    HIP_INIT_CMD_API(symbolName, dst, count, offset, kind);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, dst, count, offset, kind);

    if(symbolName == nullptr)
    {
@@ -548,7 +580,7 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count,

 hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind, hipStream_t stream)
 {
-    HIP_INIT_CMD_API(symbolName, src, count, offset, kind, stream);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, src, count, offset, kind, stream);

    if(symbolName == nullptr)
    {
@@ -586,7 +618,7 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_

 hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t count, size_t offset, hipMemcpyKind kind, hipStream_t stream)
 {
-    HIP_INIT_CMD_API(symbolName, dst, count, offset, kind, stream);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, dst, count, offset, kind, stream);

    if(symbolName == nullptr)
    {
@@ -625,7 +657,7 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t co
 //---
 hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind)
 {
-    HIP_INIT_CMD_API(dst, src, sizeBytes, kind);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes, kind);

    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);

@@ -647,7 +679,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind

 hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes)
 {
-    HIP_INIT_CMD_API(dst, src, sizeBytes);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes);

    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);

@@ -669,7 +701,7 @@ hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes)

 hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes)
 {
-    HIP_INIT_CMD_API(dst, src, sizeBytes);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes);

    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);

@@ -691,7 +723,7 @@ hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes)

 hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes)
 {
-    HIP_INIT_CMD_API(dst, src, sizeBytes);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes);

    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);

@@ -713,7 +745,7 @@ hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeByte

 hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes)
 {
-    HIP_INIT_CMD_API(dst, src, sizeBytes);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes);

    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);

@@ -738,7 +770,7 @@ hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes)

 hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
 {
-    HIP_INIT_CMD_API(dst, src, sizeBytes, kind, stream);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes, kind, stream);

    return ihipLogStatus(hip_internal::memcpyAsync(dst, src, sizeBytes, kind, stream));

@@ -747,21 +779,21 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp

 hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, void* src, size_t sizeBytes, hipStream_t stream)
 {
-    HIP_INIT_CMD_API(dst, src, sizeBytes, stream);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes, stream);

    return ihipLogStatus(hip_internal::memcpyAsync(dst, src, sizeBytes, hipMemcpyHostToDevice, stream));
 }

 hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes, hipStream_t stream)
 {
-    HIP_INIT_CMD_API(dst, src, sizeBytes, stream);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes, stream);

    return ihipLogStatus(hip_internal::memcpyAsync(dst, src, sizeBytes, hipMemcpyDeviceToDevice, stream));
 }

 hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, hipStream_t stream)
 {
-    HIP_INIT_CMD_API(dst, src, sizeBytes, stream);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, src, sizeBytes, stream);

    return ihipLogStatus(hip_internal::memcpyAsync(dst, src, sizeBytes, hipMemcpyDeviceToHost, stream));
 }
@@ -770,7 +802,7 @@ hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, h
 hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,
        size_t width, size_t height, hipMemcpyKind kind) {

-    HIP_INIT_CMD_API(dst, dpitch, src, spitch, width, height, kind);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, dpitch, src, spitch, width, height, kind);

    if(width > dpitch || width > spitch)
        return ihipLogStatus(hipErrorUnknown);
@@ -793,10 +825,28 @@ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,
    return ihipLogStatus(e);
 }

+hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,
+        size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) {
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, dpitch, src, spitch, width, height, kind, stream);
+    if(width > dpitch || width > spitch)
+        return ihipLogStatus(hipErrorUnknown);
+    hipError_t e = hipSuccess;
+    try {
+        for(int i = 0; i < height; ++i) {
+            e = hip_internal::memcpyAsync((unsigned char*)dst + i*dpitch, (unsigned char*)src + i*spitch, width, kind,stream);
+        }
+    }
+    catch (ihipException ex) {
+        e = ex._code;
+    }
+
+    return ihipLogStatus(e);
+}
+
 hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src,
        size_t spitch, size_t width, size_t height, hipMemcpyKind kind) {

-    HIP_INIT_CMD_API(dst, wOffset, hOffset, src, spitch, width, height, kind);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, wOffset, hOffset, src, spitch, width, height, kind);

    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);

@@ -849,7 +899,7 @@ hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, con
 hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset,
        const void* src, size_t count, hipMemcpyKind kind) {

-    HIP_INIT_CMD_API(dst, wOffset, hOffset, src, count, kind);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, wOffset, hOffset, src, count, kind);

    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);

@@ -908,7 +958,7 @@ ihipMemsetKernel(hipStream_t stream,
 // TODO-sync: function is async unless target is pinned host memory - then these are fully sync.
 hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t stream )
 {
-    HIP_INIT_CMD_API(dst, value, sizeBytes, stream);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, value, sizeBytes, stream);

    hipError_t e = hipSuccess;

@@ -917,7 +967,6 @@ hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t s
    if (stream) {
        auto crit = stream->lockopen_preKernelCommand();

-        stream->ensureHaveQueue(crit);

        hc::completion_future cf ;

@@ -958,7 +1007,7 @@ hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t s

 hipError_t hipMemset(void* dst, int  value, size_t sizeBytes )
 {
-    HIP_INIT_CMD_API(dst, value, sizeBytes);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, value, sizeBytes);

    hipError_t e = hipSuccess;

@@ -969,7 +1018,6 @@ hipError_t hipMemset(void* dst, int  value, size_t sizeBytes )
    if (stream) {
        auto crit = stream->lockopen_preKernelCommand();

-        stream->ensureHaveQueue(crit);
        hc::completion_future cf ;

        if ((sizeBytes & 0x3) == 0) {
@@ -1011,7 +1059,7 @@ hipError_t hipMemset(void* dst, int  value, size_t sizeBytes )

 hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char  value, size_t sizeBytes )
 {
-    HIP_INIT_CMD_API(dst, value, sizeBytes);
+    HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, value, sizeBytes);

    hipError_t e = hipSuccess;

@@ -1022,7 +1070,6 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char  value, size_t sizeByte
    if (stream) {
        auto crit = stream->lockopen_preKernelCommand();

-        stream->ensureHaveQueue(crit);
        hc::completion_future cf ;

        if ((sizeBytes & 0x3) == 0) {
@@ -1118,7 +1165,7 @@ hipError_t hipMemPtrGetInfo(void *ptr, size_t *size)

 hipError_t hipFree(void* ptr)
 {
-    HIP_INIT_API(ptr);
+    HIP_INIT_SPECIAL_API((TRACE_MEM), ptr);

    hipError_t hipStatus = hipErrorInvalidDevicePointer;

@@ -1146,7 +1193,7 @@ hipError_t hipFree(void* ptr)

 hipError_t hipHostFree(void* ptr)
 {
-    HIP_INIT_API(ptr);
+    HIP_INIT_SPECIAL_API((TRACE_MEM), ptr);

    // Synchronize to ensure all work has finished.
    ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish.
@@ -1180,7 +1227,7 @@ hipError_t hipFreeHost(void* ptr)

 hipError_t hipFreeArray(hipArray* array)
 {
-    HIP_INIT_API(array);
+    HIP_INIT_SPECIAL_API((TRACE_MEM), array);

    hipError_t hipStatus = hipErrorInvalidDevicePointer;

@@ -352,14 +352,14 @@ hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char
        *func = sym;
        hmod->funcTrack.push_back(*func);
    }
-    return ihipLogStatus(ret);
+    return ret;
 }


 hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,
                                const char *name){
    HIP_INIT_API(hfunc, hmod, name);
-    return ihipModuleGetSymbol(hfunc, hmod, name);
+    return ihipLogStatus(ihipModuleGetSymbol(hfunc, hmod, name));
 }


@@ -455,10 +455,10 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,


        if (startEvent) {
-            startEvent->attachToCompletionFuture(&cf, hipEventTypeStartCommand);
+            startEvent->attachToCompletionFuture(&cf, hStream, hipEventTypeStartCommand);
        }
        if (stopEvent) {
-            stopEvent->attachToCompletionFuture (&cf, hipEventTypeStopCommand);
+            stopEvent->attachToCompletionFuture (&cf, hStream, hipEventTypeStopCommand);
        }


@@ -525,7 +525,6 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
    }
 }

-
 hipError_t hipModuleLoadData(hipModule_t *module, const void *image)
 {
    HIP_INIT_API(module, image);
@@ -575,3 +574,8 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image)
    }
    return ihipLogStatus(ret);
 }
+
+hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues)
+{
+    return hipModuleLoadData(module, image);
+}
@@ -49,7 +49,7 @@ hipError_t ihipStreamCreate(hipStream_t *stream, unsigned int flags)
            // Obtain mutex access to the device critical data, release by destructor
            LockedAccessor_CtxCrit_t  ctxCrit(ctx->criticalData());

-            auto istream = new ihipStream_t(ctx, ctx->createOrStealQueue(ctxCrit), flags);
+            auto istream = new ihipStream_t(ctx, acc.create_view(), flags);

            ctxCrit->addStream(istream);
            *stream = istream;
@@ -93,20 +93,17 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int

    } else if (event->_state != hipEventStatusUnitialized) {

-        bool fastWait = false;
-
        if (stream != hipStreamNull) {
+
+            // This will user create_blocking_marker to wait on the specified queue.
            stream->locked_waitEvent(event);

-            fastWait = true; // don't use the slow host-side synchronization.
-        }
-
-        if (!fastWait) {
+        } else {
            // TODO-hcc Convert to use create_blocking_marker(...) functionality.
            // Currently we have a super-conservative version of this - block on host, and drain the queue.
            // This should create a barrier packet in the target queue.
+            // TODO-HIP_SYNC_NULL_STREAM
            stream->locked_wait();
-            e = hipSuccess;
        }
    } // else event not recorded, return immediately and don't create marker.

@@ -129,9 +126,7 @@ hipError_t hipStreamQuery(hipStream_t stream)

    {
        LockedAccessor_StreamCrit_t crit(stream->_criticalData);
-        if (crit->_hasQueue) {
-            pendingOps = crit->_av.get_pending_async_ops();
-        }
+        pendingOps = crit->_av.get_pending_async_ops();
    }


@@ -148,10 +143,11 @@ hipError_t hipStreamSynchronize(hipStream_t stream)

    hipError_t e = hipSuccess;

-    if (stream == NULL) {
+    if (stream == hipStreamNull) {
        ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
-        ctx->locked_syncDefaultStream(true/*waitOnSelf*/);
+        ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true/*syncToHost*/);
    } else {
+		// note this does not synchornize with the NULL stream:
        stream->locked_wait();
        e = hipSuccess;
    }
@@ -173,20 +169,18 @@ hipError_t hipStreamDestroy(hipStream_t stream)

    //--- Drain the stream:
    if (stream == NULL) {
-        ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
-        ctx->locked_syncDefaultStream(true/*waitOnSelf*/);
+        e = hipErrorInvalidResourceHandle; // TODO - review - what happens if try to destroy null stream
    } else {
        stream->locked_wait();
-        e = hipSuccess;
-    }

-    ihipCtx_t *ctx = stream->getCtx();
+        ihipCtx_t *ctx = stream->getCtx();

-    if (ctx) {
-        ctx->locked_removeStream(stream);
-        delete stream;
-    } else {
-        e = hipErrorInvalidResourceHandle;
+        if (ctx) {
+            ctx->locked_removeStream(stream);
+            delete stream;
+        } else {
+            e = hipErrorInvalidResourceHandle;
+        }
    }

    return ihipLogStatus(e);
@@ -200,7 +194,7 @@ hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int *flags)

    if (flags == NULL) {
        return ihipLogStatus(hipErrorInvalidValue);
-    } else if (stream == NULL) {
+    } else if (stream == hipStreamNull) {
        return ihipLogStatus(hipErrorInvalidResourceHandle);
    } else {
        *flags = stream->_flags;
@@ -830,16 +830,6 @@ __host__ double erfcinv(double y)
    return __hip_host_erfcinv(y);
 }

-__host__ float erfinvf(float x)
-{
-    return __hip_host_erfinvf(x);
-}
-
-__host__ double erfinv(double x)
-{
-    return __hip_host_erfinv(x);
-}
-
 __host__ double fdivide(double x, double y)
 {
    return x/y;
@@ -947,15 +937,15 @@ __host__ void sincospi(double x, double *sptr, double *cptr)
    *cptr = std::cos(HIP_PI*x);
 }

-__host__ float normcdfinvf(float x)
-{
-    return std::sqrt(2) * erfinv(2*x-1);
-}
+//__host__ float normcdfinvf(float x)
+//{
+//    return std::sqrt(2) * erfinvf(2*x-1);
+//}

-__host__ double normcdfinv(double x)
-{
-    return std::sqrt(2) * erfinv(2*x-1);
-}
+//__host__ double normcdfinv(double x)
+//{
+//    return std::sqrt(2) * erfinv(2*x-1);
+//}

 __host__ float nextafterf(float x, float y)
 {
@@ -59,5 +59,9 @@ Find the test and commandline that fail:
 grep -IR hipMemcpy-modes -IR ../tests/
 ../tests/src/runtimeApi/memory/hipMemcpy.cpp: * RUN_NAMED: %t hipMemcpy-modes --tests 0x1

+# Guidelines for adding new tests

+- Prefer to enhance an existing test as opposed to writing a new one. Tests have overhead to start and many small tests spend precious test time on startup and initialization issues.
+- Make the test run standalone without requirement for command-line arguments.  THis makes it easier to debug since the name of the test is shown in the test report and if you know the name of the test you can the run the test.
+- For long-running tests or tests with multiple phases, consider using the --tests option as an optional mechanism to allow debuggers to start with the failing subset of the test.

@@ -99,7 +99,7 @@ __device__ void double_precision_math_functions()
    normcdf(0.0);
    normcdfinv(1.0);
    pow(1.0, 0.0);
-    rcbrt(1.0);
+    //rcbrt(1.0);
    remainder(2.0, 1.0);
 //    remquo(1.0, 2.0, &iX);
    rhypot(0.0, 1.0);
@@ -109,8 +109,8 @@ __device__ void double_precision_math_functions()
    rnorm4d(0.0, 0.0, 0.0, 1.0);
    round(0.0);
    rsqrt(1.0);
-    scalbln(0.0, 1);
-    scalbn(0.0, 1);
+    //scalbln(0.0, 1);
+    //scalbn(0.0, 1);
    signbit(1.0);
    sin(0.0);
    sincos(0.0, &fX, &fY);
@@ -100,7 +100,7 @@ __device__ void single_precision_math_functions()
    normcdfinvf(1.0f);
    fX = 1.0f; normf(1, &fX);
    powf(1.0f, 0.0f);
-    rcbrtf(1.0f);
+    //rcbrtf(1.0f);
    remainderf(2.0f, 1.0f);
    //remquof(1.0f, 2.0f, &iX);
    rhypotf(0.0f, 1.0f);
@@ -110,8 +110,8 @@ __device__ void single_precision_math_functions()
    fX = 1.0f; rnormf(1, &fX);
    roundf(0.0f);
    rsqrtf(1.0f);
-    scalblnf(0.0f, 1);
-    scalbnf(0.0f, 1);
+    //scalblnf(0.0f, 1);
+    //scalbnf(0.0f, 1);
    signbit(1.0f);
    sincosf(0.0f, &fX, &fY);
    sincospif(0.0f, &fX, &fY);
@@ -99,7 +99,7 @@ inline int zrand(int max)


 //=================================================================================================
-// Functins to run tests
+// Functions to run tests
 //=================================================================================================
 //--
 //Run through a couple simple cases to test lookups and host pointer arithmetic:
@@ -0,0 +1,126 @@
+/* Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+associated documentation files (the "Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the
+following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
+LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
+EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+/* HIT_START
+ * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvcc
+ * RUN: %t
+ * HIT_END
+ */
+
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+#define NUM       1024
+
+#define THREADS_PER_BLOCK_X  4
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void vadd_asm(hipLaunchParm lp,
+                                float *out,
+                                float *in)
+{
+    int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+
+    asm volatile ("v_add_f32_e32 %0, %1, %2" : "=v" (out[i]) : "v"(in[i]),"v" (out[i]));
+}
+
+// CPU implementation of Vector Result
+void addCPUReference(
+    float * output,
+    float * input)
+{
+    for(unsigned int j=0; j < NUM; j++)
+    {
+        
+        output[j]= input[j] + output[j];
+    }
+}
+
+int main(){
+
+    float* VectorA;
+    float* ResultVector;
+    float* VectorB;
+
+    float* gpuVector;
+    float* gpuResultVector;
+
+    int i;
+    int errors;
+
+    VectorA = (float*)malloc(NUM * sizeof(float));
+    ResultVector = (float*)malloc(NUM * sizeof(float));
+    VectorB = (float*)malloc(NUM * sizeof(float));
+
+    // initialize the input data
+    for (i = 0; i < NUM; i++) {
+    VectorA[i] = (float)i*10.0f;
+    VectorB[i] = (float)i*30.0f;
+    }
+
+    // allocate the memory on the device side
+    hipMalloc((void**)&gpuVector, NUM * sizeof(float));
+    hipMalloc((void**)&gpuResultVector, NUM * sizeof(float));
+
+    // Memory transfer from host to device
+    hipMemcpy(gpuVector, VectorA, NUM*sizeof(float), hipMemcpyHostToDevice);
+    hipMemcpy(gpuResultVector, VectorB, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+    // Lauching kernel from host
+    hipLaunchKernel(vadd_asm,
+                    dim3(NUM/THREADS_PER_BLOCK_X),
+                    dim3(THREADS_PER_BLOCK_X),
+                    0, 0,
+                    gpuResultVector , gpuVector);
+
+    // Memory transfer from device to host
+    hipMemcpy(ResultVector, gpuResultVector, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+    // CPU Result computation
+    addCPUReference(VectorB, VectorA);
+
+    // verify the results
+    errors = 0;
+    double eps = 1.0E-3;
+    for (i = 0; i < NUM; i++) {
+    if (std::abs(ResultVector[i] - VectorB[i]) > eps ) {
+        errors++;
+        }
+    }
+    if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+    } else {
+        printf ("PASSED!\n");
+    }
+
+    //free the resources on device side
+    hipFree(gpuVector);
+    hipFree(gpuResultVector);
+
+    hipDeviceReset();
+
+    //free the resources on host side
+    free(VectorA);
+    free(ResultVector);
+    free(VectorB);
+
+    return errors;
+}
@@ -0,0 +1,125 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+#define NUM       1024
+
+#define THREADS_PER_BLOCK_X  4
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void vmac_asm(hipLaunchParm lp,
+                                float *out,
+                                float *in)
+{
+    int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+
+    asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i]));
+}
+
+// CPU implementation of saxpy
+void CPUReference(
+    float * output,
+    float * input)
+{
+    for(unsigned int j=0; j < NUM; j++)
+    {
+        
+        output[j]= a*input[j] + output[j];
+    }
+}
+
+int main(){
+
+    float* VectorA;
+    float* ResultVector;
+    float* VectorB;
+
+    float* gpuVector;
+    float* gpuResultVector;
+
+    const float a = 10.0f
+    int i;
+    int errors;
+
+    VectorA = (float*)malloc(NUM * sizeof(float));
+    ResultVector = (float*)malloc(NUM * sizeof(float));
+    VectorB = (float*)malloc(NUM * sizeof(float));
+
+    // initialize the input data
+    for (i = 0; i < NUM; i++) {
+    VectorA[i] = (float)i*10.0f;
+    VectorB[i] = (float)i*30.0f;
+    }
+
+    // allocate the memory on the device side
+    hipMalloc((void**)&gpuVector, NUM * sizeof(float));
+    hipMalloc((void**)&gpuResultVector, NUM * sizeof(float));
+
+    // Memory transfer from host to device
+    hipMemcpy(gpuVector, VectorA, NUM*sizeof(float), hipMemcpyHostToDevice);
+    hipMemcpy(gpuResultVector, VectorB, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+    // Lauching kernel from host
+    hipLaunchKernel(vmac_asm,
+                    dim3(NUM/THREADS_PER_BLOCK_X),
+                    dim3(THREADS_PER_BLOCK_X),
+                    0, 0,
+                    gpuResultVector , gpuVector);
+
+    // Memory transfer from device to host
+    hipMemcpy(ResultVector, gpuResultVector, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+    // CPU Result computation
+    addCPUReference(VectorB, VectorA);
+
+    // verify the results
+    errors = 0;
+    double eps = 1.0E-3;
+    for (i = 0; i < NUM; i++) {
+    if (std::abs(ResultVector[i] - VectorB[i]) > eps ) {
+        errors++;
+        }
+    }
+    if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+    } else {
+        printf ("PASSED!\n");
+    }
+
+    //free the resources on device side
+    hipFree(gpuVector);
+    hipFree(gpuResultVector);
+
+    hipDeviceReset();
+
+    //free the resources on host side
+    free(VectorA);
+    free(ResultVector);
+    free(VectorB);
+
+    return errors;
+}
@@ -0,0 +1,200 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * RUN: %t
+ * HIT_END
+ */
+
+
+#include "test_common.h"
+
+enum SyncMode {
+    syncNone,
+    syncStream,
+    syncStopEvent,
+};
+
+
+const char *syncModeString(int syncMode) {
+    switch (syncMode) {
+        case syncNone:
+            return "syncNone";
+        case syncStream:
+            return "syncStream";
+        case syncStopEvent:
+            return "syncStopEvent";
+        default:
+            return "unknown";
+    };
+};
+
+
+void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_t stream, int waitStart, SyncMode syncMode)
+{
+    if (!(testMask & p_tests)) {
+        return;
+    }
+    printf ("\ntest 0x%3x: stream=%p waitStart=%d syncMode=%s\n", 
+            testMask, stream, waitStart, syncModeString(syncMode));
+
+    size_t sizeBytes = numElements * sizeof(int);
+
+    int count =100;
+    int init0 = 0;
+    HIPCHECK(hipMemset(C_d, init0, sizeBytes));
+    for (int i=0; i<numElements; i++) {
+        C_h[i] = -1; // initialize
+    }
+
+    hipEvent_t neverCreated=0, neverRecorded, timingDisabled;
+    HIPCHECK(hipEventCreate(&neverRecorded));
+    HIPCHECK(hipEventCreateWithFlags(&timingDisabled, hipEventDisableTiming));
+
+    hipEvent_t start, stop;
+    HIPCHECK(hipEventCreate(&start));
+    HIPCHECK(hipEventCreate(&stop));
+
+    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
+
+    HIPCHECK(hipEventRecord(timingDisabled, stream));
+    // sandwhich a kernel:
+    HIPCHECK(hipEventRecord(start, stream));
+    hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, stream,    C_d, C_h, numElements, count);
+    HIPCHECK(hipEventRecord(stop, stream));
+
+
+    if (waitStart) {
+        HIPCHECK(hipEventSynchronize(start));
+    }
+
+   
+    hipError_t expectedStopError = hipSuccess; 
+
+    // How to wait for the events to finish:
+    switch (syncMode) {
+        case syncNone:
+            expectedStopError = hipErrorNotReady;
+            break;
+        case syncStream:
+            HIPCHECK(hipStreamSynchronize(stream));  // wait for recording to finish...
+            break;
+        case syncStopEvent:
+            HIPCHECK(hipEventSynchronize(stop)); 
+            break;
+        default:
+            assert(0);
+    };
+            
+
+    float t;
+
+    hipError_t e = hipEventElapsedTime(&t, start, start);
+    if ((e != hipSuccess) && (e != hipErrorNotReady))  {
+        failed ("start event not in expected state, was %d=%s\n", e, hipGetErrorName(e));
+    }
+
+    if (e == hipSuccess) 
+        assert (t==0.0f);
+        
+
+    // stop usually ready unless we skipped the synchronization (syncNone)
+    HIPCHECK_API(hipEventElapsedTime(&t, stop, stop), expectedStopError);
+    if (e == hipSuccess) 
+        assert (t==0.0f);
+
+
+    e = hipEventElapsedTime(&t, start, stop);
+    HIPCHECK_API(e, expectedStopError);
+    if (expectedStopError == hipSuccess) 
+        assert (t>0.0f);
+    printf ("time=%6.2f error=%s\n", t, hipGetErrorName(e));
+
+    e = hipEventElapsedTime(&t, stop, start);
+    HIPCHECK_API(e, expectedStopError);
+    if (expectedStopError == hipSuccess) 
+        assert (t<0.0f);
+    printf ("negtime=%6.2f error=%s\n", t, hipGetErrorName(e));
+
+
+
+    {
+        // Check some error conditions for incomplete events:
+        HIPCHECK_API(hipEventElapsedTime(&t, timingDisabled, stop), hipErrorInvalidResourceHandle);
+        HIPCHECK_API(hipEventElapsedTime(&t, start, timingDisabled), hipErrorInvalidResourceHandle);
+
+        HIPCHECK_API(hipEventElapsedTime(&t, neverCreated, stop), hipErrorInvalidResourceHandle);
+        HIPCHECK_API(hipEventElapsedTime(&t, start, neverCreated),  hipErrorInvalidResourceHandle);
+
+        HIPCHECK_API(hipEventElapsedTime(&t, neverRecorded, stop), hipErrorInvalidResourceHandle);
+        HIPCHECK_API(hipEventElapsedTime(&t, start, neverRecorded),  hipErrorInvalidResourceHandle);
+    }
+
+    HIPCHECK(hipEventDestroy(start));
+    HIPCHECK(hipEventDestroy(stop));
+
+    // Clear out everything:
+    HIPCHECK(hipDeviceSynchronize());
+
+    printf ("test:   OK  \n");
+}
+
+
+
+void runTests(int64_t numElements)
+{
+    size_t sizeBytes = numElements * sizeof(int);
+
+    printf ("test: starting sequence with sizeBytes=%zu bytes, %6.2f MB\n", sizeBytes, sizeBytes/1024.0/1024.0);
+
+
+    int *C_h, *C_d;
+    HIPCHECK(hipMalloc(&C_d, sizeBytes));
+    HIPCHECK(hipHostMalloc(&C_h, sizeBytes));
+
+    hipStream_t stream;
+    HIPCHECK(hipStreamCreateWithFlags(&stream, 0x0));
+
+    //for (int waitStart=0; waitStart<2; waitStart++) {
+    for (int waitStart=1; waitStart>=0; waitStart--) {
+        unsigned W = waitStart ? 0x1000:0;
+        test (W | 0x01, C_d, C_h, numElements,  0     , waitStart, syncNone);
+        test (W | 0x02, C_d, C_h, numElements,  stream, waitStart, syncNone);
+        test (W | 0x04, C_d, C_h, numElements,  0     , waitStart, syncStream);
+        test (W | 0x08, C_d, C_h, numElements,  stream, waitStart, syncStream);
+        test (W | 0x10, C_d, C_h, numElements,  0,      waitStart, syncStopEvent);
+        test (W | 0x20, C_d, C_h, numElements,  stream, waitStart, syncStopEvent);
+    }
+
+
+    HIPCHECK(hipStreamDestroy(stream));
+    HIPCHECK(hipFree(C_d));
+    HIPCHECK(hipHostFree(C_h));
+}
+
+
+int main(int argc, char *argv[])
+{
+    HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/);
+
+    runTests(80000000);
+
+    passed();
+}
@@ -21,24 +21,92 @@
   */

 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
 * RUN: %t
 * HIT_END
 */

+#include <vector>
 #include"test_common.h"

 #define LEN 1024*1024
 #define SIZE LEN*sizeof(float)

-__global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd){
+__global__ void Add(float *Ad, float *Bd, float *Cd){
    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    Cd[tx] = Ad[tx] + Bd[tx];
 }

+
+__global__ void Set(int *Ad, int val){
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    Ad[tx] = val;
+}
+
+
+#define SYNC_EVENT 0
+#define SYNC_STREAM 1
+#define SYNC_DEVICE 2
+
+std::vector<std::string> syncMsg = {"event", "stream", "device"};
+
+void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMethod, std::string msg)
+{
+    std::cerr << "test: CheckHostPointer " << msg 
+              //<< " HIP_COHERENT_HOST_ALLOC=" << HIP_COHERENT_HOST_ALLOC
+              //<< " HIP_EVENT_SYS_RELEASE=" <<   HIP_EVENT_SYS_RELEASE
+              << " eventFlags = " << std::hex << eventFlags 
+              << ((eventFlags & hipEventReleaseToDevice) ?  " hipEventReleaseToDevice" : "")  
+              << ((eventFlags & hipEventReleaseToSystem) ? " hipEventReleaseToSystem" : "") 
+              << " ptr=" << ptr 
+              << " syncMethod=" << syncMsg[syncMethod] << "\n";
+
+    hipStream_t s;
+    hipEvent_t e;
+
+    // Init:
+    HIPCHECK(hipStreamCreate(&s));
+    HIPCHECK(hipEventCreateWithFlags(&e, eventFlags))
+    dim3 dimBlock(64,1,1);
+    dim3 dimGrid(numElements/dimBlock.x,1,1);
+
+    const int expected = 13;
+
+    // Init array to know state:
+    hipLaunchKernelGGL(Set, dimGrid, dimBlock, 0, 0x0, ptr, -42);
+    HIPCHECK(hipDeviceSynchronize());
+
+    hipLaunchKernelGGL(Set, dimGrid, dimBlock, 0, s, ptr, expected);
+    HIPCHECK(hipEventRecord(e, s));
+
+    // Host waits for event :
+    switch (syncMethod) {
+        case SYNC_EVENT:
+            HIPCHECK(hipEventSynchronize(e));
+            break;
+        case SYNC_STREAM:
+            HIPCHECK(hipStreamSynchronize(s));
+            break;
+        case SYNC_DEVICE:
+            HIPCHECK(hipDeviceSynchronize());
+            break;
+        default:
+            assert(0);
+    };
+            
+    for (int i=0; i<numElements; i++) {
+        if (ptr[i] != expected) {
+            printf ("mismatch at %d: %d != %d\n", i, ptr[i], expected);
+            assert(ptr[i] == expected);
+        }
+    }
+
+    HIPCHECK(hipStreamDestroy(s));
+    HIPCHECK(hipEventDestroy(e));
+};
+
 int main(){
-    float *A, *B, *C;
-    float *Ad, *Bd, *Cd;
+

    hipDeviceProp_t prop;
    int device;
@@ -49,26 +117,96 @@ int main(){
        failed("Does support HostPinned Memory");
    }

-    HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocWriteCombined | hipHostMallocMapped));
-    HIPCHECK(hipHostMalloc((void**)&B, SIZE, hipHostMallocDefault));
-    HIPCHECK(hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped));

-    HIPCHECK(hipHostGetDevicePointer((void**)&Ad, A, 0));
-    HIPCHECK(hipHostGetDevicePointer((void**)&Cd, C, 0));
+    {
+        float *A, *B, *C;
+        float *Ad, *Bd, *Cd;
+        HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocWriteCombined | hipHostMallocMapped));
+        HIPCHECK(hipHostMalloc((void**)&B, SIZE, hipHostMallocDefault));
+        HIPCHECK(hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped));

-    for(int i=0;i<LEN;i++){
-        A[i] = 1.0f;
-        B[i] = 2.0f;
+        HIPCHECK(hipHostGetDevicePointer((void**)&Ad, A, 0));
+        HIPCHECK(hipHostGetDevicePointer((void**)&Cd, C, 0));
+
+        for(int i=0;i<LEN;i++){
+            A[i] = 1.0f;
+            B[i] = 2.0f;
+        }
+
+        HIPCHECK(hipMalloc((void**)&Bd, SIZE));
+        HIPCHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice));
+
+        dim3 dimGrid(LEN/512,1,1);
+        dim3 dimBlock(512,1,1);
+
+        hipLaunchKernelGGL(Add, dimGrid, dimBlock, 0, 0, Ad, Bd, Cd);
+
+        HIPCHECK(hipDeviceSynchronize());
+
+        HIPCHECK(hipHostFree(A));
+        HIPCHECK(hipHostFree(B));
+        HIPCHECK(hipHostFree(C));
    }

-    HIPCHECK(hipMalloc((void**)&Bd, SIZE));
-    HIPCHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice));
+    {
+        int numElements = 1024*16;
+        size_t sizeBytes = numElements * sizeof (int);

-    dim3 dimGrid(LEN/512,1,1);
-    dim3 dimBlock(512,1,1);
+#ifdef __HIP_PLATFORM_HCC__
+        { 
+            // Stimulate error condition:
+            int *A = &numElements;
+            HIPCHECK_API(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocCoherent|hipHostMallocNonCoherent), hipErrorInvalidValue);

-    hipLaunchKernel(HIP_KERNEL_NAME(Add), dimGrid, dimBlock, 0, 0, Ad, Bd, Cd);
+            assert (A == 0);
+        }
+#endif

+
+        {
+            int *A = nullptr;
+            HIPCHECK(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocNonCoherent));
+            const char *ptrType = "non-coherent"; // TODO
+            CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_DEVICE,   ptrType);
+            CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_STREAM,  ptrType);
+            CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_EVENT,   ptrType);
+
+            // agent-scope releases don't provide host visibility, don't use them here:
+        }
+
+        if (1) { 
+            int *A = nullptr;
+            HIPCHECK(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocCoherent));
+            const char *ptrType = "coherent";
+            CheckHostPointer(numElements, A, hipEventReleaseToDevice, SYNC_DEVICE,   ptrType);
+            CheckHostPointer(numElements, A, hipEventReleaseToDevice, SYNC_STREAM,  ptrType);
+            CheckHostPointer(numElements, A, hipEventReleaseToDevice, SYNC_EVENT,   ptrType);
+
+            CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_DEVICE,   ptrType);
+            CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_STREAM,  ptrType);
+            CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_EVENT,   ptrType);
+        }
+
+
+        // Check defaults:
+        if (1) { 
+            int *A = nullptr;
+            HIPCHECK(hipHostMalloc((void**)&A, sizeBytes));
+            const char *ptrType = "default";
+            CheckHostPointer(numElements, A, 0, SYNC_DEVICE,   ptrType);
+            CheckHostPointer(numElements, A, 0, SYNC_STREAM,  ptrType);
+            CheckHostPointer(numElements, A, 0, SYNC_EVENT,   ptrType);
+            
+            CheckHostPointer(numElements, A, 0, SYNC_DEVICE,   ptrType);
+            CheckHostPointer(numElements, A, 0, SYNC_STREAM,  ptrType);
+            CheckHostPointer(numElements, A, 0, SYNC_EVENT,   ptrType);
+        }
+
+
+
+
+    }
+        
    passed();

 }
@@ -19,9 +19,7 @@ THE SOFTWARE.

 /* HIT_START
 * BUILD: %t %s ../../test_common.cpp
- * RUN: %t --tests 0x1
- * RUN: %t --tests 0x2
- * RUN: %t --tests 0x4
+ * RUN: %t
 * HIT_END
 */

@@ -131,7 +129,7 @@ int main(int argc, char *argv[])
        HIPCHECK(hipMalloc(&Bd, size));

        // TODO - set to 128
-#define OFFSETS_TO_TRY 1 
+#define OFFSETS_TO_TRY 128 
        assert (N>OFFSETS_TO_TRY);

        if (p_tests & 0x2) {
@@ -1,70 +0,0 @@
-/*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
-IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp
- * RUN: %t
- * HIT_END
- */
-
-#include"test_common.h"
-
-#define SIZE 1024*1024*256
-
-int main(){
-    float *Ad, *B, *Bd, *Bm, *C, *Cd, *ptr_0;
-    B = (float*)malloc(SIZE);
-    hipMalloc((void**)&Ad, SIZE);
-    hipHostMalloc((void**)&B, SIZE);
-    hipHostMalloc((void**)&Bd, SIZE, hipHostMallocDefault);
-    hipHostMalloc((void**)&Bm, SIZE, hipHostMallocMapped);
-    hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped);
-
-    hipHostGetDevicePointer((void**)&Cd, C, 0/*flags*/);
-
-    HIPCHECK_API(hipMalloc((void**)&ptr_0,0), hipSuccess);
-
-    HIPCHECK_API(hipFree(Ad) ,  hipSuccess);
-    HIPCHECK_API(hipHostFree(Ad) , hipErrorInvalidValue);
-
-    HIPCHECK_API(hipFree(B)  , hipErrorInvalidDevicePointer); // try to hipFree on malloced memory
-    HIPCHECK_API(hipFree(Bd) , hipErrorInvalidDevicePointer);
-    HIPCHECK_API(hipFree(Bm) , hipErrorInvalidDevicePointer);
-    HIPCHECK_API(hipFree(ptr_0) , hipSuccess);
-    HIPCHECK_API(hipHostFree(Bd) , hipSuccess);
-    HIPCHECK_API(hipHostFree(Bm) , hipSuccess);
-
-    HIPCHECK_API(hipFree(C) , hipErrorInvalidDevicePointer);
-    HIPCHECK_API(hipHostFree(C) , hipSuccess);
-
-
-    HIPCHECK_API(hipFree(NULL) , hipSuccess);
-    HIPCHECK_API(hipHostFree(NULL) , hipSuccess);
-
-    {
-        // Some negative testing - request a too-big allocation and verify it fails:
-        // Someday when we support virtual memory may need to refactor these:
-        size_t tooBig = 128LL*1024*1024*1024*1024;  // 128 TB;
-        void *p;
-        HIPCHECK_API ( hipMalloc(&p, tooBig),  hipErrorMemoryAllocation );
-        HIPCHECK_API ( hipHostMalloc(&p, tooBig),  hipErrorMemoryAllocation );
-    }
-
-    passed();
-}
@@ -0,0 +1,170 @@
+/*
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+// Simple test for memset.
+// Also serves as a template for other tests.
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
+ * RUN: %t 
+ * HIT_END
+ */
+
+#include "hip/hip_runtime.h"
+#include "test_common.h"
+
+#ifdef __HIP_PLATFORM_HCC__
+#include <hc_am.hpp>
+#endif
+
+#define USE_HSA_COPY 1
+
+int enablePeers(int dev0, int dev1)
+{
+    int canAccessPeer01, canAccessPeer10;
+    HIPCHECK(hipDeviceCanAccessPeer(&canAccessPeer01, dev0, dev1));
+    HIPCHECK(hipDeviceCanAccessPeer(&canAccessPeer10, dev1, dev0));
+    if (!canAccessPeer01 || !canAccessPeer10) {
+        return -1;
+    }
+
+    HIPCHECK(hipSetDevice(dev0));
+    HIPCHECK(hipDeviceEnablePeerAccess(dev1, 0/*flags*/));
+    HIPCHECK(hipSetDevice(dev1));
+    HIPCHECK(hipDeviceEnablePeerAccess(dev0, 0/*flags*/));
+
+    return 0;
+};
+
+
+__global__ void
+memsetIntKernel(int * ptr, int val, size_t numElements)
+{
+    int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    if (gid < numElements) {
+        ptr[gid] = val;
+    }
+};
+
+
+void checkReverse(const int *ptr, int numElements, int expected) {
+    for (int i=numElements-1; i>=0; i--) {
+        if (ptr[i] != expected) {
+            printf ("i=%d, ptr[](%d) != expected (%d)\n", i, ptr[i], expected);
+            assert (ptr[i] == expected);
+        }
+    }
+
+    printf ("test:   OK\n");
+}
+
+
+void runTest(bool stepAIsCopy, hipStream_t gpu0Stream, hipStream_t gpu1Stream, int numElements,
+             int * dataGpu0, int *dataGpu1, int *dataHost, int expected)
+{
+    hipEvent_t e;
+    HIPCHECK(hipEventCreateWithFlags(&e,0));
+
+    printf ("test: runTest with %s\n", stepAIsCopy ? "copy" : "kernel");
+    const size_t sizeElements = numElements * sizeof(int);
+
+    hipStream_t stepAStream = gpu0Stream;
+
+    if (stepAIsCopy) {
+#ifdef USE_HSA_COPY
+        HIPCHECK(hipMemcpyAsync(dataGpu1, dataGpu0, sizeElements, hipMemcpyDeviceToDevice, stepAStream));
+#endif
+    } else {
+        assert(0); // not yet supported.
+    }
+
+    HIPCHECK(hipEventRecord(e, stepAStream));
+    HIPCHECK(hipStreamWaitEvent(gpu1Stream, e, 0));
+
+    HIPCHECK(hipMemcpyAsync(dataHost, dataGpu1, sizeElements, hipMemcpyDeviceToHost, gpu1Stream));
+
+    HIPCHECK(hipStreamSynchronize(gpu1Stream));
+
+    checkReverse(dataHost, numElements, expected);
+}
+
+
+void testMultiGpu0(int dev0, int dev1, int numElements)
+{
+    const size_t sizeElements = numElements * sizeof(int);
+
+    int * dataGpu0, *dataGpu1, *dataHost;
+    hipStream_t gpu0Stream, gpu1Stream;
+    const int expected = 42;
+    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
+
+    HIPCHECK(hipSetDevice(dev0));
+
+    HIPCHECK(hipMalloc(&dataGpu0, sizeElements));
+    HIPCHECK(hipStreamCreate(&gpu0Stream));
+    hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, gpu0Stream,
+                       dataGpu0, expected, numElements);
+    HIPCHECK(hipDeviceSynchronize());
+
+
+    HIPCHECK(hipSetDevice(dev1));
+    HIPCHECK(hipMalloc(&dataGpu1, sizeElements));
+    HIPCHECK(hipStreamCreate(&gpu1Stream));
+    hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, gpu0Stream,
+                       dataGpu1, 0x34, numElements);
+    HIPCHECK(hipDeviceSynchronize());
+
+    HIPCHECK(hipHostMalloc(&dataHost, sizeElements));
+    memset(dataHost, 13, sizeElements);
+
+#ifdef __HIP_PLATFORM_HCC__
+    hc::am_memtracker_print(0x0);
+#endif
+    
+    printf ("  test: init complete\n");
+
+    runTest(true/*stepAIsCopy*/, gpu0Stream, gpu1Stream, numElements, dataGpu0, dataGpu1, dataHost, expected);
+
+};
+
+
+
+int main(int argc, char *argv[])
+{
+    HipTest::parseStandardArguments(argc, argv, true);
+
+    int numElements = N;
+
+    int dev0 = 0;
+    int dev1 = 1;
+
+    // TODO - only works on multi-GPU system:
+    if (enablePeers(dev0,dev1) == -1) {
+        printf ("warning : could not find peer gpus\n");
+        return -1;
+    };
+
+    //testMultiGpu0(dev0, dev1, numElements);
+
+
+
+    passed();
+};
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * RUN: %t
+ * HIT_END
+ */
+
+#include<hip/hip_runtime_api.h>
+#include<iostream>
+#include"test_common.h"
+
+int main(){
+  hipFuncCache_t cacheConfig;
+  void *func;
+  hipFuncSetCacheConfig(func, cacheConfig);
+  passed();
+}
+
@@ -29,6 +29,8 @@ THE SOFTWARE.
 #include "hip/hip_runtime.h"
 #include "test_common.h"

+int p_iters=10;
+
 void printSep()
 {
    printf ("======================================================================================\n");
@@ -43,7 +45,7 @@ template<
 	class P=HipTest::Unpinned, 
 	class C=HipTest::Memcpy
 >
-void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream)
+void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream)
 {
 	using HipTest::MemTraits;

@@ -57,6 +59,24 @@ void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream)
    T *A_h, *B_h, *C_h;

    HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, P::isPinned);
+	for (size_t i=0; i<numElements; i++) {
+		A_h[i] = 1000.0f;
+		B_h[i] = 2000.0f;
+		C_h[i] = -1;
+	}
+
+
+	MemTraits<C>::Copy(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream);
+	MemTraits<C>::Copy(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream);
+	MemTraits<C>::Copy(C_d, C_h, Nbytes, hipMemcpyHostToDevice, stream);
+    HIPCHECK (hipDeviceSynchronize());
+
+	for (size_t i=0; i<numElements; i++) {
+		A_h[i] = 1.0f;
+		B_h[i] = 2.0f;
+		C_h[i] = -1;
+	}
+


 	for (int i=0; i<iters; i++) {
@@ -66,7 +86,11 @@ void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream)
 		MemTraits<C>::Copy(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream);
 		MemTraits<C>::Copy(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream);

-		hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
+		//HIPCHECK(hipStreamSynchronize(stream));
+
+		// This is the null stream?
+		//hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
+		hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);

 		MemTraits<C>::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream);

@@ -76,9 +100,9 @@ void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream)
 	}

    HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, P::isPinned);
+	std::cout <<"  pid" << pid << " success\n";
    HIPCHECK (hipDeviceSynchronize());

-	std::cout <<"  pid" << pid << " success\n";
 }

 template<typename T, class C>
@@ -88,12 +112,14 @@ void test_multiThread_1(std::string testName, hipStream_t stream0, hipStream_t s
 	printf ("%s\n", __func__);
 	std::cout << testName << std::endl;

+	size_t numElements = N;
+
 	// Test 2 threads operating on same stream:
-    std::thread t1 (simpleVectorCopy<T, HipTest::Pinned, C>, 2000000/*mb*/, 100/*iters*/, stream0);
+    std::thread t1 (simpleVectorAdd<T, HipTest::Pinned, C>, numElements, p_iters/*iters*/, stream0);
    if (serialize) {
        t1.join();
    }
-    std::thread t2 (simpleVectorCopy<T, HipTest::Pinned, C>, 2000000/*mb*/, 100/*iters*/, stream1);
+    std::thread t2 (simpleVectorAdd<T, HipTest::Pinned, C>, numElements, p_iters/*iters*/, stream1);
    if (serialize) {
        t2.join();
    }
@@ -109,6 +135,7 @@ void test_multiThread_1(std::string testName, hipStream_t stream0, hipStream_t s

 int main(int argc, char *argv[])
 {
+	N = 8000000;
    HipTest::parseStandardArguments(argc, argv, true);

    printf ("info: set device to %d\n", p_gpuDevice);
@@ -121,8 +148,8 @@ int main(int argc, char *argv[])
        hipStream_t stream;
        HIPCHECK (hipStreamCreate(&stream));

-        simpleVectorCopy<float, HipTest::Pinned, HipTest::MemcpyAsync>	(2000000/*mb*/, 10/*iters*/, stream);
-        simpleVectorCopy<float, HipTest::Pinned, HipTest::Memcpy>		(2000000/*mb*/, 10/*iters*/, stream);
+        simpleVectorAdd<float, HipTest::Pinned, HipTest::MemcpyAsync>	(N/*mb*/, 10/*iters*/, stream);
+        simpleVectorAdd<float, HipTest::Pinned, HipTest::Memcpy>		(N/*mb*/, 10/*iters*/, stream);

        HIPCHECK(hipStreamDestroy(stream));
    }
@@ -139,8 +166,8 @@ int main(int argc, char *argv[])
    }

    if (p_tests & 0x4) {
-		test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with NULL stream", NULL, NULL, false);
-		test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with two streams", stream0, stream1, false);
+		//test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with NULL stream", NULL, NULL, false);
+		//test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with two streams", stream0, stream1, false);
 		test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with one stream",  stream0, stream0, false);
 	}

@@ -0,0 +1,300 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
+ * RUN: %t
+ * HIT_END
+ */
+
+
+#include "hip/hip_runtime.h"
+#include "test_common.h"
+#include <vector>
+unsigned p_streams =16;
+int      p_repeat = 10;
+int      p_db     = 0;
+
+
+template <typename T>
+__global__ void
+vectorADDRepeat(hipLaunchParm lp,
+            const T *A_d,
+            const T *B_d,
+            T *C_d,
+            size_t NELEM,
+            int repeat)
+{
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
+
+    for (int j=1; j<=repeat;j++) {
+        for (size_t i=offset; i<NELEM; i+=stride) {
+           C_d[i] = A_d[i]*j + B_d[i]*j;
+        }
+    };
+}
+
+
+
+
+//------
+// Structure for one stream - includes the stream + data buffers that are used by the stream.
+template <typename T>
+class Streamer {
+public:
+    Streamer(size_t numElements, bool useNullStream=false);
+    ~Streamer();
+    void enqueAsync();
+    void queryUntilComplete();
+
+    void reset();
+    void H2D();
+    void D2H();
+
+
+public:
+    T *_A_h;
+    T *_B_h;
+    T *_C_h;
+
+    T *_A_d;
+    T *_B_d;
+    T *_C_d;
+
+    hipStream_t _stream;
+    hipEvent_t  _event;
+
+    size_t      _numElements;
+};
+
+template <typename T>
+Streamer<T>::Streamer(size_t numElements, bool useNullStream) :
+    _numElements(numElements)
+{
+    HipTest::initArrays (&_A_d, &_B_d, &_C_d, &_A_h, &_B_h, &_C_h, numElements, true);
+
+    if (useNullStream) {
+        _stream = 0x0;
+    } else {
+        HIPCHECK(hipStreamCreate(&_stream));
+    }
+    HIPCHECK(hipEventCreate(&_event));
+
+    H2D();
+
+};
+
+template <typename T>
+void Streamer<T>::H2D()
+{
+    HIPCHECK(hipMemcpy(_A_d, _A_h, _numElements*sizeof(T), hipMemcpyHostToDevice));
+    HIPCHECK(hipMemcpy(_B_d, _B_h, _numElements*sizeof(T), hipMemcpyHostToDevice));
+}
+
+template <typename T>
+void Streamer<T>::D2H()
+{
+    HIPCHECK(hipMemcpy(_C_h, _C_d, _numElements*sizeof(T), hipMemcpyDeviceToHost));
+}
+
+template <typename T>
+void Streamer<T>::reset()
+{
+    HipTest::setDefaultData(_numElements, _A_h, _B_h, _C_h);
+    H2D();
+    
+}
+
+
+template <typename T>
+void Streamer<T>::enqueAsync()
+{
+    printf ("testing: %s  numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0);
+    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
+    hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements, p_repeat);
+
+}
+
+template <typename T>
+void Streamer<T>::queryUntilComplete()
+{
+    int numQueries = 0;
+    hipError_t e = hipSuccess;
+    do {
+        numQueries++;
+        e = hipStreamQuery(_stream);
+    } while (e != hipSuccess) ;
+
+    printf ("completed after %d queries\n", numQueries);
+};
+
+
+
+//---
+//Parse arguments specific to this test.
+void parseMyArguments(int argc, char *argv[])
+{
+    int more_argc = HipTest::parseStandardArguments(argc, argv, false);
+
+    // parse args for this test:
+    for (int i = 1; i < more_argc; i++) {
+        const char *arg = argv[i];
+
+        if (!strcmp(arg, "--streams")) {
+            if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) {
+               failed("Bad streams argument");
+            }
+        } else if (!strcmp(arg, "--repeat") || (!strcmp(arg, "-r"))) {
+            if (++i >= argc || !HipTest::parseInt(argv[i], &p_repeat)) {
+               failed("Bad repeat argument");
+            }
+        } else {
+            failed("Bad argument '%s'", arg);
+        }
+    };
+};
+
+
+void
+printBuffer(std::string name, int *f, size_t numElements)
+{
+    std::cout << name << "\n";
+    for (size_t i=0; i<numElements; i++) {
+        printf ("%5zu: %d\n", i, f[i]);
+    }
+}
+
+
+
+
+//---
+int main(int argc, char *argv[])
+{
+    HipTest::parseStandardArguments(argc, argv, false);
+    parseMyArguments(argc, argv);
+
+    typedef Streamer<int> IntStreamer;
+
+    std::vector<IntStreamer *> streamers;
+
+    size_t numElements = N;
+
+    int *expected_H = (int*)malloc(numElements*sizeof(int));
+
+
+    auto nullStreamer = new IntStreamer(numElements, true);
+
+    // Expected resultr - last streamer runs vectorADDRepeat, then nullstreamer adds lastStreamer->_C_d + lastStreamer->_C_d
+    for (size_t i=0; i<numElements; i++) {
+        expected_H[i] = ((nullStreamer->_A_h[i])*p_repeat + (nullStreamer->_B_h[i]) * p_repeat) *2;
+    }
+
+
+    for (int i=0; i<p_streams; i++) {
+        IntStreamer * s = new IntStreamer(numElements);
+        streamers.push_back(s);
+    }
+    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
+
+    for (int s=1; s<p_streams; s++) {
+        if (p_tests & (1<<s)) {
+            printf ("==> Test %x runAsnc, #streams=%d\n", (1<<s), s);
+            nullStreamer->reset();
+
+            for (int i=0; i<s; i++) {
+                streamers[i]->enqueAsync();
+            }
+
+            auto lastStreamer = streamers[s - 1];
+
+            // Dispatch to NULL stream, should wait for prior async activity to complete before beginning:
+            hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/);
+
+
+            if (p_db) {
+                HIPCHECK(hipDeviceSynchronize());
+                lastStreamer->D2H();
+                printBuffer("lastStream _A_h", lastStreamer->_A_h, min(numElements, size_t(20)));
+                printBuffer("lastStream _B_h", lastStreamer->_B_h, min(numElements, size_t(20)));
+                printBuffer("lastStream _C_h", lastStreamer->_C_h, min(numElements, size_t(20)));
+            }
+            nullStreamer->D2H();
+            HIPCHECK(hipDeviceSynchronize());
+
+            HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); 
+        }
+    }
+
+
+    for (int s=1; s<p_streams; s+=2) {
+        unsigned tmask = (0x10000 | (1<<s));
+        if (p_tests & tmask) {
+            nullStreamer->reset();
+            printf ("==> Test %x runAsnc-odd-only, #streams=%d\n", tmask, s);
+            for (int i=0; i<s; i++) {
+                // RUn just odd streams so we have some empty ones to examine/optimize:
+                if (i & 0x1) {
+                    streamers[i]->enqueAsync();
+                }
+            }
+            auto lastStreamer = streamers[s - 1];
+
+            // Dispatch to NULL stream, should wait for prior async activity to complete before beginning:
+            hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/);
+
+            nullStreamer->D2H();
+
+            HIPCHECK(hipDeviceSynchronize());
+
+            HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); 
+        }
+    }
+
+    // Expected resultr - last streamer runs vectorADDRepeat
+    for (size_t i=0; i<numElements; i++) {
+        expected_H[i] = ((nullStreamer->_A_h[i])*p_repeat + (nullStreamer->_B_h[i]) * p_repeat);
+    }
+
+    if (p_tests & 0x20000) {
+
+        assert (p_streams >=2); // need a couple streams in order to run this test.
+        nullStreamer->reset();
+        printf ("\n==> Test hipStreamSynchronize with defaultStream \n");
+
+        // Enqueue a long-running job to stream1
+        streamers[0]->enqueAsync();
+
+        // Check to see if synchronizing on a null stream synchronizes all other streams or just the null stream.
+        // This function follows null stream semantics and will wait for all other blocking streams before returning.
+        // This will wait on the host
+        HIPCHECK(hipStreamSynchronize(0));
+
+        // Copy with stream1, this could go async if the streamSync doesn't synchronize ALL the streams.
+        HIPCHECK(hipMemcpyAsync(streamers[0]->_C_h, streamers[0]->_C_d, streamers[0]->_numElements*sizeof(int), hipMemcpyDeviceToHost, streamers[1]->_stream));
+
+        
+        HIPCHECK(hipDeviceSynchronize());
+
+        HipTest::checkTest(expected_H, streamers[0]->_C_h, numElements); 
+    }
+
+
+    passed();
+}
@@ -0,0 +1,207 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * RUN: %t
+ * HIT_END
+ */
+
+
+#include "test_common.h"
+
+enum SyncMode {
+    syncNone,
+    syncNullStream,
+    syncOtherStream,
+    syncMarkerThenOtherStream,
+    syncMarkerThenOtherNonBlockingStream,
+    syncDevice
+};
+
+
+const char *syncModeString(int syncMode) {
+    switch (syncMode) {
+        case syncNone:
+            return "syncNone";
+        case syncNullStream:
+            return "syncNullStream";
+        case syncOtherStream:
+            return "syncOtherStream";
+        case syncMarkerThenOtherStream:
+            return "syncMarkerThenOtherStream";
+        case syncMarkerThenOtherNonBlockingStream:
+            return "syncMarkerThenOtherNonBlockingStream";
+        case syncDevice:
+            return "syncDevice";
+        default:
+            return "unknown";
+    };
+};
+
+
+void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch)
+{
+
+    // This test sends a long-running kernel to the null stream, then tests to see if the 
+    // specified synchronization technique is effective.
+    //
+    // Some syncMode are not expected to correctly sync (for example "syncNone").  in these 
+    // cases the test sets expectMismatch and the check logic below will attempt to ensure that
+    // the undesired synchronization did not occur - ie ensure the kernel is still running and did
+    // not yet update the stop event.  This can be tricky since if the kernel runs fast enough it
+    // may complete before the check.  To prevent this, the addCountReverse has a count parameter 
+    // which causes it to loop repeatedly, and the results are checked in reverse order.  
+    //
+    // Tests with expectMismatch=true should ensure the kernel finishes correctly. This results
+    // are checked and we test to make sure stop event has completed.
+    
+    if (!(testMask & p_tests)) {
+        return;
+    }
+    printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", 
+            testMask, syncModeString(syncMode), expectMismatch);
+
+    size_t sizeBytes = numElements * sizeof(int);
+
+    int count =100;
+    int init0 = 0;
+    HIPCHECK(hipMemset(C_d, init0, sizeBytes));
+    for (int i=0; i<numElements; i++) {
+        C_h[i] = -1; // initialize
+    }
+
+    hipStream_t otherStream = 0;
+    unsigned flags = (syncMode == syncMarkerThenOtherNonBlockingStream) ?  hipStreamNonBlocking : hipStreamDefault;
+    HIPCHECK(hipStreamCreateWithFlags(&otherStream, flags));
+    hipEvent_t stop, otherStreamEvent;
+    HIPCHECK(hipEventCreate(&stop));
+    HIPCHECK(hipEventCreate(&otherStreamEvent));
+
+
+    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
+    // Launch kernel into null stream, should result in C_h == count.
+    hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/,    C_d, C_h, numElements, count);
+    HIPCHECK(hipEventRecord(stop, 0/*default*/));  
+
+    switch (syncMode) {
+        case syncNone:
+            break;
+        case syncNullStream:
+            HIPCHECK(hipStreamSynchronize(0));  // wait on host for null stream:
+            break;
+        case syncOtherStream:
+            // Does this synchronize with the null stream?
+            HIPCHECK(hipStreamSynchronize(otherStream));  
+            break;
+        case syncMarkerThenOtherStream:
+        case syncMarkerThenOtherNonBlockingStream:
+            
+            // this may wait for NULL stream depending hipStreamNonBlocking flag above
+            HIPCHECK(hipEventRecord(otherStreamEvent, otherStream));  
+
+            HIPCHECK(hipStreamSynchronize(otherStream));  
+            break;
+        case syncDevice:
+            HIPCHECK(hipDeviceSynchronize());  
+            break;
+        default:
+            assert(0);
+    };
+
+    hipError_t done = hipEventQuery(stop);
+
+    if (expectMismatch) {
+        assert (done == hipErrorNotReady);
+    } else {
+        assert (done == hipSuccess);
+    }
+
+    int mismatches = 0;
+    int expected = init0 + count;
+    for (int i=0; i<numElements; i++) {
+        bool compareEqual = (C_h[i] == expected);
+        if (!compareEqual) {
+            mismatches ++;
+            if  (!expectMismatch) {
+                printf ("C_h[%d] (%d) != %d\n", i, C_h[i], expected);
+                assert(C_h[i] == expected);
+            }
+        }
+    }
+
+    if (expectMismatch) {
+        assert (mismatches > 0);
+    }
+
+
+    HIPCHECK(hipStreamDestroy(otherStream));
+    HIPCHECK(hipEventDestroy(stop));
+    HIPCHECK(hipEventDestroy(otherStreamEvent));
+
+    HIPCHECK(hipDeviceSynchronize());
+
+    printf ("test:   OK - %d mismatches (%6.2f%%)\n",  mismatches, ((double)(mismatches)*100.0)/numElements);
+}
+
+
+void runTests(int64_t numElements)
+{
+    size_t sizeBytes = numElements * sizeof(int);
+
+    printf ("\n\ntest: starting sequence with sizeBytes=%zu bytes, %6.2f MB\n", sizeBytes, sizeBytes/1024.0/1024.0);
+
+
+    int *C_h, *C_d;
+    HIPCHECK(hipMalloc(&C_d, sizeBytes));
+    HIPCHECK(hipHostMalloc(&C_h, sizeBytes));
+
+
+    {
+        test (0x01, C_d, C_h, numElements,  syncNone,                             true /*expectMismatch*/);
+        test (0x02, C_d, C_h, numElements,  syncNullStream,                       false /*expectMismatch*/);
+        test (0x04, C_d, C_h, numElements,  syncOtherStream,                      true /*expectMismatch*/);
+        test (0x08, C_d, C_h, numElements,  syncDevice,                           false /*expectMismatch*/);
+
+        // Sending a marker to to null stream may synchronize the otherStream
+        //  - other created with hipStreamNonBlocking=0 : synchronization, should match
+        //  - other created with hipStreamNonBlocking=1 : no synchronization, may mismatch
+        test (0x10, C_d, C_h, numElements,  syncMarkerThenOtherStream,            false /*expectMismatch*/);
+
+        // TODO - review why this test seems flaky
+        //test (0x20, C_d, C_h, numElements,  syncMarkerThenOtherNonBlockingStream, true /*expectMismatch*/);
+    }
+
+
+    HIPCHECK(hipFree(C_d));
+    HIPCHECK(hipHostFree(C_h));
+}
+
+
+int main(int argc, char *argv[])
+{
+    // Can' destroy the default stream:// TODO - move to another test
+    HIPCHECK_API(hipStreamDestroy(0), hipErrorInvalidResourceHandle); 
+
+    HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/);
+
+    runTests(40000000);
+
+    passed();
+}
@@ -18,7 +18,7 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
 * RUN: %t
 * HIT_END
 */
@@ -28,60 +28,162 @@ THE SOFTWARE.
 #include "hip/hip_runtime.h"
 #include "test_common.h"
 #include <vector>
-unsigned p_streams = 6;
+#include <limits>
+unsigned p_streams = 8;
+unsigned p_db = 0;
+unsigned p_count = 100;
+
+
+


 //------
 // Structure for one stream;
 template <typename T>
 class Streamer {
+
+#define COMMAND_ADD_FORWARD 0
+#define COMMAND_ADD_REVERSE 1
+#define COMMAND_COPY        2
+
+
 public:
-    Streamer(size_t numElements);
+    Streamer(int deviceId, T *input, size_t numElements, int commandType);
    ~Streamer();
-    void runAsync();
+    void runAsyncAfter(Streamer<T> *depStreamer, bool waitSameStream=false);
+    void runAsyncWaitSameStream();
    void queryUntilComplete();

+    size_t check(int streamerNum, T initValue, T expectedOffset, bool expectPass=true);
+    void copyToHost(hipStream_t copyStream);

+    hipEvent_t event() { return _event; };
+
+    int deviceId() const { return _deviceId; };
+    size_t mismatchCount() const { return _mismatchCount; };
+    T *C_d() { return _C_d; };
+
+    // How much does this streamer add to A[i] after running runAsyncAfter
+    int expectedAdd() const { return (_commandType == COMMAND_COPY) ? 0 : p_count; };
+
+
+    int         _commandType;  // 0=addReverse, 1=addFwd, 2=move
 private:
-    T *_A_h;
-    T *_B_h;
+
    T *_C_h;

+    T *_preA_d; // if input is on another device, this is pointer to that memory.
    T *_A_d;
-    T *_B_d;
    T *_C_d;

    hipStream_t _stream;
    hipEvent_t  _event;

+    int         _deviceId;
    size_t      _numElements;
+
+    size_t      _mismatchCount;
 };

+
 template <typename T>
-Streamer<T>::Streamer(size_t numElements) :
-    _numElements(numElements)
+Streamer<T>::Streamer(int deviceId, T * A_d, size_t numElements, int commandType) :
+    _preA_d(NULL), 
+    _A_d(A_d),
+    _deviceId(deviceId),
+    _numElements(numElements),
+    _commandType(commandType)
 {
-    HipTest::initArrays (&_A_d, &_B_d, &_C_d, &_A_h, &_B_h, &_C_h, numElements, true);
+    size_t sizeElements = numElements * sizeof(int);
+
+    //if (commandType == 0) _commandType = 1; // TODO - remove me
+
+    HIPCHECK(hipSetDevice(_deviceId));
+
+
+    hipPointerAttribute_t attr;
+    HIPCHECK(hipPointerGetAttributes(&attr, A_d));
+    if (attr.device != deviceId) {
+        // source is on another device, we will need to copy later.
+        // So save original source pointer and allocate local space.
+        printf ("info: source for streamer on another device, will insert memcpy\n");
+        _preA_d = A_d;
+        HIPCHECK(hipMalloc(&_A_d, sizeElements));
+        HIPCHECK(hipMemset(_A_d, -3, sizeElements));
+    }
+
+    HIPCHECK(hipMalloc(&_C_d, sizeElements));
+    HIPCHECK(hipHostMalloc(&_C_h, sizeElements));
+
+    HIPCHECK(hipMemset(_C_d, -1, sizeElements));
+    HIPCHECK(hipMemset(_C_h, -2, sizeElements));

    HIPCHECK(hipStreamCreate(&_stream));
    HIPCHECK(hipEventCreate(&_event));
+
+
+
 };

-template <typename T>
-void Streamer<T>::runAsync()
-{
-    printf ("testing: %s  numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0);
-    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
-    hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements);

-    // Test case where hipStreamWaitEvent waits on same event we just placed into the queue.
-    HIPCHECK(hipEventRecord(_event, _stream));
-    HIPCHECK(hipStreamWaitEvent(_stream, _event, 0));
+template <typename T>
+Streamer<T>::~Streamer()
+{
+    HIPCHECK(hipSetDevice(_deviceId));
+
+    printf ("info: ~Streamer\n");
+    if (_preA_d) {
+        HIPCHECK(hipFree(_preA_d));
+    }
+    HIPCHECK(hipFree(_C_d));
+    HIPCHECK(hipHostFree(_C_h));
+
+    HIPCHECK(hipStreamDestroy(_stream));
+    HIPCHECK(hipEventDestroy(_event));
 }

+
+template <typename T>
+void Streamer<T>::runAsyncAfter(Streamer<T> *depStreamer, bool waitSameStream)
+{
+    HIPCHECK(hipSetDevice(_deviceId));
+    if (p_db) {
+      printf ("testing: %s  numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0);
+    }
+
+    if (depStreamer) {
+        HIPCHECK(hipStreamWaitEvent(_stream, depStreamer->event(), 0));
+    }
+
+    if (_preA_d) {
+        // _preA_d is on another device, so copy to local device so kernel can access it:
+        HIPCHECK(hipMemcpyAsync(_A_d, _preA_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream));
+    }
+
+
+    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
+    if (_commandType == COMMAND_ADD_REVERSE) {
+        hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream,    _A_d, _C_d, _numElements, p_count);
+    } else if (_commandType == COMMAND_ADD_FORWARD) {
+        hipLaunchKernelGGL(HipTest::addCount,         dim3(blocks), dim3(threadsPerBlock), 0, _stream,    _A_d, _C_d, _numElements, p_count);
+    } else if (_commandType == COMMAND_COPY) {
+        HIPCHECK(hipMemcpyAsync(_C_d, _A_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream));
+    } else {
+        assert(0); // bad command type
+    }
+    HIPCHECK(hipEventRecord(_event, _stream));
+
+    if (waitSameStream) {
+        HIPCHECK(hipStreamWaitEvent(_stream, _event, 0)); // this is essentially a no-op, but make sure it doesn't crash
+    }
+}
+
+
+
 template <typename T>
 void Streamer<T>::queryUntilComplete()
 {
+    HIPCHECK(hipSetDevice(_deviceId));
    int numQueries = 0;
    hipError_t e = hipSuccess;
    do {
@@ -89,15 +191,62 @@ void Streamer<T>::queryUntilComplete()
        e = hipStreamQuery(_stream);
    } while (e != hipSuccess) ;

-    printf ("completed after %d queries\n", numQueries);
+    printf ("info: hipStreamQuery completed after %d queries\n", numQueries);
 };


+// If copyStream is !nullptr it is used for the copy.
+template <typename T>
+void Streamer<T>::copyToHost(hipStream_t copyStream)
+{
+    if (p_db) {
+        printf ("db: copy back to host\n");
+    }
+    HIPCHECK(hipSetDevice(_deviceId));
+    HIPCHECK(hipMemcpyAsync(_C_h, _C_d, _numElements*sizeof(T), hipMemcpyDeviceToHost, copyStream ? copyStream : _stream));
+    HIPCHECK(hipStreamSynchronize(copyStream ? copyStream:_stream));
+
+}
+
+
+template <typename T>
+size_t Streamer<T>::check(int streamerNum, T initValue, T expectedOffset, bool expectPass)
+{
+    T expected = initValue + expectedOffset;
+    if (p_db) {
+        printf ("db: check\n");
+    }
+
+    _mismatchCount = 0;
+    for (size_t i=0; i<_numElements; i++) {
+        if (_C_h[i] != expected) {
+            _mismatchCount++;
+            if (expectPass) {
+                fprintf(stderr, "for streamer:%d  _C_h[%zu] (%d)  !=  expected(%d)\n", streamerNum, i, _C_h[i], expected);
+                if (_mismatchCount > 10) {
+                    failed("for streamer:%d  _C_h[%zu] (%d)  !=  expected(%d)\n", streamerNum, i, _C_h[i], expected);
+                }
+            }
+        }
+    }
+
+    if (!expectPass && (_mismatchCount ==0)) {
+        // the test should run kernels long enough that if we don't correctly wait for them to finish then an error is reported.
+        //failed("for streamer:%d  we expected inavalid synchronization to lead to mismatch but none was detected.  Increase --N to sensitize sync.\n", streamerNum);
+
+    }
+
+    return _mismatchCount;
+}
+
+   

 //---
 //Parse arguments specific to this test.
 void parseMyArguments(int argc, char *argv[])
 {
+    N = 64*1024*1024;
+
    int more_argc = HipTest::parseStandardArguments(argc, argv, false);

    // parse args for this test:
@@ -108,6 +257,14 @@ void parseMyArguments(int argc, char *argv[])
            if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) {
               failed("Bad streams argument");
            }
+        } else if (!strcmp(arg, "--count")) {
+            if (++i >= argc || !HipTest::parseUInt(argv[i], &p_count)) {
+               failed("Bad count argument");
+            }
+        } else if (!strcmp(arg, "--db")) {
+            if (++i >= argc || !HipTest::parseUInt(argv[i], &p_db)) {
+               failed("Bad db argument");
+            }
        } else {
            failed("Bad argument '%s'", arg);
        }
@@ -115,6 +272,95 @@ void parseMyArguments(int argc, char *argv[])
 };


+typedef Streamer<int> IntStreamer;
+
+
+
+
+void runStreamerLoop(std::vector<IntStreamer *> &streamers)
+{
+    for (int i=0; i<streamers.size(); i++) {
+        streamers[i]->runAsyncAfter(i ? streamers[i-1] : NULL);
+    }
+}
+
+
+void checkAll(int initValue, std::vector<IntStreamer *> &streamers, std::vector<hipStream_t> &sideStreams, bool expectPass=true)
+{
+    size_t mismatchCount=0;
+
+    // Copy in reverse order to catch anything not yet finished...
+    for (int i=streamers.size()-1; i>=0; i--) {
+        streamers[i]->copyToHost(sideStreams.empty() ? NULL : sideStreams[streamers[i]->deviceId()]);
+    }
+
+
+    int expected = 0;
+    // Check in forward order so we can find first mismatch:
+    for (int i=0; i<streamers.size(); i++) {
+
+        expected += streamers[i]->expectedAdd();
+        
+        mismatchCount += streamers[i]->check(i+1, initValue, expected, expectPass);
+
+    }
+    if (!expectPass && (mismatchCount==0)) {
+        // the test should run kernels long enough that if we don't correctly wait for them to finish then an error is reported.
+        failed("we expected inavalid synchronization to lead to mismatch but none was detected.  Increase --count to sensitize sync.\n");
+    }
+
+}
+
+
+
+#define RUN_SYNC_TEST(_enableBit, _streamers, _sync, _expectPass)\
+    if (p_tests & (_enableBit)) {\
+        printf ("==> Test %02x runAsyncAfter sync=%s\n", (_enableBit), #_sync);\
+        runStreamerLoop(_streamers);\
+        (_sync);\
+        checkAll (initValue, _streamers, sideStreams, _expectPass);\
+    }
+
+
+
+
+//---
+// A family of sync functions which somehow wait for inflight activity to finish:
+
+
+void sync_none(void) {};
+
+void sync_allDevices(int numDevices) 
+{
+    for (int d=0; d<numDevices; d++) {
+        HIPCHECK(hipSetDevice(d));
+        HIPCHECK(hipDeviceSynchronize());
+    }
+}
+
+
+void sync_queryAllUntilComplete(std::vector<IntStreamer *> streamers) 
+{
+    for (int i=streamers.size()-1; i>=0; i--) {
+        streamers[i]->queryUntilComplete();
+    };
+}
+
+
+void sync_streamWaitEvent(hipEvent_t lastEvent, int sideDeviceId, hipStream_t sideStream, bool waitHere) 
+{
+    HIPCHECK(hipSetDevice(sideDeviceId));
+
+    // wait on the last event in the stream of chained streamers:
+    // This plants a marker which the subsquent copy for this device will wait on:
+    HIPCHECK(hipStreamWaitEvent(sideStream, lastEvent, 0));
+
+    if (waitHere) {
+        HIPCHECK(hipStreamSynchronize(sideStream));
+    }
+}
+
+

 //---
 int main(int argc, char *argv[])
@@ -122,39 +368,115 @@ int main(int argc, char *argv[])
    HipTest::parseStandardArguments(argc, argv, false);
    parseMyArguments(argc, argv);

-    typedef Streamer<float> FloatStreamer;

-    std::vector<FloatStreamer *> streamers;
+

    size_t numElements = N;
+    size_t sizeElements = numElements * sizeof(int);

-    for (int i=0; i<p_streams; i++) {
-        FloatStreamer * s = new FloatStreamer(numElements);
-        streamers.push_back(s);
+    printf("info: sizeof arrays = %zu elements (%6.3f MB)\n", numElements, sizeElements / 1024.0/1024.0);
+    printf("info: streams=%d count=%d\n", p_streams, p_count);
+
+    assert (sizeElements <= std::numeric_limits<int64_t>::max());
+
+
+    int initValue = 1000;
+
+    int * initArray_d, *initArray_h;
+    HIPCHECK(hipMalloc(&initArray_d, sizeElements));
+    HIPCHECK(hipHostMalloc(&initArray_h, sizeElements));
+    for (size_t i=0; i<numElements; i++) {
+        initArray_h[i] = initValue;
    }
+    HIPCHECK(hipMemcpy(initArray_d, initArray_h, sizeElements, hipMemcpyHostToDevice));
+    

-    if (p_tests & 0x1) {
-        printf ("==> Test 0x1 runAsnc\n");
+    int numDevices;
+    HIPCHECK(hipGetDeviceCount(&numDevices));
+    numDevices = min(2, numDevices); // multi-GPU to 2 device.
+
+    std::vector<IntStreamer *> streamers;
+    std::vector<IntStreamer *> streamersDev0; // streamers for first device.
+
+    for (int d=0; d<numDevices/*TODO*/; d++) {
        for (int i=0; i<p_streams; i++) {
-            streamers[i]->runAsync();
+            int command = (i%2) ? COMMAND_ADD_FORWARD : COMMAND_ADD_REVERSE;
+            IntStreamer * s = new IntStreamer(d, i ? streamers.back()->C_d() : initArray_d, numElements, command);
+            streamers.push_back(s);
+            if (d==0) {
+                streamersDev0.push_back(s);
+            }
        }
-        HIPCHECK(hipDeviceSynchronize());
    }

-    if (p_tests & 0x2) {
-        printf ("==> Test 0x2 queryUntilComplete\n");
-        for (int i=0; i<p_streams; i++) {
-            streamers[i]->runAsync();
-            streamers[i]->queryUntilComplete();
-        }
-        HIPCHECK(hipDeviceSynchronize());
+
+
+
+
+    // A sideband stream channel that is independent from above.
+    // Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is 
+    // asynchronous wrt the other streams.
+    std::vector<hipStream_t> sideStreams;
+    for (int d=0; d<numDevices; d++) {
+        hipStream_t s;
+        HIPCHECK(hipStreamCreate(&s));
+        sideStreams.push_back(s);
    }

-    if (p_tests & 0x4) {
+
+    // Tests on first GPU:
+    //
+    // This test has no synchronization - make sure it mismatches so we can ensure the other tests properyl prevent the mismatch:
+    RUN_SYNC_TEST(0x01, streamersDev0, sync_none(), false);
+
+    RUN_SYNC_TEST(0x02, streamersDev0, sync_allDevices(numDevices),  true);
+    RUN_SYNC_TEST(0x04, streamersDev0, sync_queryAllUntilComplete(streamersDev0),  true);
+    RUN_SYNC_TEST(0x08, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false),  true);
+
+    if (numDevices > 1) {
+        // Sync on second device for activity running on device 0:
+        RUN_SYNC_TEST(0x10, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 1, sideStreams[1], true),  true);
+    }
+
+
+    // Tests on all GPUs:
+    // RUN_SYNC_TEST(0x100, streamers, sync_streamWaitEvent(streamers.back()->event(), 0, sideStreams[0], false),  true);
+
+
+
+
+    if (p_tests & 0x1000) {
+        printf ("==> Test 0x1000 try null stream\n"); 
        hipStreamQuery(0/* try null stream*/);

    }


+    // Insert small wrinkle here, insert a wait on event just recorded, all in the same stream.
+    if (p_tests & 0x2000) {
+        printf ("==> Test 0x2000 runAsyncWaitSameStream\n");
+        for (int i=0; i<streamersDev0.size(); i++) {
+            streamersDev0[i]->runAsyncAfter(i ? streamersDev0[i-1] : NULL, true/*waitSameStream*/);
+        }
+
+        sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false);
+        checkAll (initValue, streamersDev0, sideStreams);
+    }
+
+
+    // Change Adds to copies to stimulate different case with event followign copy:
+    for (auto &s : streamers) {
+        if (s->_commandType == COMMAND_ADD_FORWARD)
+            s->_commandType = COMMAND_COPY;
+    }
+
+
+    {
+        printf ("test: alternating memcpy/count-reverse followed by event\n");
+        RUN_SYNC_TEST(0x4000, streamersDev0, sync_queryAllUntilComplete(streamersDev0),  true);
+        RUN_SYNC_TEST(0x8000, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false),  true);
+    }
+
+
    passed();
 }
@@ -146,6 +146,90 @@ vectorADD(hipLaunchParm lp,
 }


+template <typename T>
+__global__ void
+vectorADDReverse(hipLaunchParm lp,
+            const T *A_d,
+            const T *B_d,
+            T *C_d,
+            size_t NELEM)
+{
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
+
+    for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
+        C_d[i] = A_d[i] + B_d[i];
+	}
+}
+
+
+template <typename T>
+__global__ void
+addCount( const T *A_d,
+        T *C_d,
+        size_t NELEM,
+        int count)
+{
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
+
+    // Deliberately do this in an inefficient way to increase kernel runtime
+    for (int i=0; i<count; i++) {
+        for (size_t i=offset; i<NELEM; i+=stride) {
+            C_d[i] = A_d[i] + (T)count;
+        }
+    }
+}
+
+
+template <typename T>
+__global__ void
+addCountReverse( const T *A_d,
+        T *C_d,
+        int64_t NELEM,
+        int count)
+{
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
+
+    // Deliberately do this in an inefficient way to increase kernel runtime
+    for (int i=0; i<count; i++) {
+        for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
+            C_d[i] = A_d[i] + (T)count;
+        }
+    }
+}
+
+
+template <typename T>
+__global__ void
+memsetReverse( T *C_d,  T val,
+        int64_t NELEM)
+{
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
+
+    for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
+        C_d[i] = val;
+    }
+}
+
+
+template <typename T>
+void setDefaultData(size_t numElements, T *A_h, T* B_h, T *C_h)
+{
+    // Initialize the host data:
+    for (size_t i=0; i<numElements; i++) {
+        if (A_h) 
+            (A_h)[i] = 3.146f + i; // Pi
+        if (B_h) 
+            (B_h)[i] = 1.618f + i; // Phi
+        if (C_h) 
+            (C_h)[i] = 0.0f + i;  
+    }
+}
+
+
 template <typename T>
 void initArraysForHost(T **A_h, T **B_h, T **C_h,
                size_t N, bool usePinnedHost=false)
@@ -179,15 +263,10 @@ void initArraysForHost(T **A_h, T **B_h, T **C_h,
        }
    }

-    // Initialize the host data:
-    for (size_t i=0; i<N; i++) {
-        if (A_h) 
-            (*A_h)[i] = 3.146f + i; // Pi
-        if (B_h) 
-            (*B_h)[i] = 1.618f + i; // Phi
-    }
+    setDefaultData(N, A_h ? *A_h : NULL, B_h ? *B_h : NULL, C_h ? *C_h : NULL);
 }

+
 template <typename T>
 void initArrays(T **A_d, T **B_d, T **C_d,
                T **A_h, T **B_h, T **C_h, 
@@ -295,7 +374,7 @@ inline void initHIPArrays(hipArray **A_d, hipArray **B_d, hipArray **C_d,
 // Assumes C_h contains vector add of A_h + B_h
 // Calls the test "failed" macro if a mismatch is detected.
 template <typename T>
-void checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true)
+size_t checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true, bool reportMismatch=true)
 {
    size_t  mismatchCount = 0;
    size_t  firstMismatch = 0;
@@ -316,9 +395,50 @@ void checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true
        }
    }

+	if (reportMismatch) {
+        if (expectMatch) {
+            if (mismatchCount) {
+                failed("%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch);
+            }
+        } else {
+            if (mismatchCount == 0) {
+                failed("expected mismatches but did not detect any!");
+            }
+        }
+	}
+
+	return mismatchCount;
+
+}
+
+
+// Assumes C_h contains vector add of A_h + B_h
+// Calls the test "failed" macro if a mismatch is detected.
+template <typename T>
+void checkTest(T* expected_H, T* result_H, size_t N, bool expectMatch=true)
+{
+    size_t  mismatchCount = 0;
+    size_t  firstMismatch = 0;
+    size_t  mismatchesToPrint = 10;
+    for (size_t i=0; i<N; i++) {
+        if (result_H[i] != expected_H[i]) {
+            if (mismatchCount == 0) {
+                firstMismatch = i;
+            }
+            mismatchCount++;
+            if ((mismatchCount <= mismatchesToPrint) && expectMatch) {
+                std::cout << std::fixed << std::setprecision(32);
+                std::cout << "At " << i << std::endl;
+                std::cout << "  Computed:" << result_H[i]  << std::endl;
+                std::cout << "  Expected:" << expected_H[i] << std::endl;
+            }
+        }
+    }
+
    if (expectMatch) {
        if (mismatchCount) {
-            failed("%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch);
+            fprintf(stderr, "%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch);
+            //failed("%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch);
        }
    } else {
        if (mismatchCount == 0) {
@@ -185,6 +185,8 @@ syn keyword hipFlags hipHostMallocDefault
 syn keyword hipFlags hipHostMallocPortable
 syn keyword hipFlags hipHostMallocMapped
 syn keyword hipFlags hipHostMallocWriteCombined
+syn keyword hipFlags hipHostMallocCoherent
+syn keyword hipFlags hipHostMallocNonCoherent

 syn keyword hipFlags hipHostRegisterDefault
 syn keyword hipFlags hipHostRegisterPortable