Merge branch 'amd-master-next' into amd-npi-next

Change-Id: Id241c60d6c2ceb4049c3ec15d9fe06baf28bcb3a
2020-07-20 09:49:15 -04:00
parent e356f5ff85 94c4462848
commit 6be1b7ce2b
71 changed files with 4934 additions and 629 deletions
@@ -4,6 +4,8 @@ project(hip)
 #  cmake -DHIP_COMPILER=clang -DHIP_PLATFORM=rocclr ..
 #  cmake -DHIP_COMPILER=clang -DHIP_PLATFORM=rocclr -DOPENCL_DIR=/path/to/opencl/api/opencl -DCMAKE_PREFIX_PATH=/path/to/rocclr/build/or/install/directory ..

+set(BUILD_SHARED_LIBS ON  CACHE BOOL "Build shared library (.so) or static lib (.a) ")
+
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

 #############################
@@ -626,18 +628,13 @@ if(POLICY CMP0037)
    cmake_policy(PUSH)
    cmake_policy(SET CMP0037 OLD)
 endif()
-file(GENERATE OUTPUT ${PROJECT_BINARY_DIR}/fixnames
-        CONTENT "pwd; for i in *.deb; do mv \"\$i\" \"\${i/.deb/-amd64.deb}\" ; done
-for i in *.rpm ; do mv \$i \${i/.rpm/.x86_64.rpm} ; done
-")
+
 if(HIP_PLATFORM STREQUAL "hcc")
    add_custom_target(package
-    COMMAND bash ${PROJECT_BINARY_DIR}/fixnames
    WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
    DEPENDS pkg_hip_base pkg_hip_hcc pkg_hip_nvcc pkg_hip_doc pkg_hip_samples)
 elseif(HIP_PLATFORM STREQUAL "rocclr")
    add_custom_target(package
-    COMMAND bash ${PROJECT_BINARY_DIR}/fixnames
    WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
    DEPENDS pkg_hip_base hip_on_rocclr pkg_hip_nvcc pkg_hip_doc pkg_hip_samples)
 endif()
@@ -4,11 +4,11 @@

 Key features include:

-* HIP is very thin and has little or no performance impact over coding directly in CUDA or hcc "HC" mode.
+* HIP is very thin and has little or no performance impact over coding directly in CUDA mode.
 * HIP allows coding in a single-source C++ programming language including features such as templates, C++11 lambdas, classes, namespaces, and more.
 * HIP allows developers to use the "best" development environment and tools on each target platform.
 * The [HIPIFY](https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/README.md) tools automatically convert source from CUDA to HIP.
-* Developers can specialize for the platform (CUDA or hcc) to tune for performance or handle tricky cases 
+* Developers can specialize for the platform (CUDA or AMD) to tune for performance or handle tricky cases.

 New projects can be developed directly in the portable HIP C++ language and can run on either NVIDIA or AMD platforms.  Additionally, HIP provides porting tools which make it easy to port existing CUDA codes to the HIP layer, with no loss of performance as compared to the original CUDA application.  HIP is not intended to be a drop-in replacement for CUDA, and developers should expect to do some manual coding and performance tuning work to complete the port.

@@ -27,16 +27,15 @@ HIP releases are typically of two types. The tag naming convention is different
 * preview_x.yy.zzzz: These denote pre-release code and are based on the developer-preview branch. This type of release is typically made once a week.

 ## More Info:
- [Installation](INSTALL.md) 
+- [Installation](INSTALL.md)
 - [HIP FAQ](docs/markdown/hip_faq.md)
 - [HIP Kernel Language](docs/markdown/hip_kernel_language.md)
 - [HIP Runtime API (Doxygen)](http://rocm-developer-tools.github.io/HIP)
 - [HIP Porting Guide](docs/markdown/hip_porting_guide.md)
 - [HIP Porting Driver Guide](docs/markdown/hip_porting_driver_api.md)
 - [HIP Programming Guide](docs/markdown/hip_programming_guide.md)
- [HIP Profiling ](docs/markdown/hip_profiling.md)
- [HIP Debugging](docs/markdown/hip_debugging.md)
- [HIP Terminology](docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenCL)
+- [HIP Logging](docs/markdown/hip_logging.md)
+- [HIP Terminology](docs/markdown/hip_terms2.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/OpenCL)
 - [HIPIFY](https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/README.md)
 - Supported CUDA APIs:
  * [Runtime API](docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md)
@@ -55,7 +54,7 @@ HIP releases are typically of two types. The tag naming convention is different
 See the [Installation](INSTALL.md) notes.

 ## Simple Example
-The HIP API includes functions such as hipMalloc, hipMemcpy, and hipFree.  
+The HIP API includes functions such as hipMalloc, hipMemcpy, and hipFree.
 Programmers familiar with CUDA will also be able to quickly learn and start coding with the HIP API.
 Compute kernels are launched with the "hipLaunchKernel" macro call.    Here is simple example showing a
 snippet of HIP API code:
@@ -76,11 +75,10 @@ hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost);
 ```


-The HIP kernel language defines builtins for determining grid and block coordinates, math functions, short vectors, 
-atomics, and timer functions. It also specifies additional defines and keywords for function types, address spaces, and 
-optimization controls.  (See the [HIP Kernel Language](docs/markdown/hip_kernel_language.md) for a full description).
-Here's an example of defining a simple 'vector_square' kernel.  
-
+The HIP kernel language defines builtins for determining grid and block coordinates, math functions, short vectors,
+atomics, and timer functions.
+It also specifies additional defines and keywords for function types, address spaces, and optimization controls (see the [HIP Kernel Language](docs/markdown/hip_kernel_language.md) for a full description).
+Here's an example of defining a simple 'vector_square' kernel.


 ```cpp
@@ -100,14 +98,14 @@ vector_square(T *C_d, const T *A_d, size_t N)
 The HIP Runtime API code and compute kernel definition can exist in the same source file - HIP takes care of generating host and device code appropriately.

 ## HIP Portability and Compiler Technology
-HIP C++ code can be compiled with either :
+HIP C++ code can be compiled for either of the following platforms:
 - On the NVIDIA CUDA platform, HIP provides header file which translate from the HIP runtime APIs to CUDA runtime APIs.  The header file contains mostly inlined
  functions and thus has very low overhead - developers coding in HIP should expect the same performance as coding in native CUDA.  The code is then
  compiled with nvcc, the standard C++ compiler provided with the CUDA SDK.  Developers can use any tools supported by the CUDA SDK including the CUDA
  profiler and debugger.
- On the AMD ROCm platform, HIP provides a header and runtime library built on top of hcc compiler.  The HIP runtime implements HIP streams, events, and memory APIs,
+- On the AMD ROCm platform, HIP provides a header and runtime library built on top of the HIP-Clang compiler.  The HIP runtime implements HIP streams, events, and memory APIs,
  and is a object library that is linked with the application.  The source code for all headers and the library implementation is available on GitHub.
-  HIP developers on ROCm can use AMD's CodeXL for debugging and profiling.
+  HIP developers on ROCm can use AMD's ROCgdb (https://github.com/ROCm-Developer-Tools/ROCgdb) for debugging and profiling.

 Thus HIP source code can be compiled to run on either platform.  Platform-specific features can be isolated to a specific platform using conditional compilation.  Thus HIP
 provides source portability to either platform.   HIP provides the _hipcc_ compiler driver which will call the appropriate toolchain depending on the desired platform.
@@ -117,7 +115,7 @@ provides source portability to either platform.   HIP provides the _hipcc_ compi

 * A sample and [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that uses any of [HIPIFY](https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/README.md) tools to convert a simple app from CUDA to HIP:

- 
+
 ```shell
 cd samples/01_Intro/square
 # follow README / blog steps to hipify the application.
@@ -142,11 +140,11 @@ The README with the procedures and tips the team used during this porting effort
    * **hip_runtime.h** : Includes everything in hip_runtime_api.h PLUS hipLaunchKernel and syntax for writing device kernels and device functions.  hip_runtime.h can only be compiled with hcc.
    * **hcc_detail/**** , **nvcc_detail/**** : Implementation details for specific platforms. HIP applications should not include these files directly.
    * **hcc.h** : Includes interop APIs for HIP and HCC
-    
+
 * **bin**: Tools and scripts to help with hip porting
    * **hipify-perl** : Script based tool to convert CUDA code to portable CPP. Converts CUDA APIs and kernel builtins.
-    * **hipcc** : Compiler driver that can be used to replace nvcc in existing CUDA code. hipcc will call nvcc or hcc depending on platform and include appropriate platform-specific headers and libraries.
-    * **hipconfig** : Print HIP configuration (HIP_PATH, HIP_PLATFORM, CXX config flags, etc.)
+    * **hipcc** : Compiler driver that can be used to replace nvcc in existing CUDA code. hipcc will call nvcc or HIP-Clang depending on platform and include appropriate platform-specific headers and libraries.
+    * **hipconfig** : Print HIP configuration (HIP_PATH, HIP_PLATFORM, HIP_COMPILER, HIP_RUNTIME, CXX config flags, etc.)
    * **hipexamine-perl.sh** : Script to scan the directory, find all code, and report statistics on how much can be ported with HIP (and identify likely features not yet supported).
    * **hipconvertinplace-perl.sh** : Script to scan the directory, find all code, and convert the found CUDA code to HIP reporting all unconverted things.

@@ -675,7 +675,7 @@ foreach $arg (@ARGV)
                $needCXXFLAGS = 1;
                if ($HIP_COMPILE_CXX_AS_HIP eq '0' or $HIP_COMPILER ne "clang") {
                    $hasCXX = 1;
-                } else {
+                } elsif ($HIP_PLATFORM eq "hcc" and $HIP_COMPILER eq "clang") {
                    $hasHIP = 1;
                    $toolArgs .= " -x hip";
                }
@@ -837,9 +837,9 @@ if ($HIP_PLATFORM eq "hcc" and $HIP_COMPILER eq "clang") {

    if (not $isWindows  and not $compileOnly) {
      if ($linkType eq 0) {
-        $toolArgs .= " -L$HIP_LIB_PATH -lamdhip64_static -L$ROCM_PATH/lib -lhsa-runtime64 -ldl ";
+        $toolArgs .= " -L$HIP_LIB_PATH -lamdhip64 -L$ROCM_PATH/lib -lhsa-runtime64 -ldl -lnuma ";
      } else {
-        $toolArgs .= " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib -lhip_hcc ";
+        $toolArgs .= " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib -lhip_hcc -lnuma ";
      }
      # To support __fp16 and _Float16, explicitly link with compiler-rt
      $toolArgs .= " -L$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/lib/linux -lclang_rt.builtins-x86_64 "
@@ -1,7 +1,7 @@
 #!/usr/bin/perl -w

 $HIP_BASE_VERSION_MAJOR = "3";
-$HIP_BASE_VERSION_MINOR = "6";
+$HIP_BASE_VERSION_MINOR = "7";

 # Need perl > 5.10 to use logic-defined or
 use 5.006; use v5.10.1;
@@ -1,168 +0,0 @@
-Table of Contents
-=================
-
-  * [Profiling HIP Code](#profiling-hip-code)
-      * [Using HIP_DB](#using-hip_db)
-      * [Using ltrace](#using-ltrace)
-      * [Chicken bits](#chicken-bits)
-      * [Debugging HIP Applications](#debugging-hip-applications)
-      * [General Debugging Tips](#general-debugging-tips)
-        * [Print env var state](#print-env-var-state)
-
-### Using HIP_DB
-
-This flag is primarily targeted to assist HIP development team in the development of the HIP runtime, but in some situations may be useful to HIP application developers as well.
-The HIP debug information is designed to print important information during the execution of a HIP API.  HIP provides
-different color-coded levels of debug information:
-  - api  : Print the beginning and end of each HIP API, including the arguments and return codes.  This is equivalent to setting HIP_TRACE_API=1.
-  - sync : Print multi-thread and other synchronization debug information.
-  - copy : Print which engine is doing the copy, which copy flavor is selected, information on source and destination memory.
-  - mem  : Print information about memory allocation - which pointers are allocated, where they are allocated, peer mappings, and more.
-
-HIP_DB format is flags separated by '+' sign, or a hex code for the bitmask.  Generally the + format is preferred.  
-For example:
-```
-$ HIP_DB=api+copy+mem  my-application
-$ HIP_DB=0xF  my-application
-```
-
-### Using ltrace
-ltrace is a standard linux tool which provides a message to stderr on every dynamic library call.  Since ROCr and the ROCt (the ROC thunk, which is the thin user-space interface to the ROC kernel driver) are both dynamic libraries, this provides an easy way to trace the activity in these libraries.  Tracing can be a powerful way to quickly observe the flow of the application before diving into the details with a command-line debugger.
-The trace can also show performance issues related to accidental calls to expensive API calls on the critical path.
-
-ltrace can be easily combined with the HIP_DB switches to visualize the runtime behavior of the entire ROCm software stack.  Here's a sample command-line and output:
-
-```
-$ HIP_DB=api ltrace -C -e 'hsa*'   <applicationName> <applicationArguments>
-
-...
-
-<<hip-api tid:1.17 hipMemcpy (0x7f7776d3e010, 0x503d1d000, 4194304, hipMemcpyDeviceToHost)
-libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1804000, 0, 0, 0x400000) = 0
-libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1816000, 0, 0x7f777f85f2a0, 0x400000) = 0
-libmcwamp_hsa.so->hsa_amd_memory_lock(0x7f7776d3e010, 0x400000, 0x1213b70, 1 <unfinished ...>
-libhsa-runtime64.so.1->hsaKmtRegisterMemoryToNodes(0x7f7776d3e010, 0x400000, 1, 0x1220c10) = 0
-libhsa-runtime64.so.1->hsaKmtMapMemoryToGPUNodes(0x7f7776d3e010, 0x400000, 0x7ffc32865400, 64) = 0
-<... hsa_amd_memory_lock resumed> )              = 0
-libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1804000, 1, 0x7f777e95a770, 0x12205b0) = 0
-libmcwamp_hsa.so->hsa_amd_memory_async_copy(0x50411d010, 0x11e70d0, 0x503d1d000, 0x11e70d0) = 0
-libmcwamp_hsa.so->hsa_signal_wait_acquire(0x1804000, 2, 1, -1) = 0
-libmcwamp_hsa.so->hsa_amd_memory_unlock(0x7f7776d3e010, 0x1213c6c, 0x12c3c600000000, 0x1804000 <unfinished ...>
-libhsa-runtime64.so.1->hsaKmtUnmapMemoryToGPU(0x7f7776d3e010, 0x7f7776d3e010, 0x12c3c600000000, 0x1804000) = 0
-libhsa-runtime64.so.1->hsaKmtDeregisterMemory(0x7f7776d3e010, 0x7f7776d3e010, 0x7f777f60f9e8, 0x1220580) = 0
-<... hsa_amd_memory_unlock resumed> )            = 0
-  hip-api tid:1.17 hipMemcpy                      ret= 0 (hipSuccess)>>
-```
-
-Some key information from the trace above.
-  - Thy trace snippet shows the execution of a hipMemcpy API, bracketed by the first and last message in the trace output.  The messages show the thread id and API sequence number (`1.17`).  ltrace output intermixes messages from all threads, so the HIP debug information can be useful to determine which threads are executing.
-  - The code flows through HIP APIs into ROCr (HSA) APIs (hsa*) and into the thunk (hsaKmt*) calls.
-  - The HCC runtime is "libmcwamp_hsa.so" and the HSA/ROCr runtime is "libhsa-runtime64.so".
-  - In this particular case, the memory copy is for unpinned memory, and the selected copy algorithm is to pin the host memory "in-place" before performing the copy.  The signaling APIs and calls to pin ("lock", "register") the memory are readily apparent in the trace output.
-
-
-### Chicken bits
-Chicken bits are environment variables which cause the HIP, HCC, or HSA driver to disable some feature or optimization.
-These are not intended for production but can be useful diagnose synchronization problems in the application (or driver).
-
-Some of the most useful chicken bits are described here. These bits are supported on the ROCm path:
-
-HIP provides 3 environment variables in the HIP_*_BLOCKING family.  These introduce additional synchronization and can be useful to isolate synchronization problems. Specifically, if the code works with this flag set, then it indicates the kernels are executing correctly, and any failures likely are causes by improper or missing synchronization.  These flags will have performance impact and are not intended for production use.
-
- HIP_LAUNCH_BLOCKING=1 : Waits on the host after each kernel launch.  Equivalent to setting CUDA_LAUNCH_BLOCKING.
- HIP_LAUNCH_BLOCKING_KERNELS: A comma-separated list of kernel names.  The HIP runtime will wait on the host after one of the named kernels executes.  This provides a more targeted version of HIP_LAUNCH_BLOCKING and may be useful to isolate exactly which kernel needs further analysis if HIP_LAUNCH_BLOCKING=1 improves functionality.  There is no indication if kernel names are spelled incorrectly.  One mechanism to verify that the blocking is working is to run with HIP_DB=api+sync and search for debug messages with "LAUNCH_BLOCKING".
- HIP_API_BLOCKING : Forces hipMemcpyAsync and hipMemsetAsync to be host-synchronous, meaning they will wait for the requested operation to complete before returning to the caller.
-
-These options cause HCC to serialize.  Useful if you have libraries or code which is calling HCC kernels directly rather than using HIP.  
- HCC_SERIALIZE_KERNEL : 0x1=pre-serialize before each kernel launch, 0x2=post-serialize after each kernel launch., 0x3= pre- and post- serialize.
- HCC_SERIALIZE_COPY    : 0x1=pre-serialize before each async copy, 0x2=post-serialize after each async copy., 0x3= pre- and post- serialize.
-
- HSA_ENABLE_SDMA=0     : Causes host-to-device and device-to-host copies to use compute shader blit kernels rather than the dedicated DMA copy engines.  Compute shader copies have low latency (typically < 5us) and can achieve approximately 80% of the bandwidth of the DMA copy engine.  This flag is useful to isolate issues with the hardware copy engines.
- HSA_ENABLE_INTERRUPT=0 : Causes completion signals to be detected with memory-based polling rather than interrupts.  Can be useful to diagnose interrupt storm issues in the driver.
- HSA_DISABLE_CACHE=1  : Disables the GPU L2 data cache.
-
-### Debugging HIP Applications
-
- The variable "tls_tidInfo" contains the API sequence number (_apiSeqNum)- a monotonically increasing count of the HIP APIs called from this thread.  This can be useful for setting conditional breakpoints.  Also, each new HIP thread is mapped to monotically increasing shortTid ID.  Both of these fields are displayed in the HIP debug info. 
-```
-(gdb) p tls_tidInfo
-$32 = {_shortTid = 1, _apiSeqNum = 803}
-```
-
- HCC tracks all of the application memory allocations, including those from HIP and HC's "am_alloc".
-If the HCC runtime is built with debug information (HCC_RUNTIME_DEBUG=ON when building HCC), then calling the function 'hc::am_memtracker_print()' will show all memory allocations. 
-An optional argument specifies a void * targetPointer - the print routine will mark the allocation which contains the specified pointer with "-->" in the printed output.
-This example shows a sample GDB session where we print the memory allocated by this process and mark a specified address by using the gdb "call" function..
-The gdb syntax also supports using the variable name (in this case 'dst'):
-```
-(gdb) p dst
-$33 = (void *) 0x5ec7e9000
-(gdb) call hc::am_memtracker_print(dst)
-TargetAddress:0x5ec7e9000
-   0x504cfc000-0x504cfc00f::  allocSeqNum:1 hostPointer:0x504cfc000 devicePointer:0x504cfc000 sizeBytes:16 isInDeviceMem:0 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil)
-...
-->0x5ec7e9000-0x5f7e28fff::  allocSeqNum:488 hostPointer:(nil) devicePointer:0x5ec7e9000 sizeBytes:191102976 isInDeviceMem:1 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil)
-
-```
-
-To debug an explicit address, cast the address to (void*) :
-```
-(gdb) call hc::am_memtracker_print((void*)0x508c7f000)
-```
- Debugging GPUVM fault.
-For example:
-```
-Memory access fault by GPU node-1 on address 0x5924000. Reason: Page not present or supervisor privilege.
-
-Program received signal SIGABRT, Aborted.
-[Switching to Thread 0x7fffdffb5700 (LWP 14893)]
-0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
-56      ../nptl/sysdeps/unix/sysv/linux/raise.c: No such file or directory.
-(gdb) bt
-#0  0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
-#1  0x00007ffff205b028 in __GI_abort () at abort.c:89
-#2  0x00007ffff6f960eb in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-#3  0x00007ffff6f99ea5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-#4  0x00007ffff6f78107 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-#5  0x00007ffff744f184 in start_thread (arg=0x7fffdffb5700) at pthread_create.c:312
-#6  0x00007ffff211b37d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
-(gdb) info threads
-  Id   Target Id         Frame
-  4    Thread 0x7fffdd521700 (LWP 14895) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
-  3    Thread 0x7fffddd22700 (LWP 14894) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
-* 2    Thread 0x7fffdffb5700 (LWP 14893) "caffe" 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
-  1    Thread 0x7ffff7fa6ac0 (LWP 14892) "caffe" 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-(gdb) thread 1
-[Switching to thread 1 (Thread 0x7ffff7fa6ac0 (LWP 14892))]
-#0  0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-(gdb) bt
-#0  0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-#1  0x00007ffff6f929ba in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1
-#2  0x00007fffe080beca in HSADispatch::waitComplete() () from /opt/rocm/hcc/lib/libmcwamp_hsa.so
-#3  0x00007fffe080415f in HSADispatch::dispatchKernelAsync(Kalmar::HSAQueue*, void const*, int, bool) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so
-#4  0x00007fffe080238e in Kalmar::HSAQueue::dispatch_hsa_kernel(hsa_kernel_dispatch_packet_s const*, void const*, unsigned long, hc::completion_future*) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so
-#5  0x00007ffff7bb7559 in hipModuleLaunchKernel () from /opt/rocm/hip/lib/libhip_hcc.so
-#6  0x00007ffff2e6cd2c in mlopen::HIPOCKernel::run (this=0x7fffffffb5a8, args=0x7fffffffb2a8, size=80) at /root/MIOpen/src/hipoc/hipoc_kernel.cpp:15  
-...
-```
-
-### General Debugging Tips
- The fault will be caught by the runtime but was actually generated by an asynchronous command running on the GPU.    So, the GDB backtrace will show a path in the runtime, ie inside "GI_Raise" as shown in the example above.
- To determine the true location of the fault, force the kernels to execute synchronously by seeing the environment variables HCC_SERIALIZE_KERNEL=3 HCC_SERIALIZE_COPY=3.  This will force HCC to wait for the kernel to finish executing before retuning.  If the fault occurs during the execution of a kernel, you can see the code which launched the kernel inside the backtrace.  A bit of guesswork is required to determine which thread is actually causing the issue - typically it will the thread which is waiting inside the libhsa-runtime64.so.
- VM faults inside kernels can be caused byi:
-   - incorrect code (ie a for loop which extends past array boundaries), i
-   - memory issues  - kernel arguments which are invalid (null pointers, unregistered host pointers, bad pointers).
-   - synchronization issues
-   - compiler issues (incorrect code generation from the compiler)
-   - runtime issues 
-
-- General debug tips:
- 'gdb --args' can be used to conviently pass the executable and arguments to gdb.
- From inside GDB, you can set environment variables "set env".  Note the command does not use an '=' sign:
-```
-(gdb) set env HIP_DB 1
-```
-
-#### Print env var state
-Setting HIP_PRINT_ENV=1 and then running a HIP application will print the HIP environment variables, their current values, and usage info.
-Setting HCC_PRINT_ENV=1 and then running a HCC application will print the HCC environment variables, their current values, and usage info.
@@ -1,6 +1,6 @@
 # HIP Deprecated APIs

-## HIP Context API
+## HIP Context Management APIs

 CUDA supports cuCtx API, the Driver API that defines "Context" and "Devices" as separate entities. Contexts contain a single device, and a device can theoretically have multiple contexts. HIP initially added limited support for these API to facilitate easy porting from existing driver codes. These API are marked as deprecated now since there are better alternate interface (such as hipSetDevice or the stream API) to achieve the required functions.

@@ -20,3 +20,17 @@ CUDA supports cuCtx API, the Driver API that defines "Context" and "Devices" as
 ### hipCtxGetFlags
 ### hipCtxEnablePeerAccess
 ### hipCtxDisablePeerAccess
+
+## HIP Memory Management APIs
+
+### hipMallocHost
+Should use "hipHostMalloc" instead.
+
+### hipMemAllocHost
+Should use "hipHostMalloc" instead.
+
+### hipHostAlloc
+Should use "hipHostMalloc" instead.
+
+### hipFreeHost
+Should use "hipHostFree" instead.
@@ -0,0 +1,187 @@
+## What is HIP logging for? ###
+
+HIP provides a logging mechanism, which is a convenient way of printing important information so as to trace HIP API calls and runtime code during the execution of a HIP application.
+It assists the HIP development team in the development of the HIP runtime, and is useful for HIP application developers as well.
+Depending on the setting of logging level and logging mask, HIP logging will print different kinds of information, for different types of functionalities such as HIP APIs, executed kernels, queue commands and queue contents, etc.
+
+## HIP Logging Level:
+
+By default, HIP logging is disabled; it can be enabled via the following environment variable:
+  - AMD_LOG_LEVEL
+
+The value of this variable selects the logging level:
+
+```
+enum LogLevel {
+LOG_NONE = 0,
+LOG_ERROR = 1,
+LOG_WARNING = 2,
+LOG_INFO = 3,
+LOG_DEBUG = 4
+};
+```
+
+## HIP Logging Mask:
+
+The logging mask selects which types of functionality are logged during the execution of a HIP application.
+It is a bit mask and can be set to one of the following values, or to a bitwise OR of several of them:
+
+```
+enum LogMask {
+  LOG_API       = 0x00000001, //!< API call
+  LOG_CMD       = 0x00000002, //!< Kernel and Copy Commands and Barriers
+  LOG_WAIT      = 0x00000004, //!< Synchronization and waiting for commands to finish
+  LOG_AQL       = 0x00000008, //!< Decode and display AQL packets
+  LOG_QUEUE     = 0x00000010, //!< Queue commands and queue contents
+  LOG_SIG       = 0x00000020, //!< Signal creation, allocation, pool
+  LOG_LOCK      = 0x00000040, //!< Locks and thread-safety code.
+  LOG_KERN      = 0x00000080, //!< kernel creations and arguments, etc.
+  LOG_COPY      = 0x00000100, //!< Copy debug
+  LOG_COPY2     = 0x00000200, //!< Detailed copy debug
+  LOG_RESOURCE  = 0x00000400, //!< Resource allocation, performance-impacting events.
+  LOG_INIT      = 0x00000800, //!< Initialization and shutdown
+  LOG_MISC      = 0x00001000, //!< misc debug, not yet classified
+  LOG_AQL2      = 0x00002000, //!< Show raw bytes of AQL packet
+  LOG_CODE      = 0x00004000, //!< Show code creation debug
+  LOG_CMD2      = 0x00008000, //!< More detailed command info, including barrier commands
+  LOG_LOCATION  = 0x00010000, //!< Log message location
+  LOG_ALWAYS    = 0xFFFFFFFF, //!< Log always even mask flag is zero
+};
+```
+
+Once AMD_LOG_LEVEL is set, the logging mask defaults to the value 0x7FFFFFFF.
+However, to log only particular kinds of functionality, the logging mask can also be set explicitly via the environment variable,
+
+  - AMD_LOG_MASK
+
+## HIP Logging command:
+
+To print HIP logging information, the following macro is defined:
+```
+#define ClPrint(level, mask, format, ...) \
+  do { \
+    if (AMD_LOG_LEVEL >= level) { \
+      if (AMD_LOG_MASK & mask || mask == amd::LOG_ALWAYS) { \
+        if (AMD_LOG_MASK & amd::LOG_LOCATION) { \
+          amd::log_printf(level, __FILENAME__, __LINE__, format, ##__VA_ARGS__); \
+        } else { \
+          amd::log_printf(level, "", 0, format, ##__VA_ARGS__); \
+        } \
+      } \
+    } \
+  } while (false)
+```
+
+So in HIP code, call the ClPrint() macro with the proper input variables as needed, for example,
+```
+ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Initializing HSA stack.");
+```
+
+## HIP Logging Example:
+
+Below is an example to enable HIP logging and get logging information during execution of hipinfo,
+
+```
+user@user-test:~/hip/bin$ export AMD_LOG_LEVEL=4
+user@user-test:~/hip/bin$ ./hipinfo
+
+:3:rocdevice.cpp            :453 : 23647210092: Initializing HSA stack.
+:3:comgrctx.cpp             :33  : 23647639336: Loading COMGR library.
+:3:rocdevice.cpp            :203 : 23647687108: Numa select cpu agent[0]=0x13407c0(fine=0x13409a0,coarse=0x1340ad0) for gpu agent=0x1346150
+:4:runtime.cpp              :82  : 23647698669: init
+:3:hip_device_runtime.cpp   :473 : 23647698869: 5617 : [7fad295dd840] hipGetDeviceCount: Returned hipSuccess
+:3:hip_device_runtime.cpp   :502 : 23647698990: 5617 : [7fad295dd840] hipSetDevice ( 0 )
+:3:hip_device_runtime.cpp   :507 : 23647699042: 5617 : [7fad295dd840] hipSetDevice: Returned hipSuccess
+--------------------------------------------------------------------------------
+device#                           0
+:3:hip_device.cpp           :150 : 23647699276: 5617 : [7fad295dd840] hipGetDeviceProperties ( 0x7ffdbe7db730, 0 )
+:3:hip_device.cpp           :237 : 23647699335: 5617 : [7fad295dd840] hipGetDeviceProperties: Returned hipSuccess
+Name:                             Device 7341
+pciBusID:                         3
+pciDeviceID:                      0
+pciDomainID:                      0
+multiProcessorCount:              11
+maxThreadsPerMultiProcessor:      2560
+isMultiGpuBoard:                  0
+clockRate:                        1900 Mhz
+memoryClockRate:                  875 Mhz
+memoryBusWidth:                   0
+clockInstructionRate:             1000 Mhz
+totalGlobalMem:                   7.98 GB
+maxSharedMemoryPerMultiProcessor: 64.00 KB
+totalConstMem:                    8573157376
+sharedMemPerBlock:                64.00 KB
+canMapHostMemory:                 1
+regsPerBlock:                     0
+warpSize:                         32
+l2CacheSize:                      0
+computeMode:                      0
+maxThreadsPerBlock:               1024
+maxThreadsDim.x:                  1024
+maxThreadsDim.y:                  1024
+maxThreadsDim.z:                  1024
+maxGridSize.x:                    2147483647
+maxGridSize.y:                    2147483647
+maxGridSize.z:                    2147483647
+major:                            10
+minor:                            12
+concurrentKernels:                1
+cooperativeLaunch:                0
+cooperativeMultiDeviceLaunch:     0
+arch.hasGlobalInt32Atomics:       1
+arch.hasGlobalFloatAtomicExch:    1
+arch.hasSharedInt32Atomics:       1
+arch.hasSharedFloatAtomicExch:    1
+arch.hasFloatAtomicAdd:           1
+arch.hasGlobalInt64Atomics:       1
+arch.hasSharedInt64Atomics:       1
+arch.hasDoubles:                  1
+arch.hasWarpVote:                 1
+arch.hasWarpBallot:               1
+arch.hasWarpShuffle:              1
+arch.hasFunnelShift:              0
+arch.hasThreadFenceSystem:        1
+arch.hasSyncThreadsExt:           0
+arch.hasSurfaceFuncs:             0
+arch.has3dGrid:                   1
+arch.hasDynamicParallelism:       0
+gcnArch:                          1012
+isIntegrated:                     0
+maxTexture1D:                     65536
+maxTexture2D.width:               16384
+maxTexture2D.height:              16384
+maxTexture3D.width:               2048
+maxTexture3D.height:              2048
+maxTexture3D.depth:               2048
+isLargeBar:                       0
+:3:hip_device_runtime.cpp   :471 : 23647701557: 5617 : [7fad295dd840] hipGetDeviceCount ( 0x7ffdbe7db714 )
+:3:hip_device_runtime.cpp   :473 : 23647701608: 5617 : [7fad295dd840] hipGetDeviceCount: Returned hipSuccess
+:3:hip_peer.cpp             :76  : 23647701731: 5617 : [7fad295dd840] hipDeviceCanAccessPeer ( 0x7ffdbe7db728, 0, 0 )
+:3:hip_peer.cpp             :60  : 23647701784: 5617 : [7fad295dd840] canAccessPeer: Returned hipSuccess
+:3:hip_peer.cpp             :77  : 23647701831: 5617 : [7fad295dd840] hipDeviceCanAccessPeer: Returned hipSuccess
+peers:
+:3:hip_peer.cpp             :76  : 23647701921: 5617 : [7fad295dd840] hipDeviceCanAccessPeer ( 0x7ffdbe7db728, 0, 0 )
+:3:hip_peer.cpp             :60  : 23647701965: 5617 : [7fad295dd840] canAccessPeer: Returned hipSuccess
+:3:hip_peer.cpp             :77  : 23647701998: 5617 : [7fad295dd840] hipDeviceCanAccessPeer: Returned hipSuccess
+non-peers:                        device#0
+
+:3:hip_memory.cpp           :345 : 23647702191: 5617 : [7fad295dd840] hipMemGetInfo ( 0x7ffdbe7db718, 0x7ffdbe7db720 )
+:3:hip_memory.cpp           :360 : 23647702243: 5617 : [7fad295dd840] hipMemGetInfo: Returned hipSuccess
+memInfo.total:                    7.98 GB
+memInfo.free:                     7.98 GB (100%)
+```
+
+## HIP Logging Tips:
+
+- HIP logging works for both release and debug versions of a HIP application.
+
+- Logging function with different logging level can be called in the code as needed.
+
+- Information with a logging level less than or equal to AMD_LOG_LEVEL will be printed.
+
+- To save the HIP logging output to a file, redirect the output to that file on the command line when running the application, for example,
+
+```
+user@user-test:~/hip/bin$ ./hipinfo > ~/hip_log.txt
+```
+
@@ -1,72 +0,0 @@
-# Profiling HIP Code
-
-This section describes the tracing and debugging capabilities that HIP provides.  
-<!-- toc -->
-
- [Tracing and Debug](#tracing-and-debug)
-  * [Tracing HIP APIs](#tracing-hip-apis)
-    + [Color](#color)
-
-<!-- tocstop -->
-
-## Tracing and Debug
-
-### Tracing HIP APIs
-The HIP runtime can print the HIP function strings to stderr using HIP_TRACE_API environment variable.
-The trace prints two messages for each API - one at the beginning of the API call (line starts with "<<") and one at the end of the API call (line ends with ">>").
-Here's an example for one API followed by a description for the sections of the trace:
-
-```
-<<hip-api tid:1.6 hipMemcpy (0x7f32154db010, 0x50446e000, 4000000, hipMemcpyDeviceToHost)
-  hip-api tid:1.6 hipMemcpy                      ret= 0 (hipSuccess)>>
-```
-
- `<<hip-api` is the header used for all HIP API debug messages.  The message is also shown in a specific color.  This can be used to distinguish this API from other HIP or application messages.
-  `tid:1.6` indicates that this API call came from thread #1 and is the 6th API call in that thread.   When the first API in a new thread is called, HIP will associates a short sequential ID with that thread.  You can see the full thread ID (reported by C++) as 0x7f6183b097c0 in the example below.  
- `hipMemcpy` is the name of the API.
- The first line then prints a comma-separated list of the arguments to the function.  APIs which return values to the caller by writing to pointers will show the pointer addresses rather than the pointer contents.  This behavior may change in the future.
- The second line shows the completion of the API, including the numeric return value (`ret= 0`) as well as an string representation for the error code (`hipSuccess`).  If the returned error code is non-zero, then the csecond line message is shown in red (unless HIP_TRACE_API_COLOR is "none" - see below).
-
-
-Heres a specific example showing the output of the [square](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/0_Intro/square) program running on HIP:
-
-```
-$ HIP_TRACE_API=1  ./square.hip.out 
-  hip-api tid:1:HIP initialized short_tid#1 (maps to full_tid: 0x7f6183b097c0)
-<<hip-api tid:1.1 hipGetDeviceProperties (0x7ffddb673e08, 0)
-  hip-api tid:1.1 hipGetDeviceProperties         ret= 0 (hipSuccess)>>
-info: running on device gfx803
-info: allocate host mem (  7.63 MB)
-info: allocate device mem (  7.63 MB)
-<<hip-api tid:1.2 hipMalloc (0x7ffddb673fb8, 4000000)
-  hip-api tid:1.2 hipMalloc                      ret= 0 (hipSuccess)>>
-<<hip-api tid:1.3 hipMalloc (0x7ffddb673fb0, 4000000)
-  hip-api tid:1.3 hipMalloc                      ret= 0 (hipSuccess)>>
-info: copy Host2Device
-<<hip-api tid:1.4 hipMemcpy (0x50409d000, 0x7f32158ac010, 4000000, hipMemcpyHostToDevice)
-  hip-api tid:1.4 hipMemcpy                      ret= 0 (hipSuccess)>>
-info: launch 'vector_square' kernel
-1.5 hipLaunchKernel 'HIP_KERNEL_NAME(vector_square)' gridDim:{512,1,1} groupDim:{256,1,1} sharedMem:+0 stream#0.0
-info: copy Device2Host
-<<hip-api tid:1.6 hipMemcpy (0x7f32154db010, 0x50446e000, 4000000, hipMemcpyDeviceToHost)
-  hip-api tid:1.6 hipMemcpy                      ret= 0 (hipSuccess)>>
-info: check result
-PASSED!
-```
-
-HIP_TRACE_API supports multiple levels of debug information:
-   - 0x1 = print all HIP APIs.  This is the most verbose setting; the flags below allow selecting a subset.
-   - 0x2 = print HIP APIs which initiate GPU kernel commands.  Includes hipLaunchKernel, hipLaunchModuleKernel
-   - 0x4 = print HIP APIs which initiate GPU memory commands.  Includes hipMemcpy*, hipMemset*.
-   - 0x8 = print HIP APIs which allocate or free memory.  Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree.
-
-These can be combined.  For example, HIP_TRACE_API=6 shows a concise view of the HIP commands (both kernel and memory) that are sent to the GPU.
-
-
-#### Color
-Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors.
-You can change the color used for the trace mode with the HIP_TRACE_API_COLOR environment variable.  Possible values are None/Red/Green/Yellow/Blue/Magenta/Cyan/White.
-None will disable use of color control codes for both the opening and closing and may be useful when saving the trace file or when a pure text trace is desired.
-
-
-
@@ -94,6 +94,12 @@ if( DEFINED ENV{ROCM_PATH} )
     set(ROCM_PATH "$ENV{ROCM_PATH}")
 endif()

+# Use find_dependency to locate the packages this package depends on.
+# This lets the CMake-generated xxxx-targets file supply the linker libraries
+# without worrying about other transitive dependencies.
+find_dependency(hsa-runtime64)
+find_dependency(Threads)
+
 #get_filename_component cannot resolve the symlinks if called from /opt/rocm/lib/hip
 #and do three level up again
 get_filename_component(_DIR "${CMAKE_CURRENT_LIST_DIR}" REALPATH)
@@ -1131,6 +1131,30 @@ void __syncthreads()
  __barrier(__CLK_LOCAL_MEM_FENCE);
 }

+__device__
+inline
+__attribute__((convergent))
+int __syncthreads_count(int predicate)
+{
+  return __ockl_wgred_add_i32(!!predicate);
+}
+
+__device__
+inline
+__attribute__((convergent))
+int __syncthreads_and(int predicate)
+{
+  return __ockl_wgred_and_i32(!!predicate);
+}
+
+__device__
+inline
+__attribute__((convergent))
+int __syncthreads_or(int predicate)
+{
+  return __ockl_wgred_or_i32(!!predicate);
+}
+
 // hip.amdgcn.bc - device routine
 /*
   HW_ID Register bit structure
@@ -74,6 +74,11 @@ extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(vo

 extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float);

+extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_add_i32(int a);
+extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_and_i32(int a);
+extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_or_i32(int a);
+
+
 // Introduce local address space
 #define __local __attribute__((address_space(3)))

@@ -44,6 +44,7 @@ extern "C"
    __device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
    __device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
    __device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
+    __device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);
    __device__ __attribute__((const)) _Float16 __llvm_amdgcn_rcp_f16(_Float16);
    __device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
    __device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
@@ -121,7 +121,7 @@ extern int HIP_TRACE_API;
 #endif

 // TODO-HCC add a dummy implementation of assert, need to replace with a proper kernel exit call.
-#if __HIP_DEVICE_COMPILE__ == 1
+#if defined(__HCC__) && __HIP_DEVICE_COMPILE__ == 1
 #undef assert
 #define assert(COND)                                                                               \
    {                                                                                              \
@@ -435,10 +435,12 @@ void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
    hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
 }
 #else
-#define hipLaunchKernelGGL(kernelName, numblocks, numthreads, memperblock, streamId, ...)          \
+#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...)  \
    do {                                                                                           \
-        kernelName<<<(numblocks), (numthreads), (memperblock), (streamId)>>>(__VA_ARGS__);         \
+        kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__);         \
    } while (0)
+
+#define hipLaunchKernelGGL(kernelName, ...)  hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
 #endif

 #include <hip/hip_runtime_api.h>
@@ -446,22 +448,22 @@ void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
 #pragma push_macro("__DEVICE__")
 #define __DEVICE__ static __device__ __forceinline__

-extern "C" __device__ size_t __ockl_get_local_id(uint);
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(uint);
 __DEVICE__ uint __hip_get_thread_idx_x() { return __ockl_get_local_id(0); }
 __DEVICE__ uint __hip_get_thread_idx_y() { return __ockl_get_local_id(1); }
 __DEVICE__ uint __hip_get_thread_idx_z() { return __ockl_get_local_id(2); }

-extern "C" __device__ size_t __ockl_get_group_id(uint);
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(uint);
 __DEVICE__ uint __hip_get_block_idx_x() { return __ockl_get_group_id(0); }
 __DEVICE__ uint __hip_get_block_idx_y() { return __ockl_get_group_id(1); }
 __DEVICE__ uint __hip_get_block_idx_z() { return __ockl_get_group_id(2); }

-extern "C" __device__ size_t __ockl_get_local_size(uint);
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(uint);
 __DEVICE__ uint __hip_get_block_dim_x() { return __ockl_get_local_size(0); }
 __DEVICE__ uint __hip_get_block_dim_y() { return __ockl_get_local_size(1); }
 __DEVICE__ uint __hip_get_block_dim_z() { return __ockl_get_local_size(2); }

-extern "C" __device__ size_t __ockl_get_num_groups(uint);
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(uint);
 __DEVICE__ uint __hip_get_grid_dim_x() { return __ockl_get_num_groups(0); }
 __DEVICE__ uint __hip_get_grid_dim_y() { return __ockl_get_num_groups(1); }
 __DEVICE__ uint __hip_get_grid_dim_z() { return __ockl_get_num_groups(2); }
@@ -171,6 +171,9 @@ enum hipLimit_t {
    0x2  ///< Map the allocation into the address space for the current device.  The device pointer
         ///< can be obtained with #hipHostGetDevicePointer.
 #define hipHostMallocWriteCombined 0x4
+#define hipHostMallocNumaUser                                                                      \
+    0x20000000  ///< Host memory allocation will follow numa policy set by user
+
 #define hipHostMallocCoherent                                                                      \
    0x40000000  ///< Allocate coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific
                ///< allocation.
@@ -430,6 +430,9 @@ inline
 float powf(float x, float y) { return __ocml_pow_f32(x, y); }
 __DEVICE__
 inline
+float powif(float base, int iexp) { return __ocml_pown_f32(base, iexp); }
+__DEVICE__
+inline
 float rcbrtf(float x) { return __ocml_rcbrt_f32(x); }
 __DEVICE__
 inline
@@ -985,6 +988,9 @@ inline
 double pow(double x, double y) { return __ocml_pow_f64(x, y); }
 __DEVICE__
 inline
+double powi(double base, int iexp) { return __ocml_pown_f64(base, iexp); }
+__DEVICE__
+inline
 double rcbrt(double x) { return __ocml_rcbrt_f64(x); }
 __DEVICE__
 inline
@@ -1412,6 +1418,7 @@ float func(float x, int y) \
  return func##f(x, y); \
 }
 __DEF_FLOAT_FUN2I(scalbn)
+__DEF_FLOAT_FUN2I(ldexp)

 template<class T>
 __DEVICE__ inline T min(T arg1, T arg2) {
@@ -1510,6 +1517,22 @@ __host__ inline static int min(int arg1, int arg2) {
 __host__ inline static int max(int arg1, int arg2) {
  return std::max(arg1, arg2);
 }
+
+__DEVICE__
+inline float pow(float base, int iexp) {
+  return powif(base, iexp);
+}
+
+__DEVICE__
+inline double pow(double base, int iexp) {
+  return powi(base, iexp);
+}
+
+__DEVICE__
+inline _Float16 pow(_Float16 base, int iexp) {
+  return __ocml_pown_f16(base, iexp);
+}
+
 #endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__

 #pragma pop_macro("__DEF_FLOAT_FUN")
@@ -243,6 +243,9 @@ __attribute__((pure))
 float __ocml_pow_f32(float, float);
 __device__
 __attribute__((pure))
+float __ocml_pown_f32(float, int);
+__device__
+__attribute__((pure))
 float __ocml_rcbrt_f32(float);
 __device__
 __attribute__((const))
@@ -555,6 +558,9 @@ __attribute__((pure))
 double __ocml_pow_f64(double, double);
 __device__
 __attribute__((pure))
+double __ocml_pown_f64(double, int);
+__device__
+__attribute__((pure))
 double __ocml_rcbrt_f64(double);
 __device__
 __attribute__((const))
@@ -138,6 +138,7 @@ typedef struct hipDeviceProp_t {
    int cooperativeMultiDeviceUnmatchedSharedMem;   ///< HIP device supports cooperative launch on multiple
                                                    ///devices with unmatched shared memories
    int isLargeBar;                  ///< 1: if it is a large PCI bar device, else 0
+    int asicRevision;                ///< Revision of the GPU in this device.
 } hipDeviceProp_t;


@@ -345,8 +346,9 @@ typedef enum hipDeviceAttribute_t {
                                                                  ///devices with unmatched grid dimensions
    hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim,    ///< Supports cooperative launch on multiple
                                                                  ///devices with unmatched block dimensions
-    hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem    ///< Supports cooperative launch on multiple
+    hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem,   ///< Supports cooperative launch on multiple
                                                                  ///devices with unmatched shared memories
+    hipDeviceAttributeAsicRevision          ///< Revision of the GPU in this device
 } hipDeviceAttribute_t;

 enum hipComputeMode {
@@ -31,11 +31,13 @@ THE SOFTWARE.

 typedef int hipLaunchParm;

-#define hipLaunchKernelGGL(kernelName, numblocks, numthreads, memperblock, streamId, ...)          \
+#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...)  \
    do {                                                                                           \
-        kernelName<<<numblocks, numthreads, memperblock, streamId>>>(__VA_ARGS__);                 \
+        kernelName<<<numBlocks, numThreads, memPerBlock, streamId>>>(__VA_ARGS__);                 \
    } while (0)

+#define hipLaunchKernelGGL(kernelName, ...)  hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
+
 #define hipReadModeElementType cudaReadModeElementType

 #ifdef __CUDA_ARCH__
@@ -23,9 +23,14 @@ set_target_properties(
        RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR})
 target_include_directories(ca SYSTEM PUBLIC ${HSA_PATH}/include)
 target_include_directories(ca PUBLIC ${PROJECT_SOURCE_DIR}/src)
-find_library(
-    hsart NAMES libhsa-runtime64.so libhsa-runtime64.so.1 HINTS ${HSA_PATH}/lib)
-target_link_libraries(ca PUBLIC ${hsart})
+
+find_package(hsa-runtime64 REQUIRED CONFIG
+  PATHS
+    /opt/rocm/
+  PATH_SUFFIXES
+    cmake/hsa-runtime64)
+
+target_link_libraries(ca PUBLIC hsa-runtime64::hsa-runtime64 )
 target_compile_options(ca PUBLIC -DDISABLE_REDUCED_GPU_BLOB_COPY -Wall)

 install(TARGETS ca RUNTIME DESTINATION bin)
@@ -27,12 +27,16 @@ set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
 set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
 set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
+
 set(CPACK_BINARY_DEB "ON")
+set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
 set(CPACK_DEBIAN_PACKAGE_DEPENDS "perl (>= 5.0),libfile-which-perl")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-base")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_base")
+
 set(CPACK_BINARY_RPM "ON")
+set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
 set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
@@ -30,11 +30,15 @@ set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
 set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
 set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
+
 set(CPACK_BINARY_DEB "ON")
+set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
 set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-doc")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_doc")
+
 set(CPACK_BINARY_RPM "ON")
+set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
 string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
@@ -16,12 +16,16 @@ set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
 set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
 set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
+
 set(CPACK_BINARY_DEB "ON")
+set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
 set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), cuda (>= 7.5)")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-nvcc")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_nvcc")
+
 set(CPACK_BINARY_RPM "ON")
+set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
 set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
@@ -1,12 +1,15 @@
 cmake_minimum_required(VERSION 2.8.3)
 project(hip_rocclr)

-install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.so DESTINATION lib)
-install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.so.@HIP_LIB_VERSION_MAJOR@ DESTINATION lib)
-install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.so.@HIP_LIB_VERSION_STRING@ DESTINATION lib)
-install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64_static.a DESTINATION lib)
-install(FILES @PROJECT_BINARY_DIR@/lib/libhip_hcc.so DESTINATION lib)
-install(FILES @PROJECT_BINARY_DIR@/lib/libhiprtc.so DESTINATION lib)
+if(@BUILD_SHARED_LIBS@)
+    install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.so DESTINATION lib)
+    install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.so.@HIP_LIB_VERSION_MAJOR@ DESTINATION lib)
+    install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.so.@HIP_LIB_VERSION_STRING@ DESTINATION lib)
+    install(FILES @PROJECT_BINARY_DIR@/lib/libhip_hcc.so DESTINATION lib)
+    install(FILES @PROJECT_BINARY_DIR@/lib/libhiprtc.so DESTINATION lib)
+else()
+    install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.a DESTINATION lib)
+endif()

 install(FILES @PROJECT_BINARY_DIR@/.hipInfo DESTINATION lib)
 install(FILES @PROJECT_BINARY_DIR@/hip-config.cmake @PROJECT_BINARY_DIR@/hip-config-version.cmake DESTINATION lib/cmake/hip)
@@ -38,22 +41,22 @@ set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
 set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
 set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
+
 set(CPACK_BINARY_DEB "ON")
+set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
 set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocm-utils, hip-base (= ${CPACK_PACKAGE_VERSION}),  comgr (>= 1.1), llvm-amdgpu")
-set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip_rocclr, hip-hcc (= ${CPACK_PACKAGE_VERSION})")
-set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_rocclr")
-set(CPACK_DEBIAN_PACKAGE_CONFLICTS "hip_rocclr")
+set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc (= ${CPACK_PACKAGE_VERSION})")
+
 set(CPACK_BINARY_RPM "ON")
+set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
 set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
 string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
 set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocm-utils, hip-base = ${HIP_BASE_VERSION},  comgr >= 1.1, llvm-amdgpu")
-set(CPACK_RPM_PACKAGE_PROVIDES "hip_rocclr, hip-hcc = ${HIP_BASE_VERSION}")
-set(CPACK_RPM_PACKAGE_OBSOLETES "hip_rocclr")
-set(CPACK_RPM_PACKAGE_CONFLICTS "hip_rocclr")
+set(CPACK_RPM_PACKAGE_PROVIDES "hip-hcc = ${HIP_BASE_VERSION}")
 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
 set(CPACK_SOURCE_GENERATOR "TGZ")
 include(CPack)
@@ -18,11 +18,15 @@ set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
 set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
 set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
+
 set(CPACK_BINARY_DEB "ON")
+set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
 set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-samples")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_samples")
+
 set(CPACK_BINARY_RPM "ON")
+set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
 string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
@@ -71,6 +71,14 @@ set(CMAKE_MODULE_PATH${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "$

 add_definitions(-DUSE_COMGR_LIBRARY -DCOMGR_DYN_DLL)

+
+find_package(hsa-runtime64 REQUIRED CONFIG
+   PATHS
+     /opt/rocm/lib
+   PATH_SUFFIXES
+     cmake/hsa-runtime64
+)
+
 find_package(amd_comgr REQUIRED CONFIG
  PATHS
    /opt/rocm/
@@ -163,26 +171,33 @@ add_dependencies(hip64 gen-prof-api-str-header)
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)

-add_library(amdhip64 SHARED
-    $<TARGET_OBJECTS:hip64>
-    )

-add_library(amdhip64_static STATIC
-    $<TARGET_OBJECTS:hip64>
-    )
+if(BUILD_SHARED_LIBS)  # pass the variable name; if() dereferences it itself

-set_target_properties(
-    amdhip64 PROPERTIES
-        VERSION ${HIP_LIB_VERSION_STRING}
-        SOVERSION ${HIP_LIB_VERSION_MAJOR}
-        )
+  add_library(amdhip64 SHARED  # explicit type, mirrors the STATIC branch below
+      $<TARGET_OBJECTS:hip64>
+  )
+
+  set_target_properties(
+      amdhip64 PROPERTIES
+         VERSION ${HIP_LIB_VERSION_STRING}
+         SOVERSION ${HIP_LIB_VERSION_MAJOR}
+      )
+
+  set_target_properties(hip64 PROPERTIES PUBLIC_HEADER ${PROF_API_STR})
+
+else()
+
+   add_library(amdhip64 STATIC
+      $<TARGET_OBJECTS:hip64>
+      )
+
+endif()

-set_target_properties(hip64 PROPERTIES PUBLIC_HEADER ${PROF_API_STR})
 set_target_properties(amdhip64 PROPERTIES PUBLIC_HEADER ${PROF_API_STR})
-set_target_properties(amdhip64_static PROPERTIES PUBLIC_HEADER ${PROF_API_STR})
-
 add_library(host INTERFACE)
-target_link_libraries(host INTERFACE amdhip64)
+target_link_libraries(host INTERFACE hip::amdhip64)
+
 add_library(device INTERFACE)
 target_link_libraries(device INTERFACE host)
 # TODO: we may create host_static and device_static to let app
@@ -190,30 +205,37 @@ target_link_libraries(device INTERFACE host)

 # FIXME: Linux convention is to create static library with same base
 # filename.
-target_link_libraries(amdhip64 PRIVATE amdrocclr_static Threads::Threads dl)
-target_link_libraries(amdhip64_static PRIVATE Threads::Threads dl)

-# combine objects of vid and hip into amdhip64_static
-add_custom_target(
-    amdhip64_static_combiner
-    ALL
-    COMMAND rm -rf  static_lib_temp && mkdir static_lib_temp && cd static_lib_temp # Create temp folder to contain *.o
-    COMMAND ${CMAKE_AR} -x $<TARGET_FILE:amdrocclr_static> # Extract *.o from amdrocclr_static
-    COMMAND ${CMAKE_AR} -rcs $<TARGET_FILE:amdhip64_static> *.o # Append *.o to amdhip64_static
-    COMMAND cd .. && rm -rf  static_lib_temp # Remove temp folder
-    DEPENDS amdhip64_static amdrocclr_static # To make sure this is the last step
-    COMMENT "Combining static libs into amdhip64_static"
-)

-INSTALL(PROGRAMS $<TARGET_FILE:amdhip64_static> DESTINATION lib COMPONENT MAIN)
-INSTALL(PROGRAMS $<TARGET_FILE:amdhip64> DESTINATION lib COMPONENT MAIN)
-INSTALL(CODE "execute_process( COMMAND ${CMAKE_COMMAND} -E create_symlink libamdhip64.so lib/libhip_hcc.so  )" DESTINATION lib COMPONENT MAIN)
+if(BUILD_SHARED_LIBS)  # pass the variable name; if() dereferences it itself
+    target_link_libraries(amdhip64 PRIVATE amdrocclr_static Threads::Threads dl numa hsa-runtime64::hsa-runtime64)
+    INSTALL(PROGRAMS $<TARGET_FILE:amdhip64> DESTINATION lib COMPONENT MAIN)
+    INSTALL(CODE "execute_process( COMMAND ${CMAKE_COMMAND} -E create_symlink libamdhip64.so lib/libhip_hcc.so  )" DESTINATION lib COMPONENT MAIN)

-INSTALL(CODE "execute_process( COMMAND ${CMAKE_COMMAND} -E create_symlink libamdhip64.so lib/libhiprtc.so  )" DESTINATION lib COMPONENT MAIN)
-INSTALL(FILES ${CMAKE_BINARY_DIR}/lib/libhip_hcc.so DESTINATION lib COMPONENT MAIN)
+    INSTALL(CODE "execute_process( COMMAND ${CMAKE_COMMAND} -E create_symlink libamdhip64.so lib/libhiprtc.so  )" DESTINATION lib COMPONENT MAIN)
+    INSTALL(FILES ${CMAKE_BINARY_DIR}/lib/libhip_hcc.so DESTINATION lib COMPONENT MAIN)

-INSTALL(FILES ${CMAKE_BINARY_DIR}/lib/libhiprtc.so DESTINATION lib COMPONENT MAIN)
+    INSTALL(FILES ${CMAKE_BINARY_DIR}/lib/libhiprtc.so DESTINATION lib COMPONENT MAIN)

-INSTALL(TARGETS amdhip64 amdhip64_static host device EXPORT hip-targets DESTINATION ${LIB_INSTALL_DIR})
+else()
+
+    target_link_libraries(amdhip64 PRIVATE Threads::Threads dl numa hsa-runtime64::hsa-runtime64 amd_comgr)
+    # Merge the objects of amdrocclr_static into the static amdhip64 archive
+    add_custom_target(
+       amdhip64_static_combiner
+       ALL
+       COMMAND rm -rf  static_lib_temp && mkdir static_lib_temp && cd static_lib_temp # Create temp folder to contain *.o
+       COMMAND ${CMAKE_AR} -x $<TARGET_FILE:amdrocclr_static> # Extract *.o from amdrocclr_static
+       COMMAND ${CMAKE_AR} -rcs $<TARGET_FILE:amdhip64> *.o # Append *.o to the amdhip64 archive
+       COMMAND cd .. && rm -rf  static_lib_temp # Remove temp folder
+       DEPENDS amdhip64 amdrocclr_static # To make sure this is the last step
+       COMMENT "Combining static libs into amdhip64"
+    )
+
+    INSTALL(PROGRAMS $<TARGET_FILE:amdhip64> DESTINATION lib COMPONENT MAIN)
+
+endif()
+
+INSTALL(TARGETS amdhip64 host device EXPORT hip-targets DESTINATION ${LIB_INSTALL_DIR})
 INSTALL(EXPORT hip-targets DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} NAMESPACE hip::)

@@ -89,8 +89,7 @@ hipError_t CodeObject::extractCodeObjectFromFatBinary(const void* data,
  if (num_code_objs == devices.size()) {
    return hipSuccess;
  } else {
-    DevLogError("hipErrorNoBinaryForGpu: Coudn't find binary for current devices!");
-    guarantee(false);
+    guarantee(false && "hipErrorNoBinaryForGpu: Coudn't find binary for current devices!");
    return hipErrorNoBinaryForGpu;
  }
 }
@@ -92,15 +92,23 @@ amd::HostQueue* getQueue(hipStream_t stream) {
  }
 }

+// ================================================================================================
 amd::HostQueue* getNullStream(amd::Context& ctx) {
- for (auto& it : g_devices) {
-   if (it->asContext() == &ctx) {
-     return it->NullStream();
-   }
- }
- return nullptr;
+  for (auto& it : g_devices) {
+    if (it->asContext() == &ctx) {
+      return it->NullStream();
+    }
+  }
+  // If it's a pure SVM allocation with system memory access, then it shouldn't matter which device
+  // runtime selects by default
+  if (hip::host_device->asContext() == &ctx) {
+    // Return current...
+    return getNullStream();
+  }
+  return nullptr;
 }

+// ================================================================================================
 amd::HostQueue* getNullStream() {
  Device* device = getCurrentDevice();
  return device ? device->NullStream() : nullptr;
@@ -186,9 +194,6 @@ hipError_t hipCtxDestroy(hipCtx_t ctx) {
    HIP_RETURN(hipErrorInvalidValue);
  }

-  // Release last tracked command
-  hip::getNullStream()->setLastQueuedCommand(nullptr);
-
  // Need to remove the ctx of calling thread if its the top one
  if (!g_ctxtStack.empty() && g_ctxtStack.top() == dev) {
    g_ctxtStack.pop();
@@ -898,6 +898,6 @@ HIP_RESOURCE_VIEW_DESC getResourceViewDesc(const hipResourceViewDesc& resViewDes

 inline
 size_t getElementSize(const hipChannelFormatDesc &desc) {
-  return (desc.x / 4) * getNumChannels(desc);
+  return (desc.x / 8) * getNumChannels(desc);
 }
 };
@@ -233,6 +233,7 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device )
  deviceProps.kernelExecTimeoutEnabled = 0;
  deviceProps.ECCEnabled = info.errorCorrectionSupport_? 1:0;
  deviceProps.isLargeBar = info.largeBar_ ? 1 : 0;
+  deviceProps.asicRevision = info.asicRevision_;

  *props = deviceProps;
  HIP_RETURN(hipSuccess);
@@ -296,6 +296,9 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device)
  case hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem:
    *pi = prop.cooperativeMultiDeviceUnmatchedSharedMem;
    break;
+  case hipDeviceAttributeAsicRevision:
+    *pi = prop.asicRevision;
+     break;
  default:
    HIP_RETURN(hipErrorInvalidValue);
  }
@@ -99,7 +99,7 @@ hipError_t Event::elapsedTime(Event& eStop, float& ms) {
    // Events are the same, which indicates the stream is empty and likely
    // eventRecord is called on another stream. For such cases insert and measure a
    // marker.
-    amd::Command* command = new amd::Marker(*event_->command().queue(), false);
+    amd::Command* command = new amd::Marker(*event_->command().queue(), kMarkerDisableFlush);
    command->enqueue();
    command->awaitCompletion();
    ms = static_cast<float>(static_cast<int64_t>(command->event().profilingInfo().end_ -
@@ -127,7 +127,7 @@ hipError_t Event::streamWait(amd::HostQueue* hostQueue, uint flags) {
  amd::Command::EventWaitList eventWaitList;
  eventWaitList.push_back(event_);

-  amd::Command* command = new amd::Marker(*hostQueue, false, eventWaitList);
+  amd::Command* command = new amd::Marker(*hostQueue, kMarkerDisableFlush, eventWaitList);
  if (command == NULL) {
    return hipErrorOutOfMemory;
  }
@@ -230,6 +230,7 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop) {
  HIP_RETURN(eStart->elapsedTime(*eStop, *ms), "Elapsed Time = ", *ms);
 }

+// ================================================================================================
 hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) {
  HIP_INIT_API(hipEventRecord, event, stream);

@@ -237,19 +238,21 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) {
    HIP_RETURN(hipErrorInvalidHandle);
  }

+  hip::Event* e = reinterpret_cast<hip::Event*>(event);
+  amd::ScopedLock lock(e->lock());
+
  amd::HostQueue* queue = hip::getQueue(stream);
  amd::Command* command = queue->getLastQueuedCommand(true);
  if (command == nullptr) {
-    command = new amd::Marker(*queue, false);
+    command = new amd::Marker(*queue, kMarkerDisableFlush);
    command->enqueue();
  }

-  hip::Event* e = reinterpret_cast<hip::Event*>(event);
  e->addMarker(queue, command, true);
-
  HIP_RETURN(hipSuccess);
 }

+// ================================================================================================
 hipError_t hipEventSynchronize(hipEvent_t event) {
  HIP_INIT_API(hipEventSynchronize, event);

@@ -37,7 +37,7 @@ public:

 class Event {
 public:
-  Event(unsigned int flags) : flags(flags), lock_("hipEvent_t"),
+  Event(unsigned int flags) : flags(flags), lock_("hipEvent_t", true),
                              event_(nullptr), recorded_(false) {
    // No need to init event_ here as addMarker does that
  }
@@ -56,6 +56,8 @@ public:

  void addMarker(amd::HostQueue* queue, amd::Command* command, bool record);

+  amd::Monitor& lock() { return lock_; }
+
 private:
  amd::Monitor lock_;
  amd::HostQueue* stream_;
@@ -19,7 +19,7 @@ DeviceVar::DeviceVar(std::string name, hipModule_t hmod) : shadowVptr(nullptr),
  }

  if(!dev_program->createGlobalVarObj(&amd_mem_obj_, &device_ptr_, &size_, name.c_str())) {
-    DevLogPrintfError("Cannot create Global Var obj for symbol: %s \n", name);
+    DevLogPrintfError("Cannot create Global Var obj for symbol: %s \n", name.c_str());
    guarantee(false);
  }

@@ -54,13 +54,13 @@ DeviceFunc::DeviceFunc(std::string name, hipModule_t hmod) : dflock_("function l

  const amd::Symbol *symbol = program->findSymbol(name.c_str());
  if (symbol == nullptr) {
-    DevLogPrintfError("Cannot find Symbol with name: %s \n", name);
+    DevLogPrintfError("Cannot find Symbol with name: %s \n", name.c_str());
    guarantee(false);
  }

  kernel_ = new amd::Kernel(*program, *symbol, name);
  if (kernel_ == nullptr) {
-    DevLogPrintfError("Cannot create kernel with name: %s \n", name);
+    DevLogPrintfError("Cannot create kernel with name: %s \n", name.c_str());
    guarantee(false);
  }
 }
@@ -98,7 +98,7 @@ hipError_t Function::getDynFunc(hipFunction_t* hfunc, hipModule_t hmod) {
 hipError_t Function::getStatFunc(hipFunction_t* hfunc, int deviceId) {
  guarantee(modules_ != nullptr);
  guarantee(deviceId >= 0);
-  guarantee(deviceId < modules_->size());
+  guarantee(static_cast<size_t>(deviceId) < modules_->size());

  hipModule_t module = (*modules_)[deviceId].first;
  FatBinaryMetaInfo* fb_meta = (*modules_)[deviceId].second;
@@ -121,7 +121,7 @@ hipError_t Function::getStatFunc(hipFunction_t* hfunc, int deviceId) {
 hipError_t Function::getStatFuncAttr(hipFuncAttributes* func_attr, int deviceId) {
  guarantee(modules_ != nullptr);
  guarantee(deviceId >= 0);
-  guarantee(deviceId < modules_->size());
+  guarantee(static_cast<size_t>(deviceId) < modules_->size());

  hipModule_t module = (*modules_)[deviceId].first;
  FatBinaryMetaInfo* fb_meta = (*modules_)[deviceId].second;
@@ -141,11 +141,19 @@ hipError_t Function::getStatFuncAttr(hipFuncAttributes* func_attr, int deviceId)

  amd::Kernel* kernel = dFunc_[deviceId]->kernel();
  const device::Kernel::WorkGroupInfo* wginfo = kernel->getDeviceKernel(*devices[deviceId])->workGroupInfo();
+  func_attr->sharedSizeBytes = static_cast<int>(wginfo->localMemSize_);
+  func_attr->binaryVersion = static_cast<int>(kernel->signature().version());
+  func_attr->cacheModeCA = 0;
+  func_attr->constSizeBytes = 0;
  func_attr->localSizeBytes = wginfo->privateMemSize_;
-  func_attr->sharedSizeBytes = wginfo->localMemSize_;
-  func_attr->maxDynamicSharedSizeBytes = wginfo->availableLDSSize_ - wginfo->localMemSize_;
-  func_attr->maxThreadsPerBlock = wginfo->size_;
-  func_attr->numRegs = wginfo->usedVGPRs_;
+  func_attr->maxDynamicSharedSizeBytes = static_cast<int>(wginfo->availableLDSSize_
+                                                          - wginfo->localMemSize_);
+
+  func_attr->maxThreadsPerBlock = static_cast<int>(wginfo->size_);
+  func_attr->numRegs = static_cast<int>(wginfo->usedVGPRs_);
+  func_attr->preferredShmemCarveout = 0;
+  func_attr->ptxVersion = 30;
+

  return hipSuccess;
 }
@@ -166,7 +174,7 @@ Var::~Var() {

 hipError_t Var::getDeviceVar(DeviceVar** dvar, int deviceId, hipModule_t hmod) {
  guarantee(deviceId >= 0);
-  guarantee(deviceId < g_devices.size());
+  guarantee(static_cast<size_t>(deviceId) < g_devices.size());
  guarantee(dVar_.size() == g_devices.size());

  if (dVar_[deviceId] == nullptr) {
@@ -179,7 +187,7 @@ hipError_t Var::getDeviceVar(DeviceVar** dvar, int deviceId, hipModule_t hmod) {

 hipError_t Var::getStatDeviceVar(DeviceVar** dvar, int deviceId) {
  guarantee(deviceId >= 0);
-  guarantee(deviceId < g_devices.size());
+  guarantee(static_cast<size_t>(deviceId) < g_devices.size());

  hipModule_t module = (*modules_)[deviceId].first;
  FatBinaryMetaInfo* fb_meta = (*modules_)[deviceId].second;
@@ -112,7 +112,8 @@ hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device,
 hipError_t hipMemAdvise(const void* dev_ptr, size_t count, hipMemoryAdvise advice, int device) {
  HIP_INIT_API(hipMemAdvise, dev_ptr, count, advice, device);

-  if ((dev_ptr == nullptr) || (count == 0) || (device >= g_devices.size())) {
+  if ((dev_ptr == nullptr) || (count == 0) ||
+      (static_cast<size_t>(device) >= g_devices.size())) {
    HIP_RETURN(hipErrorInvalidValue);
  }
  amd::Device* dev = g_devices[device]->devices()[0];
@@ -236,9 +236,13 @@ extern hipError_t ihipDeviceGetCount(int* count);
 extern int ihipGetDevice();
 extern hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags);
 extern amd::Memory* getMemoryObject(const void* ptr, size_t& offset);
+extern amd::Memory* getMemoryObjectWithOffset(const void* ptr, const size_t size);
 extern bool CL_CALLBACK getSvarInfo(cl_program program, std::string var_name, void** var_addr,
                                    size_t* var_size);

 constexpr bool kOptionChangeable = true;
 constexpr bool kNewDevProg = false;
+
+constexpr bool kMarkerDisableFlush = true;   //!< Avoids command batch flush in ROCclr
+
 #endif // HIP_SRC_HIP_INTERNAL_H
@@ -43,6 +43,27 @@ amd::Memory* getMemoryObject(const void* ptr, size_t& offset) {
  return memObj;
 }

+// ================================================================================================
+// Looks up the amd::Memory object that getMemoryObject() resolves for `ptr` and wraps `size`
+// bytes of it, starting at the offset reported for `ptr`, in a newly created amd::Buffer.
+// Returns nullptr when no memory object is found for `ptr`, or when the new buffer cannot be
+// allocated or created. On success the returned object is a new allocation; the callers in this
+// change release() it once they are done with it.
+amd::Memory* getMemoryObjectWithOffset(const void* ptr, const size_t size) {
+  // Zero-initialized like the other getMemoryObject() call sites, so the value is defined even
+  // if the lookup fails.
+  size_t offset = 0;
+  amd::Memory* parent = getMemoryObject(ptr, offset);
+  if (parent == nullptr) {
+    return nullptr;
+  }
+
+  // The requested window must fit inside the parent allocation.
+  assert(size <= (parent->getSize() - offset));
+  amd::Memory* view =
+      new (parent->getContext()) amd::Buffer(*parent, parent->getMemFlags(), offset, size);
+  if (view == nullptr) {
+    return nullptr;
+  }
+
+  if (!view->create(nullptr)) {
+    // Drop the half-constructed buffer so it is not leaked.
+    view->release();
+    return nullptr;
+  }
+
+  return view;
+}
+
 // ================================================================================================
 hipError_t ihipFree(void *ptr)
 {
@@ -157,10 +178,9 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
              *srcMemory->asBuffer(), sOffset, sizeBytes, dst);
    isAsync = false;
  } else if ((srcMemory != nullptr) && (dstMemory != nullptr)) {
-    if ((kind == hipMemcpyDeviceToDevice) &&
-        // Check if the queue device doesn't match the device on any memory object. Hence
-        // it's a P2P transfer, because the app has requested access to another GPU
-        (srcMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0])) {
+    // Check if the queue device doesn't match the device on any memory object. Hence
+    // it's a P2P transfer, because the app has requested access to another GPU
+    if (srcMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) {
      command = new amd::CopyMemoryP2PCommand(queue, CL_COMMAND_COPY_BUFFER, waitList,
          *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes);
      if (command == nullptr) {
@@ -236,6 +256,10 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) {
    ihipFlags |= CL_MEM_SVM_ATOMICS;
  }

+  if (flags & hipHostMallocNumaUser) {
+    ihipFlags |= CL_MEM_FOLLOW_USER_NUMA_POLICY;
+  }
+
  HIP_RETURN(ihipMalloc(ptr, sizeBytes, ihipFlags), *ptr);
 }

@@ -1662,11 +1686,10 @@ hipError_t ihipMemset(void* dst, int64_t value, size_t valueSize, size_t sizeByt
  size_t offset = 0;
  auto aligned_dst = amd::alignUp(reinterpret_cast<address>(dst), sizeof(uint64_t));

-  amd::Memory* memory = getMemoryObject(aligned_dst, offset);
+  amd::Memory* memory = getMemoryObject(dst, offset);
  if (memory == nullptr) {
-    // Host alloced memory
-    memset(dst, value, sizeBytes);
-    return hipSuccess;
+    // dst ptr is host ptr hence error
+    return hipErrorInvalidValue;
  }

  hipError_t hip_error = hipSuccess;
@@ -1678,6 +1701,7 @@ hipError_t ihipMemset(void* dst, int64_t value, size_t valueSize, size_t sizeByt
  if (sizeBytes/sizeof(int64_t) > 0) {
    n_head_bytes = static_cast<uint8_t*>(aligned_dst) - static_cast<uint8_t*>(dst);
    n_tail_bytes = ((sizeBytes - n_head_bytes) % sizeof(int64_t));
+    offset = offset + n_head_bytes;
    size_t n_bytes = sizeBytes - n_tail_bytes - n_head_bytes;
    if (n_bytes > 0) {
      if (valueSize == sizeof(int8_t)) {
@@ -1795,7 +1819,8 @@ hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr,
  amd::Coord3D origin(offset);
  amd::Coord3D region(pitchedDevPtr.xsize, pitchedDevPtr.ysize, extent.depth);
  amd::BufferRect rect;
-  if (!rect.create(static_cast<size_t*>(origin), static_cast<size_t*>(region), pitchedDevPtr.pitch, 0)) {
+  if (pitchedDevPtr.pitch == 0 ||
+      !rect.create(static_cast<size_t*>(origin), static_cast<size_t*>(region), pitchedDevPtr.pitch, 0)) {
    return hipErrorInvalidValue;
  }

@@ -1826,12 +1851,7 @@ hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr,
      command->release();
    }
  } else {
-    for (size_t slice = 0; slice < extent.depth; slice++) {
-      for (size_t row = 0; row < extent.height; row++) {
-        const size_t rowOffset = rect.offset(0, row, slice);
-        std::memset(pitchedDevPtr.ptr, value, extent.width);
-      }
-    }
+	return hipErrorInvalidValue;
  }

  return hipSuccess;
@@ -1878,36 +1898,19 @@ hipError_t hipMemAllocHost(void** ptr, size_t size) {
 hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* dev_ptr) {
  HIP_INIT_API(hipIpcGetMemHandle, handle, dev_ptr);

-  size_t offset = 0;
-  amd::Memory* amd_mem_obj = nullptr;
-  device::Memory* dev_mem_obj = nullptr;
+  amd::Device* device = nullptr;
  ihipIpcMemHandle_t* ihandle = nullptr;

  if ((handle == nullptr) || (dev_ptr == nullptr)) {
    HIP_RETURN(hipErrorInvalidValue);
  }

-  /* Get AMD::Memory object corresponding to this pointer */
-  amd_mem_obj = getMemoryObject(dev_ptr, offset);
-  if (amd_mem_obj == nullptr) {
-    DevLogPrintfError("Cannot retrieve amd_mem_obj for dev_ptr: 0x%x with offset: %u \n",
-                      dev_ptr, offset);
-    HIP_RETURN(hipErrorInvalidDevicePointer);
-  }
-
-  /* Get Device::Memory object pointer */
-  dev_mem_obj = amd_mem_obj->getDeviceMemory(*hip::getCurrentDevice()->devices()[0],false);
-  if (dev_mem_obj == nullptr) {
-    DevLogPrintfError("Cannot get Device memory for amd_mem_obj: 0x%x dev_ptr: 0x%x offset: %u \n",
-                      amd_mem_obj, dev_ptr, offset);
-    HIP_RETURN(hipErrorInvalidDevicePointer);
-  }
-
-  /* Create an handle for IPC. Store the memory size inside the handle */
+  device = hip::getCurrentDevice()->devices()[0];
  ihandle = reinterpret_cast<ihipIpcMemHandle_t *>(handle);
-  if(!dev_mem_obj->IpcCreate(offset, &(ihandle->psize), &(ihandle->ipc_handle))) {
+
+  if(!device->IpcCreate(dev_ptr, &(ihandle->psize), &(ihandle->ipc_handle))) {
    DevLogPrintfError("IPC memory creation failed for memory: 0x%x", dev_ptr);
-    HIP_RETURN(hipErrorInvalidValue);
+    HIP_RETURN(hipErrorInvalidDevicePointer);
  }

  HIP_RETURN(hipSuccess);
@@ -1928,15 +1931,11 @@ hipError_t hipIpcOpenMemHandle(void** dev_ptr, hipIpcMemHandle_t handle, unsigne
  device = hip::getCurrentDevice()->devices()[0];
  ihandle = reinterpret_cast<ihipIpcMemHandle_t *>(&handle);

-  amd_mem_obj = device->IpcAttach(&(ihandle->ipc_handle), ihandle->psize, flags, dev_ptr);
-  if (amd_mem_obj == nullptr) {
+  if(!device->IpcAttach(&(ihandle->ipc_handle), ihandle->psize, flags, dev_ptr)) {
    DevLogPrintfError("cannot attach ipc_handle: with ipc_size: %u flags: %u", ihandle->psize, flags);
    HIP_RETURN(hipErrorInvalidDevicePointer);
  }

-  /* Add the memory to the MemObjMap */
-  amd::MemObjMap::AddMemObj(*dev_ptr, amd_mem_obj);
-
  HIP_RETURN(hipSuccess);
 }

@@ -1953,28 +1952,14 @@ hipError_t hipIpcCloseMemHandle(void* dev_ptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }

-  /* Get the amd::Memory object */
-  amd_mem_obj = getMemoryObject(dev_ptr, offset);
-  if (amd_mem_obj == nullptr) {
-    HIP_RETURN(hipErrorInvalidDevicePointer);
-  }
-
  /* Call IPC Detach from Device class */
  device = hip::getCurrentDevice()->devices()[0];
  if (device == nullptr) {
    HIP_RETURN(hipErrorNoDevice);
  }

-  /* Remove the memory from MemObjMap */
-  if (amd_mem_obj->getSvmPtr() != nullptr) {
-    amd::MemObjMap::RemoveMemObj(amd_mem_obj->getSvmPtr());
-  } else {
-    DevLogPrintfError("Does not have SVM or Host Mem for 0x%x, crash here!", dev_ptr);
-    guarantee(false);
-  }
-
  /* detach the memory */
-  if (!device->IpcDetach(*amd_mem_obj)){
+  if (!device->IpcDetach(dev_ptr)){
     HIP_RETURN(hipErrorInvalidHandle);
  }

@@ -81,34 +81,6 @@ extern "C" hip::FatBinaryInfoType* __hipRegisterFatBinary(const void* data)
  return PlatformState::instance().addFatBinary(fbwrapper->binary);
 }

-bool ihipGetFuncAttributes(const char* func_name, amd::Program* program, hipFuncAttributes* func_attr) {
-  device::Program* dev_program
-    = program->getDeviceProgram(*hip::getCurrentDevice()->devices()[0]);
-
-  const auto it = dev_program->kernels().find(std::string(func_name));
-  if (it == dev_program->kernels().cend()) {
-    DevLogPrintfError("Could not find the function %s \n", func_name);
-    return false;
-  }
-
-  const device::Kernel* kernel = it->second;
-  const device::Kernel::WorkGroupInfo* wginfo = kernel->workGroupInfo();
-  func_attr->sharedSizeBytes = static_cast<int>(wginfo->localMemSize_);
-  func_attr->binaryVersion = static_cast<int>(kernel->signature().version());
-  func_attr->cacheModeCA = 0;
-  func_attr->constSizeBytes = 0;
-  func_attr->localSizeBytes = wginfo->privateMemSize_;
-  func_attr->maxDynamicSharedSizeBytes = static_cast<int>(wginfo->availableLDSSize_
-                                                          - wginfo->localMemSize_);
-
-  func_attr->maxThreadsPerBlock = static_cast<int>(wginfo->size_);
-  func_attr->numRegs = static_cast<int>(wginfo->usedVGPRs_);
-  func_attr->preferredShmemCarveout = 0;
-  func_attr->ptxVersion = 30;
-
-  return true;
-}
-
 bool PlatformState::getShadowVarInfo(std::string var_name, hipModule_t hmod,
                                     void** var_addr, size_t* var_size) {

@@ -130,13 +102,6 @@ bool CL_CALLBACK getSvarInfo(cl_program program, std::string var_name, void** va
                                                    var_addr, var_size);
 }

-namespace {
-const int HIP_ENABLE_DEFERRED_LOADING{[] () {
-  char *var = getenv("HIP_ENABLE_DEFERRED_LOADING");
-  return var ? atoi(var) : 1;
-}()};
-} /* namespace */
-
 extern "C" void __hipRegisterFunction(
  hip::FatBinaryInfoType* modules,
  const void*  hostFunction,
@@ -148,9 +113,15 @@ extern "C" void __hipRegisterFunction(
  dim3*        blockDim,
  dim3*        gridDim,
  int*         wSize) {
+  static int enable_deferred_loading { []() {
+    char *var = getenv("HIP_ENABLE_DEFERRED_LOADING");
+    return var ? atoi(var) : 1;
+  }() };
+
  hip::Function* func = new hip::Function(std::string(deviceName), modules);
  PlatformState::instance().registerStatFunction(hostFunction, func);
-  if (!HIP_ENABLE_DEFERRED_LOADING) {
+
+  if (!enable_deferred_loading) {
    HIP_INIT();
    hipFunction_t hfunc = nullptr;
    hipError_t hip_error = hipSuccess;
@@ -491,10 +491,10 @@ def generate_prof_header(f, api_map, opts_map):
      fld_name = arg_tuple[1]
      var_name = 'data->args.' + name + '.' + fld_name
      if arg_type == "char*":
-        f.write('      ' + var_name + ' = (' + var_name + ') ? strdup(' + var_name + ') : NULL; \\\n')
+        f.write('      ' + var_name + ' = (' + var_name + ') ? strdup(' + var_name + ') : NULL;\n')
      else:
        if ptr_type != '':
-          f.write('      if (' + var_name + ') ' + var_name + '__val = *(' + var_name + '); \\\n')
+          f.write('      if (' + var_name + ') ' + var_name + '__val = *(' + var_name + ');\n')
    f.write('      break;\n')
  f.write('    default: break;\n')
  f.write('  };\n')
@@ -161,7 +161,7 @@ void iHipWaitActiveStreams(amd::HostQueue* blocking_queue, bool wait_null_stream

  // Check if we have to wait anything
  if (eventWaitList.size() > 0) {
-    amd::Command* command = new amd::Marker(*blocking_queue, false, eventWaitList);
+    amd::Command* command = new amd::Marker(*blocking_queue, kMarkerDisableFlush, eventWaitList);
    if (command != nullptr) {
      command->enqueue();
      command->release();
@@ -322,7 +322,7 @@ hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback
  amd::Command* command = hostQueue->getLastQueuedCommand(true);
  if (command == nullptr) {
    amd::Command::EventWaitList eventWaitList;
-    command = new amd::Marker(*hostQueue, false, eventWaitList);
+    command = new amd::Marker(*hostQueue, kMarkerDisableFlush, eventWaitList);
    command->enqueue();
  }
  amd::Event& event = command->event();
@@ -247,20 +247,20 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject,
    const cl_channel_type channelType = hip::getCLChannelType(hip::getArrayFormat(pResDesc->res.linear.desc), pTexDesc->readMode);
    const amd::Image::Format imageFormat({channelOrder, channelType});
    const cl_mem_object_type imageType = hip::getCLMemObjectType(pResDesc->resType);
-    size_t offset = 0;
+    const size_t imageSizeInBytes = pResDesc->res.linear.sizeInBytes;
+    amd::Memory* buffer = getMemoryObjectWithOffset(pResDesc->res.linear.devPtr, imageSizeInBytes);
    image = ihipImageCreate(channelOrder,
                            channelType,
                            imageType,
-                            (pResDesc->res.linear.sizeInBytes / imageFormat.getElementSize()), /* imageWidth */
+                            imageSizeInBytes / imageFormat.getElementSize(), /* imageWidth */
                            0, /* imageHeight */
                            0, /* imageDepth */
                            0, /* imageArraySize */
                            0, /* imageRowPitch */
                            0, /* imageSlicePitch */
                            0, /* numMipLevels */
-                            getMemoryObject(pResDesc->res.linear.devPtr, offset));
-    // TODO take care of non-zero offset.
-    assert(offset == 0);
+                            buffer);
+    buffer->release();
    if (image == nullptr) {
      return hipErrorInvalidValue;
    }
@@ -270,7 +270,8 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject,
    const cl_channel_order channelOrder = hip::getCLChannelOrder(hip::getNumChannels(pResDesc->res.pitch2D.desc), pTexDesc->sRGB);
    const cl_channel_type channelType = hip::getCLChannelType(hip::getArrayFormat(pResDesc->res.pitch2D.desc), pTexDesc->readMode);
    const cl_mem_object_type imageType = hip::getCLMemObjectType(pResDesc->resType);
-    size_t offset = 0;
+    const size_t imageSizeInBytes = pResDesc->res.pitch2D.pitchInBytes * pResDesc->res.pitch2D.height;
+    amd::Memory* buffer = getMemoryObjectWithOffset(pResDesc->res.pitch2D.devPtr, imageSizeInBytes);
    image = ihipImageCreate(channelOrder,
                            channelType,
                            imageType,
@@ -281,9 +282,8 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject,
                            pResDesc->res.pitch2D.pitchInBytes, /* imageRowPitch */
                            0, /* imageSlicePitch */
                            0, /* numMipLevels */
-                            getMemoryObject(pResDesc->res.pitch2D.devPtr, offset));
-    // TODO take care of non-zero offset.
-    assert(offset == 0);
+                            buffer);
+    buffer->release();
    if (image == nullptr) {
      return hipErrorInvalidValue;
    }
@@ -147,6 +147,7 @@ void printDeviceProp(int deviceId) {
    cout << setw(w1) << "maxTexture3D.height: " << props.maxTexture3D[1] << endl;
    cout << setw(w1) << "maxTexture3D.depth: " << props.maxTexture3D[2] << endl;
    cout << setw(w1) << "isLargeBar: " << props.isLargeBar << endl;
+    cout << setw(w1) << "asicRevision: " << props.asicRevision << endl;

    int deviceCnt;
    hipGetDeviceCount(&deviceCnt);
@@ -0,0 +1,188 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "test_common.h"
+#include <iostream>
+#include <time.h>
+#include <cstdio>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <numaif.h>
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <array>
+#include "hip/hip_runtime.h"
+/* HIT_START
+ * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * TEST: %t
+ * HIT_END
+ */
+
+// To run it correctly, we must not export HIP_VISIBLE_DEVICES
+// Number of pages allocated and queried per buffer in test().
+#define NUM_PAGES 4
+// Host buffer obtained from hipHostMalloc() in test().
+char *h = nullptr;
+// Device pointer for `h`, filled by hipHostGetDevicePointer() when hipHostMallocMapped is set.
+char *d_h = nullptr;
+// Host buffer obtained from posix_memalign() and passed to hipHostRegister() in test().
+char *m = nullptr;
+// Device pointer for `m`, filled by hipHostGetDevicePointer() in test().
+char *d_m = nullptr;
+// System page size in bytes; set from getpagesize() in main() before any test runs.
+int page_size = 0;
+// NUMA memory policies exercised by runTest(); modeStr[] holds the matching printable names
+// and the two arrays are indexed together.
+const int mode[] = { MPOL_DEFAULT, MPOL_BIND, MPOL_PREFERRED, MPOL_INTERLEAVE };
+const char* modeStr[] = { "MPOL_DEFAULT", "MPOL_BIND", "MPOL_PREFERRED", "MPOL_INTERLEAVE" };
+
+// Runs `cmd` through popen() and returns everything it writes to stdout.
+// Returns an empty string when the pipe cannot be opened.
+std::string exeCommand(const char* cmd) {
+  std::string output;
+  // unique_ptr closes the pipe with pclose() on every exit path.
+  std::unique_ptr<FILE, decltype(&pclose)> stream(popen(cmd, "r"), pclose);
+  if (stream) {
+    std::array<char, 128> chunk;
+    // fgets() NUL-terminates each chunk, so it can be appended as a C string.
+    while (fgets(chunk.data(), chunk.size(), stream.get()) != nullptr) {
+      output += chunk.data();
+    }
+  }
+  return output;
+}
+
+// Returns the number of distinct "physical id" entries listed in /proc/cpuinfo, as counted by a
+// shell pipeline. std::atoi() yields 0 when the pipeline produces no parsable number.
+int getCpuAgentCount() {
+  const std::string countText =
+      exeCommand("cat /proc/cpuinfo | grep \"physical id\" | sort | uniq | wc -l");
+  return std::atoi(countText.c_str());
+}
+
+// Exercises one NUMA placement scenario.
+//   cpuId           - NUMA node to apply the memory policy to; negative skips set_mempolicy().
+//   gpuId           - device passed to hipSetDevice(); negative keeps the current device.
+//   numaMode        - one of the MPOL_* policies passed to set_mempolicy().
+//   hostMallocflags - flags forwarded to hipHostMalloc().
+// Prints the node of every page of a posix_memalign'ed + hipHostRegister'ed buffer (not
+// verified) and of a hipHostMalloc'ed buffer. For MPOL_BIND / MPOL_PREFERRED every page of the
+// hipHostMalloc'ed buffer must be on node `cpuId`.
+// Returns true when the scenario passes, false on a setup failure or a misplaced page.
+bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) {
+  void *pages[NUM_PAGES];
+  int status[NUM_PAGES];
+  int ret_code;
+  const size_t bytes = static_cast<size_t>(page_size) * NUM_PAGES;
+
+  // Points pages[i] at the i-th page of `base`, resets every status entry and queries the node
+  // of each page (move_pages() is called with a NULL `nodes` array, so nothing is migrated).
+  auto queryPageNodes = [&](char* base) {
+    for (int i = 0; i < NUM_PAGES; i++) {
+      pages[i] = base + static_cast<size_t>(i) * page_size;
+      status[i] = -1;
+    }
+    return static_cast<int>(move_pages(0, NUM_PAGES, pages, NULL, status, 0));
+  };
+
+  printf("set cpu %d, gpu %d, numaMode %d, hostMallocflags 0x%x\n", cpuId,
+         gpuId, numaMode, hostMallocflags);
+
+  if (cpuId >= 0) {
+    // Build the mask in unsigned long; a plain `1 << cpuId` would be evaluated as int.
+    unsigned long nodeMask = 1UL << cpuId;
+    unsigned long maxNode = sizeof(nodeMask) * 8;
+    if (set_mempolicy(numaMode, numaMode == MPOL_DEFAULT ? NULL : &nodeMask,
+                      numaMode == MPOL_DEFAULT ? 0 : maxNode) == -1) {
+      printf("set_mempolicy() failed with err %d\n", errno);
+      return false;
+    }
+  }
+
+  if (gpuId >= 0) {
+    HIPCHECK(hipSetDevice(gpuId));
+  }
+
+  m = nullptr;
+  d_m = nullptr;
+  if (posix_memalign((void**) &m, page_size, bytes) != 0) {
+    printf("posix_memalign() failed\n");
+    return false;
+  }
+  // Registration stays best-effort (its placement is only printed), but remember whether it
+  // succeeded so the buffer can be unregistered before it is freed.
+  const bool registered = (hipHostRegister(m, bytes, hipHostRegisterMapped) == hipSuccess);
+  if (registered) {
+    hipHostGetDevicePointer((void**) &d_m, m, 0);
+  }
+
+  ret_code = queryPageNodes(m);
+  printf("Memory (malloc) ret %d at %p (dev %p) is at node: ", ret_code, m, d_m);
+  for (int i = 0; i < NUM_PAGES; i++) {
+    printf("%d ", status[i]); // Don't verify as it's out of our control
+  }
+  printf("\n");
+
+  HIPCHECK(hipHostMalloc((void**) &h, bytes, hostMallocflags));
+  ret_code = queryPageNodes(h);
+  d_h = nullptr;
+  if (hostMallocflags & hipHostMallocMapped) {
+    hipHostGetDevicePointer((void**) &d_h, h, 0);
+    printf("Memory (hipHostMalloc) ret %d at %p (dev %p) is at node: ",
+           ret_code, h, d_h);
+  } else {
+    printf("Memory (hipHostMalloc) ret %d at %p is at node: ", ret_code, h);
+  }
+  for (int i = 0; i < NUM_PAGES; i++) {
+    printf("%d ", status[i]);  // Always print it even if it's wrong. Verify later
+  }
+  printf("\n");
+
+  HIPCHECK(hipHostFree((void* )h));
+  if (registered) {
+    hipHostUnregister(m);
+  }
+  free(m);
+
+  // status[] still holds the result of the hipHostMalloc query taken above.
+  if (cpuId >= 0 && (numaMode == MPOL_BIND || numaMode == MPOL_PREFERRED)) {
+    for (int i = 0; i < NUM_PAGES; i++) {
+      if (status[i] != cpuId) {  // Now verify
+        printf("Failed at %d", i);
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+// Runs test() for every memory policy in mode[] against every (cpu, gpu) pair.
+//   cpuCount        - number of CPU nodes to iterate over (ids 0 .. cpuCount-1).
+//   gpuCount        - number of GPUs to iterate over (ids 0 .. gpuCount-1).
+//   hostMallocflags - hipHostMalloc flags forwarded unchanged to every test() call.
+//   str             - banner printed once before the suite starts.
+// Returns false as soon as one combination fails, true when all of them pass.
+bool runTest(const int &cpuCount, const int &gpuCount,
+             const unsigned int &hostMallocflags, const std::string &str) {
+  printf("%s\n", str.c_str());
+
+  const size_t modeCount = sizeof(mode) / sizeof(mode[0]);
+  for (size_t modeIdx = 0; modeIdx < modeCount; modeIdx++) {
+    printf("Testing %s\n", modeStr[modeIdx]);
+
+    for (int i = 0; i < cpuCount; i++) {
+      for (int j = 0; j < gpuCount; j++) {
+        // Use the caller's flags so each suite really exercises the allocation kind it names.
+        if (!test(i, j, mode[modeIdx], hostMallocflags)) {
+          return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+// Entry point: queries the GPU and CPU counts and the page size, then runs the NUMA placement
+// suite once with default and once with mapped hipHostMalloc flags (both with
+// hipHostMallocNumaUser). Reports the outcome through failed() / passed().
+int main(int argc, char *argv[]) {
+  int numGpus = 0;
+  HIPCHECK(hipGetDeviceCount(&numGpus));
+  const int numCpus = getCpuAgentCount();
+  page_size = getpagesize();
+  printf("Cpu count %d, Gpu count %d, Page size %d\n", numCpus, numGpus,
+         page_size);
+
+  const bool countsValid = (numCpus >= 0) && (numGpus >= 0);
+  if (!countsValid) {
+    failed("Bad device count\n");
+    return -1;
+  }
+
+  const bool defaultSuiteOk =
+      runTest(numCpus, numGpus, hipHostMallocDefault | hipHostMallocNumaUser,
+              "Testing hipHostMallocDefault | hipHostMallocNumaUser........................");
+  if (!defaultSuiteOk) {
+    failed("Failed testing hipHostMallocDefault | hipHostMallocNumaUser\n");
+    return -1;
+  }
+
+  const bool mappedSuiteOk =
+      runTest(numCpus, numGpus, hipHostMallocMapped | hipHostMallocNumaUser,
+              "Testing hipHostMallocMapped | hipHostMallocNumaUser.........................");
+  if (!mappedSuiteOk) {
+    failed("Failed testing hipHostMallocMapped | hipHostMallocNumaUser\n");
+    return -1;
+  }
+
+  passed();
+}
@@ -0,0 +1,340 @@
+/*
+Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp NVCC_OPTIONS -std=c++11
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+#include "hip/hip_fp16.h"
+
+// Prints "<test_name>  PASSED!" wrapped in the KGRN/KNRM colour strings.
+// The expansion already ends with ';', so invoking it as a statement with a
+// trailing ';' yields an extra empty statement.
+// NOTE(review): no use of this macro is visible in this file — confirm it is
+// still needed.
+#define test_passed(test_name) \
+  printf("%s %s  PASSED!%s\n", KGRN, #test_name, KNRM);
+
+// Selects which half2 comparison intrinsic the __half2Compare kernel runs.
+enum half2Op {
+  HALF2_OP_HEQ2 = 0,  // __heq2
+  HALF2_OP_HNE2,      // __hne2
+  HALF2_OP_HLE2,      // __hle2
+  HALF2_OP_HGE2,      // __hge2
+  HALF2_OP_HLT2,      // __hlt2
+  HALF2_OP_HGT2,      // __hgt2
+  HALF2_OP_MAX        // number of operations; used as a loop bound in main()
+};
+
+// Selects how the __half2Compare kernel builds the left-hand operand.
+enum half2Test {
+  HALF2_TEST_FUNCTION = 0,  // compare (a + 1) against a fixed constant
+  HALF2_TEST_NAN,           // compare (a / 0) against 0
+  HALF2_TEST_MAX            // number of test types
+};
+
+// Kernel for the half2 comparison functions
+
+// Runs one half2 comparison intrinsic and stores its outcome in result_D.
+//   result_D  output buffer holding at least n floats
+//   a         operand fed into the comparison
+//   n         number of result elements to fill
+//   half2Op   comparison to run (a half2Op enumerator)
+//   testType  HALF2_TEST_FUNCTION compares (a + 1) with a fixed constant;
+//             any other value compares (a / 0) with 0
+// Each written element is the high half of the comparison result converted
+// to float. HALF2_OP_HNE2 always takes the (a + 1) path, and an unrecognised
+// half2Op leaves result_D untouched.
+__global__
+void __half2Compare(float* result_D, __half2 a, int n, int half2Op,
+            int testType) {
+  const size_t firstIdx = (blockIdx.x * blockDim.x + threadIdx.x);
+  const size_t step = blockDim.x * gridDim.x;
+  const bool functional = (testType == HALF2_TEST_FUNCTION);
+
+  for (size_t idx = firstIdx; idx < n; idx += step) {
+    if (half2Op == HALF2_OP_HEQ2) {
+      const __half2 cmp = functional
+          ? __heq2(__hadd2(a, __half2{1, 1}), __half2{2, 2})
+          : __heq2(__h2div(a, __half2{0, 0}), __half2{0, 0});
+      result_D[idx] = __high2float(cmp);
+    } else if (half2Op == HALF2_OP_HNE2) {
+      // No division variant exists for "not equal".
+      const __half2 cmp = __hne2(__hadd2(a, __half2{1, 1}), __half2{2, 2});
+      result_D[idx] = __high2float(cmp);
+    } else if (half2Op == HALF2_OP_HLE2) {
+      const __half2 cmp = functional
+          ? __hle2(__hadd2(a, __half2{1, 1}), __half2{3, 3})
+          : __hle2(__h2div(a, __half2{0, 0}), __half2{0, 0});
+      result_D[idx] = __high2float(cmp);
+    } else if (half2Op == HALF2_OP_HGE2) {
+      const __half2 cmp = functional
+          ? __hge2(__hadd2(a, __half2{1, 1}), __half2{2, 2})
+          : __hge2(__h2div(a, __half2{0, 0}), __half2{0, 0});
+      result_D[idx] = __high2float(cmp);
+    } else if (half2Op == HALF2_OP_HLT2) {
+      const __half2 cmp = functional
+          ? __hlt2(__hadd2(a, __half2{1, 1}), __half2{3, 3})
+          : __hlt2(__h2div(a, __half2{0, 0}), __half2{0, 0});
+      result_D[idx] = __high2float(cmp);
+    } else if (half2Op == HALF2_OP_HGT2) {
+      const __half2 cmp = functional
+          ? __hgt2(__hadd2(a, __half2{1, 1}), __half2{3, 3})
+          : __hgt2(__h2div(a, __half2{0, 0}), __half2{0, 0});
+      result_D[idx] = __high2float(cmp);
+    }
+  }
+}
+
+// Returns true when at least one of the first `size` entries of result_H
+// differs from expectedValue, and false otherwise (including size <= 0).
+static bool isFailed(float expectedValue, float *result_H, int size) {
+  bool mismatch = false;
+  int pos = 0;
+  // Stop scanning as soon as the first differing element is seen.
+  while (!mismatch && pos < size) {
+    mismatch = (result_H[pos] != expectedValue);
+    ++pos;
+  }
+  return mismatch;
+}
+
+// One functional scenario for the __half2Compare kernel.
+struct Half2Case {
+  __half2 arg;          // operand passed to the kernel as `a`
+  int op;               // half2Op enumerator selecting the comparison
+  float expected;       // value every result element must equal
+  const char* failMsg;  // text printed verbatim when the check fails
+};
+
+// Launches __half2Compare with a single thread, waits for it, copies the n
+// results back into result_H and reports whether all of them equal
+// `expected`. Runtime errors from the synchronize or copy abort the test
+// through HIPCHECK.
+static bool runHalf2Case(float* result_D, float* result_H, int n, __half2 a,
+                         int op, int testType, float expected) {
+  hipLaunchKernelGGL(__half2Compare, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0,
+                     result_D, a, n, op, testType);
+  HIPCHECK(hipDeviceSynchronize());
+  HIPCHECK(hipMemcpy(result_H, result_D, n*sizeof(float),
+                     hipMemcpyDeviceToHost));
+  return !isFailed(expected, result_H, n);
+}
+
+// Runs every functional scenario and then the NaN scenario for each
+// comparison except HNE2, and reports the overall verdict.
+int main() {
+  const int n = 64;
+  float* result_H = static_cast<float*>(malloc(n*sizeof(float)));
+  if (result_H == nullptr) {
+    failed("Host allocation of the result buffer failed");
+    return -1;
+  }
+  float* result_D = nullptr;
+  HIPCHECK(hipMalloc(&result_D, n*sizeof(float)));
+
+  // The kernel compares (arg + 1) with a per-operation constant, so `arg`
+  // is chosen to land below, on, or above that constant.
+  const Half2Case functionalCases[] = {
+    {__half2{1, 1}, HALF2_OP_HEQ2, 1.0f,
+     "heq2: failure when arguments are equal\n"},
+    {__half2{2, 2}, HALF2_OP_HEQ2, 0.0f,
+     "heq2: failure when arguments are not equal\n"},
+    {__half2{2, 2}, HALF2_OP_HNE2, 1.0f,
+     "hne2: failure when arguments are not equal\n"},
+    {__half2{1, 1}, HALF2_OP_HNE2, 0.0f,
+     "hne2: failure when arguments are equal\n"},
+    {__half2{1, 1}, HALF2_OP_HLE2, 1.0f,
+     "hle2: failure when argument is less than equal\n"},
+    {__half2{2, 2}, HALF2_OP_HLE2, 1.0f,
+     "hle2: failure when argument is equal\n"},
+    {__half2{3, 3}, HALF2_OP_HLE2, 0.0f,
+     "hle2: failure when argument is greater\n"},
+    {__half2{2, 2}, HALF2_OP_HGE2, 1.0f,
+     "hge2: failure when argument is greater\n"},
+    {__half2{1, 1}, HALF2_OP_HGE2, 1.0f,
+     "hge2: failure when argument is equal\n"},
+    {__half2{0, 0}, HALF2_OP_HGE2, 0.0f,
+     "hge2: failure when argument is less\n"},
+    {__half2{1, 1}, HALF2_OP_HLT2, 1.0f,
+     "hlt2: failure when argument is less\n"},
+    {__half2{2, 2}, HALF2_OP_HLT2, 0.0f,
+     "hlt2: failure when argument is equal\n"},
+    {__half2{3, 3}, HALF2_OP_HLT2, 0.0f,
+     "hlt2: failure when argument is greater\n"},
+    {__half2{3, 3}, HALF2_OP_HGT2, 1.0f,
+     "hgt2: failure when argument is greater\n"},
+    {__half2{2, 2}, HALF2_OP_HGT2, 0.0f,
+     "hgt2: failure when argument is equal\n"},
+    {__half2{1, 1}, HALF2_OP_HGT2, 0.0f,
+     "hgt2: failure when argument is less\n"},
+  };
+
+  bool bFunctionalTestFailed = false;
+  for (const Half2Case& c : functionalCases) {
+    if (!runHalf2Case(result_D, result_H, n, c.arg, c.op,
+                      HALF2_TEST_FUNCTION, c.expected)) {
+      printf("%s", c.failMsg);
+      bFunctionalTestFailed = true;
+    }
+  }
+
+  // NaN scenario: the kernel divides 0 by 0 and compares the quotient with
+  // 0; every comparison is expected to yield 0.
+  bool bNanTestFailed = false;
+  for (int nanFunctionTest = HALF2_OP_HEQ2; nanFunctionTest < HALF2_OP_MAX;
+       nanFunctionTest++) {
+    // HNE2 will not have a NaN test
+    if (nanFunctionTest == HALF2_OP_HNE2) {
+      continue;
+    }
+    if (!runHalf2Case(result_D, result_H, n, __half2{0, 0}, nanFunctionTest,
+                      HALF2_TEST_NAN, 0.0f)) {
+      printf("NaN test failed for half function: %d\n", nanFunctionTest);
+      bNanTestFailed = true;
+    }
+  }
+
+  HIPCHECK(hipFree(result_D));
+  free(result_H);
+
+  if (bFunctionalTestFailed || bNanTestFailed) {
+    failed("Some Half2 tests failed");
+    return -1;
+  }
+  passed();
+
+  return 0;
+}
+
@@ -186,6 +186,14 @@ void test_fp16() {
  CHECK_SIMPLE([]__device__(){ return min<__fp16>(1.0f, 2.0f); }, 1.0f);
 }

+// Checks the integer-exponent power functions by raising 2 to the power 2:
+// powif with a float base, powi with a double base, and pow with float,
+// double and half-precision (f16 literal) bases.
+// NOTE(review): CHECK_SIMPLE is defined outside this hunk; presumably it runs
+// the lambda on the device and compares the result with the second
+// argument — confirm.
+void test_pown() {
+  CHECK_SIMPLE([]__device__(){ return powif(2.0f, 2); }, 4.0f);
+  CHECK_SIMPLE([]__device__(){ return powi(2.0, 2); }, 4.0);
+  CHECK_SIMPLE([]__device__(){ return pow(2.0f, 2); }, 4.0f);
+  CHECK_SIMPLE([]__device__(){ return pow(2.0, 2); }, 4.0);
+  CHECK_SIMPLE([]__device__(){ return pow(2.0f16, 2); }, 4.0f16);
+}
+}
+
 int main(int argc, char* argv[]) {
    HipTest::parseStandardArguments(argc, argv, true);

@@ -195,5 +203,7 @@ int main(int argc, char* argv[]) {
    
    test_fp16();

+    test_pown();
+
    passed();
 }
@@ -0,0 +1,155 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <hip/hip_runtime.h>
+#include "test_common.h"
+
+#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
+
+// Exercises __syncthreads_and(). Each thread writes one element per output
+// buffer: block 0 fills indices [0, blockDim.x) and any other block fills
+// [blockDim.x, 2 * blockDim.x).
+static __global__
+void kernel_syncthreads_and(int *syncTestD,
+                            int *allThreadsZeroD,
+                            int *allThreadsOneD,
+                            int *oneThreadZeroD,
+                            int *allThreadsMinusOneD)
+{
+  const int tid = threadIdx.x;
+  int slot = tid;
+  if (blockIdx.x != 0)
+    slot += blockDim.x;
+
+  // Barrier check first, independent of the predicate value: threads 0 and 1
+  // each write one shared-memory word, and after the call every thread reads
+  // both words and stores their product for verification.
+  __shared__ int sm[2];
+  if (tid == 0) {
+    sm[0] = 10;
+  } else if (tid == 1) {
+    sm[1] = 20;
+  }
+  __syncthreads_and(10);
+  syncTestD[slot] = sm[0] * sm[1];
+
+  // Every thread passes 0: result should be 0.
+  allThreadsZeroD[slot] = __syncthreads_and(0);
+
+  // Every thread passes 1: result should be 1.
+  allThreadsOneD[slot] = __syncthreads_and(1);
+
+  // Thread 0 passes 0 and all others pass 1: result should be 0.
+  const int allButFirst = (tid == 0) ? 0 : 1;
+  oneThreadZeroD[slot] = __syncthreads_and(allButFirst);
+
+  // Every thread passes -1: result should be 1.
+  allThreadsMinusOneD[slot] = __syncthreads_and(-1);
+}
+
+// Launches kernel_syncthreads_and with two blocks of `blockSize` threads and
+// verifies every output element of every result buffer.
+static void test_syncthreads_and(int blockSize)
+{
+  enum { kNumBuffers = 5 };
+  // Expected value per buffer, in the kernel's parameter order:
+  // syncTest, allThreadsZero, allThreadsOne, oneThreadZero, allThreadsMinusOne
+  const int expected[kNumBuffers] = {200, 0, 1, 0, 1};
+
+  const int totalThreads = 2 * blockSize;
+  int nBytes = sizeof(int) * totalThreads;
+  int *devBuf[kNumBuffers];
+  int *hostBuf[kNumBuffers];
+
+  // Allocate device memory
+  for (int b = 0; b < kNumBuffers; ++b) {
+    ASSERT_EQUAL(hipMalloc((void**)&devBuf[b], nBytes), hipSuccess);
+  }
+
+  // Allocate host memory
+  for (int b = 0; b < kNumBuffers; ++b) {
+    ASSERT_EQUAL(hipHostMalloc((void**)&hostBuf[b], nBytes), hipSuccess);
+  }
+
+  // Launch Kernel
+  hipLaunchKernelGGL(kernel_syncthreads_and,
+                     2,
+                     blockSize,
+                     0,
+                     0,
+                     devBuf[0],
+                     devBuf[1],
+                     devBuf[2],
+                     devBuf[3],
+                     devBuf[4]);
+  // Check the launch status so a launch error is reported as such rather
+  // than as a value mismatch further down.
+  ASSERT_EQUAL(hipGetLastError(), hipSuccess);
+
+  // Copy result from device to host
+  for (int b = 0; b < kNumBuffers; ++b) {
+    ASSERT_EQUAL(hipMemcpy(hostBuf[b], devBuf[b], nBytes, hipMemcpyDeviceToHost),
+                 hipSuccess);
+  }
+
+  // Validate results for both blocks together
+  for (int i = 0; i < totalThreads; ++i) {
+    for (int b = 0; b < kNumBuffers; ++b) {
+      ASSERT_EQUAL(hostBuf[b][i], expected[b]);
+    }
+  }
+
+  // Free device memory
+  for (int b = 0; b < kNumBuffers; ++b) {
+    ASSERT_EQUAL(hipFree(devBuf[b]), hipSuccess);
+  }
+
+  // Free host memory
+  for (int b = 0; b < kNumBuffers; ++b) {
+    ASSERT_EQUAL(hipHostFree(hostBuf[b]), hipSuccess);
+  }
+}
+
+// Runs the __syncthreads_and test once per block size; the list mixes
+// powers of two with sizes that are not powers of two.
+int main()
+{
+  const int blockSizes[] = {10, 40, 70, 130, 240, 723, 32, 64, 128, 256, 512, 1024};
+  const size_t numSizes = sizeof(blockSizes) / sizeof(blockSizes[0]);
+  for (size_t idx = 0; idx != numSizes; ++idx) {
+    test_syncthreads_and(blockSizes[idx]);
+  }
+  passed();
+}
@@ -0,0 +1,169 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <hip/hip_runtime.h>
+#include "test_common.h"
+
+#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
+
+// Exercises __syncthreads_count(). Each thread writes one element per output
+// buffer: block 0 fills indices [0, blockDim.x) and any other block fills
+// [blockDim.x, 2 * blockDim.x).
+static __global__
+void kernel_syncthreads_count(int *syncTestD,
+                              int *allThreadsZeroD,
+                              int *allThreadsOneD,
+                              int *oddThreadsOneD,
+                              int *allThreadsMinusOneD,
+                              int *allThreadsIdD)
+{
+  const int tid = threadIdx.x;
+  int slot = tid;
+  if (blockIdx.x != 0)
+    slot += blockDim.x;
+
+  // Barrier check first, independent of the predicate value: threads 0 and 1
+  // each write one shared-memory word, and after the call every thread reads
+  // both words and stores their sum for verification.
+  __shared__ int sm[2];
+  if (tid == 0) {
+    sm[0] = 10;
+  } else if (tid == 1) {
+    sm[1] = 20;
+  }
+  __syncthreads_count(10);
+  syncTestD[slot] = sm[0] + sm[1];
+
+  // Every thread passes 0: result should be 0.
+  allThreadsZeroD[slot] = __syncthreads_count(0);
+
+  // Every thread passes 1: result should be blockSize.
+  allThreadsOneD[slot] = __syncthreads_count(1);
+
+  // Odd-numbered threads pass 1 and even-numbered threads pass 0:
+  // result should be blockSize / 2.
+  const int oddOnly = threadIdx.x % 2;
+  oddThreadsOneD[slot] = __syncthreads_count(oddOnly);
+
+  // Every thread passes -1: result should be blockSize.
+  allThreadsMinusOneD[slot] = __syncthreads_count(-1);
+
+  // Each thread passes its own ID, so only thread 0 passes 0:
+  // result should be blockSize - 1.
+  allThreadsIdD[slot] = __syncthreads_count(tid);
+}
+
+// Launches kernel_syncthreads_count with two blocks of `blockSize` threads
+// and verifies every output element of every result buffer.
+void test_syncthreads_count(int blockSize)
+{
+  enum { kNumBuffers = 6 };
+  // Expected value per buffer, in the kernel's parameter order:
+  // syncTest, allThreadsZero, allThreadsOne, oddThreadsOne,
+  // allThreadsMinusOne, allThreadsId
+  const int expected[kNumBuffers] = {30, 0, blockSize, blockSize / 2,
+                                     blockSize, (blockSize-1)};
+
+  const int totalThreads = 2 * blockSize;
+  int nBytes = sizeof(int) * totalThreads;
+  int *devBuf[kNumBuffers];
+  int *hostBuf[kNumBuffers];
+
+  // Allocate device memory
+  for (int b = 0; b < kNumBuffers; ++b) {
+    ASSERT_EQUAL(hipMalloc((void**)&devBuf[b], nBytes), hipSuccess);
+  }
+
+  // Allocate host memory
+  for (int b = 0; b < kNumBuffers; ++b) {
+    ASSERT_EQUAL(hipHostMalloc((void**)&hostBuf[b], nBytes), hipSuccess);
+  }
+
+  // Launch Kernel
+  hipLaunchKernelGGL(kernel_syncthreads_count,
+                     2,
+                     blockSize,
+                     0,
+                     0,
+                     devBuf[0],
+                     devBuf[1],
+                     devBuf[2],
+                     devBuf[3],
+                     devBuf[4],
+                     devBuf[5]);
+  // Check the launch status so a launch error is reported as such rather
+  // than as a value mismatch further down.
+  ASSERT_EQUAL(hipGetLastError(), hipSuccess);
+
+  // Copy result from device to host
+  for (int b = 0; b < kNumBuffers; ++b) {
+    ASSERT_EQUAL(hipMemcpy(hostBuf[b], devBuf[b], nBytes, hipMemcpyDeviceToHost),
+                 hipSuccess);
+  }
+
+  // Validate results for both the blocks together
+  for (int i = 0; i < totalThreads; ++i) {
+    for (int b = 0; b < kNumBuffers; ++b) {
+      ASSERT_EQUAL(hostBuf[b][i], expected[b]);
+    }
+  }
+
+  // Free device memory
+  for (int b = 0; b < kNumBuffers; ++b) {
+    ASSERT_EQUAL(hipFree(devBuf[b]), hipSuccess);
+  }
+
+  // Free host memory
+  for (int b = 0; b < kNumBuffers; ++b) {
+    ASSERT_EQUAL(hipHostFree(hostBuf[b]), hipSuccess);
+  }
+}
+
+// Runs the __syncthreads_count test once per block size; the list mixes
+// powers of two with sizes that are not powers of two.
+int main()
+{
+  const int blockSizes[] = {10, 40, 70, 130, 240, 723, 32, 64, 128, 256, 512, 1024};
+  const size_t numSizes = sizeof(blockSizes) / sizeof(blockSizes[0]);
+  for (size_t idx = 0; idx != numSizes; ++idx) {
+    test_syncthreads_count(blockSizes[idx]);
+  }
+  passed();
+}
@@ -0,0 +1,155 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <hip/hip_runtime.h>
+#include "test_common.h"
+
+#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
+
+// Exercises __syncthreads_or(). Each thread writes one element per output
+// buffer: block 0 fills indices [0, blockDim.x) and any other block fills
+// [blockDim.x, 2 * blockDim.x).
+static __global__
+void kernel_syncthreads_or(int *syncTestD,
+                           int *allThreadsZeroD,
+                           int *allThreadsOneD,
+                           int *oneThreadOneD,
+                           int *allThreadsMinusOneD)
+{
+  const int tid = threadIdx.x;
+  int slot = tid;
+  if (blockIdx.x != 0)
+    slot += blockDim.x;
+
+  // Barrier check first, independent of the predicate value: threads 0 and 1
+  // each write one shared-memory word, and after the call every thread reads
+  // both words and stores their difference for verification.
+  __shared__ int sm[2];
+  if (tid == 0) {
+    sm[0] = 10;
+  } else if (tid == 1) {
+    sm[1] = 20;
+  }
+  __syncthreads_or(10);
+  syncTestD[slot] = sm[1] - sm[0];
+
+  // Every thread passes 0: result should be 0.
+  allThreadsZeroD[slot] = __syncthreads_or(0);
+
+  // Every thread passes 1: result should be 1.
+  allThreadsOneD[slot] = __syncthreads_or(1);
+
+  // Thread 0 passes 1 and all others pass 0: result should be 1.
+  const int firstOnly = (tid == 0) ? 1 : 0;
+  oneThreadOneD[slot] = __syncthreads_or(firstOnly);
+
+  // Every thread passes -1: result should be 1.
+  allThreadsMinusOneD[slot] = __syncthreads_or(-1);
+}
+
+// Launches kernel_syncthreads_or with two blocks of `blockSize` threads and
+// verifies every output element of every result buffer.
+static void test_syncthreads_or(int blockSize)
+{
+  enum { kNumBuffers = 5 };
+  // Expected value per buffer, in the kernel's parameter order:
+  // syncTest, allThreadsZero, allThreadsOne, oneThreadOne, allThreadsMinusOne
+  const int expected[kNumBuffers] = {10, 0, 1, 1, 1};
+
+  const int totalThreads = 2 * blockSize;
+  int nBytes = sizeof(int) * totalThreads;
+  int *devBuf[kNumBuffers];
+  int *hostBuf[kNumBuffers];
+
+  // Allocate device memory
+  for (int b = 0; b < kNumBuffers; ++b) {
+    ASSERT_EQUAL(hipMalloc((void**)&devBuf[b], nBytes), hipSuccess);
+  }
+
+  // Allocate host memory
+  for (int b = 0; b < kNumBuffers; ++b) {
+    ASSERT_EQUAL(hipHostMalloc((void**)&hostBuf[b], nBytes), hipSuccess);
+  }
+
+  // Launch Kernel
+  hipLaunchKernelGGL(kernel_syncthreads_or,
+                     2,
+                     blockSize,
+                     0,
+                     0,
+                     devBuf[0],
+                     devBuf[1],
+                     devBuf[2],
+                     devBuf[3],
+                     devBuf[4]);
+  // Check the launch status so a launch error is reported as such rather
+  // than as a value mismatch further down.
+  ASSERT_EQUAL(hipGetLastError(), hipSuccess);
+
+  // Copy result from device to host
+  for (int b = 0; b < kNumBuffers; ++b) {
+    ASSERT_EQUAL(hipMemcpy(hostBuf[b], devBuf[b], nBytes, hipMemcpyDeviceToHost),
+                 hipSuccess);
+  }
+
+  // Validate results for both blocks together
+  for (int i = 0; i < totalThreads; ++i) {
+    for (int b = 0; b < kNumBuffers; ++b) {
+      ASSERT_EQUAL(hostBuf[b][i], expected[b]);
+    }
+  }
+
+  // Free device memory
+  for (int b = 0; b < kNumBuffers; ++b) {
+    ASSERT_EQUAL(hipFree(devBuf[b]), hipSuccess);
+  }
+
+  // Free host memory
+  for (int b = 0; b < kNumBuffers; ++b) {
+    ASSERT_EQUAL(hipHostFree(hostBuf[b]), hipSuccess);
+  }
+}
+
+// Runs the __syncthreads_or test once per block size; the list mixes
+// powers of two with sizes that are not powers of two.
+int main()
+{
+  const int blockSizes[] = {10, 40, 70, 130, 240, 723, 32, 64, 128, 256, 512, 1024};
+  const size_t numSizes = sizeof(blockSizes) / sizeof(blockSizes[0]);
+  for (size_t idx = 0; idx != numSizes; ++idx) {
+    test_syncthreads_or(blockSizes[idx]);
+  }
+  passed();
+}
@@ -0,0 +1,90 @@
+/*
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+
+#define N 1024
+#define OFFSET 128
+
+// Single-process IPC round trip: export a handle for a device allocation,
+// re-open it, and verify the data seen through the opened pointer.
+void single_process() {
+  int* ipc_dptr = nullptr;
+  int* ipc_out_dptr = nullptr;
+
+  hipIpcMemHandle_t ipc_handle;
+  hipIpcMemHandle_t ipc_offset_handle;
+
+  HIPCHECK(hipMalloc((void**)&ipc_dptr, N * sizeof(int)));
+
+  // Negative: a pointer offset from the allocation base must be rejected.
+  // int* arithmetic already scales by sizeof(int), so step by OFFSET elements.
+  int* ipc_offset_dptr = ipc_dptr + OFFSET;
+  // Evaluate the call outside assert() so it still runs when NDEBUG is defined.
+  hipError_t offset_status = hipIpcGetMemHandle(&ipc_offset_handle, ipc_offset_dptr);
+  HIPASSERT(offset_status == hipErrorInvalidDevicePointer);
+
+  // Get handle for the device_ptr
+  HIPCHECK(hipIpcGetMemHandle(&ipc_handle, ipc_dptr));
+
+  // Fill the host source buffer with its own indices.
+  int* ipc_hptr = new int[N];
+  for (int idx = 0; idx < N; ++idx) {
+    ipc_hptr[idx] = idx;
+  }
+
+  // Clear the device buffer, then upload the pattern.
+  HIPCHECK(hipMemset(ipc_dptr, 0x00, (N * sizeof(int))));
+  HIPCHECK(hipMemcpy(ipc_dptr, ipc_hptr, (N * sizeof(int)), hipMemcpyHostToDevice));
+
+  // Open the handle to obtain a second device pointer.
+  int* ipc_out_hptr = new int[N];
+  memset(ipc_out_hptr, 0x00, (N * sizeof(int)));
+  HIPCHECK(hipIpcOpenMemHandle((void**)&ipc_out_dptr, ipc_handle, 0));
+
+  // Read back through the opened pointer and compare with the pattern.
+  HIPCHECK(hipMemcpy(ipc_out_hptr, ipc_out_dptr, (N * sizeof(int)), hipMemcpyDeviceToHost));
+  int mismatches = 0;
+  for (int idx = 0; idx < N; ++idx) {
+    if (ipc_out_hptr[idx] != idx) {
+      std::cout<<"Failing @ idx: "<<idx<<std::endl;
+      ++mismatches;
+    }
+  }
+
+  // Close the opened handle and release the original allocation.
+  HIPCHECK(hipIpcCloseMemHandle(ipc_out_dptr));
+  HIPCHECK(hipFree(ipc_dptr));
+
+  delete[] ipc_hptr;
+  delete[] ipc_out_hptr;
+
+  // A mismatch must fail the test rather than only being logged.
+  HIPASSERT(mismatches == 0);
+}
+
+// Placeholder for the multi-process IPC scenario; the body is currently empty.
+void multi_process() {
+  // TODO: create an IPC handle in one process and open it from another.
+}
+
+// Runs the single-process IPC check, then the (currently empty) multi-process
+// placeholder, and finally calls passed().
+int main() {
+  single_process();
+  multi_process();
+  passed();
+}
@@ -571,6 +571,10 @@ __global__ void hipLaunchKernelStructFunc21(

 __global__ void vAdd(float* a) {}

+// Empty kernel with two type parameters; used below to launch a template kernel through macros.
+template<class T1, class T2>
+__global__ void myKernel(T1 a, T2 b) {}
+
 //---
 // Some wrapper macro for testing:
 #define WRAP(...) __VA_ARGS__
@@ -913,6 +917,18 @@ int main() {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(vAdd), dim3(1024), 1, 0, 0, Ad);
    hipLaunchKernelGGL(HIP_KERNEL_NAME(vAdd), dim3(1024), dim3(1), 0, 0, Ad);

+    // Test: Passing macro to hipLaunchKernelGGL
+#define KERNEL_CONFIG  dim3(1024), dim3(1), 0, 0
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vAdd), KERNEL_CONFIG, Ad);
+
+    // Test: Same thing with templates:
+    int a;
+    float b;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(myKernel<int, float>), KERNEL_CONFIG, a, b);
+
+#define TYPE_PARAM_CONFIG int, float
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(myKernel<TYPE_PARAM_CONFIG>), KERNEL_CONFIG, a, b);
+
    // Test: Passing hipLaunchKernelGGL inside another macro:
    float e0;
    MY_LAUNCH_MACRO(hipLaunchKernelGGL(vAdd, dim3(1024),
@@ -1,51 +1,168 @@
 /*
-Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */

-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
+/*
+ * Test to compare
+ * 1.pciBusID from hipDeviceGetPCIBusId and hipDeviceGetAttribute **
+ * 2.{pciDomainID, pciBusID, pciDeviceID} values hipDeviceGetPCIBusId vs lspci **
+ */

 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
- * TEST: %t
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
+ * TEST_NAMED: %t  hipDeviceGetPCIBusId-vs-hipDeviceGetAttribute --tests 0x1
+ * TEST_NAMED: %t  hipDeviceGetPCIBusId-vs-lspci --tests 0x2
 * HIT_END
 */

-#include <stdio.h>
-#include "hip/hip_runtime.h"
 #include "test_common.h"
+#define MAX_DEVICE_LENGTH 20

-int main(void) {
-    char pciBusId[13];
-    int deviceCount = 0;
-    HIPCHECK(hipGetDeviceCount(&deviceCount));
-    HIPASSERT(deviceCount != 0);
-    for (int i = 0; i < deviceCount; i++) {
-        int pciBusID = -1;
-        int pciDeviceID = -1;
-        int pciDomainID = -1;
-        int tempPciBusId = -1;
-        HIPCHECK(hipDeviceGetPCIBusId(&pciBusId[0], 13, i));
-        sscanf(pciBusId, "%04x:%02x:%02x", &pciDomainID, &pciBusID, &pciDeviceID);
-        HIPCHECK(hipDeviceGetAttribute(&tempPciBusId, hipDeviceAttributePciBusId, i));
-        if (pciBusID != tempPciBusId) {
-            exit(EXIT_FAILURE);
-        }
-    }
-    passed();
+// Fills hipDeviceList[i] with the string produced by hipDeviceGetPCIBusId for
+// every device index in [0, deviceCount). Each row must hold at least
+// MAX_DEVICE_LENGTH characters. The HIP status is checked with HIPCHECK; the
+// function itself always returns true.
+static bool getPciBusId(int deviceCount, char hipDeviceList[][MAX_DEVICE_LENGTH]) {
+  for (int i = 0; i < deviceCount; i++) {
+    HIPCHECK(hipDeviceGetPCIBusId(hipDeviceList[i], MAX_DEVICE_LENGTH, i));
+  }
+  return true;
+}
+
+// For every device, checks that the bus number parsed from the
+// hipDeviceGetPCIBusId string equals the hipDeviceAttributePciBusId attribute.
+// Returns true when all devices agree, false otherwise.
+bool comparePciBusIDWithHipDeviceGetAttribute() {
+  bool testResult = true;
+  int deviceCount = 0;
+  HIPCHECK(hipGetDeviceCount(&deviceCount));
+  HIPASSERT(deviceCount != 0);
+  printf("No.of gpus in the system: %d\n", deviceCount);
+  // NOTE: runtime-sized array (compiler extension); kept because getPciBusId
+  // takes a char[][MAX_DEVICE_LENGTH] parameter.
+  char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
+
+  getPciBusId(deviceCount, hipDeviceList);
+
+  for (int i = 0; i < deviceCount; i++) {
+    int pciBusID = -1;
+    int pciDeviceID = -1;
+    int pciDomainID = -1;
+    int tempPciBusId = -1;
+    // Expect "domain:bus:device" in hex; all three fields have to parse.
+    int parsed = sscanf(hipDeviceList[i], "%04x:%02x:%02x", &pciDomainID, &pciBusID,
+                        &pciDeviceID);
+    if (parsed != 3) {
+      testResult = false;
+      printf("Unable to parse the hipDeviceGetPCIBusId output for gpu %d\n", i);
+      continue;
+    }
+    HIPCHECK(hipDeviceGetAttribute(&tempPciBusId, hipDeviceAttributePciBusId, i));
+    if (pciBusID != tempPciBusId) {
+      testResult = false;
+      printf("pciBusID from hipDeviceGetPCIBusId mismatched to that from "
+             "hipDeviceGetAttribute for gpu %d\n", i);
+    }
+  }
+
+  // Only claim success when no device mismatched.
+  if (testResult) {
+    printf("pciBusID output of both hipDeviceGetPCIBusId and"
+           " hipDeviceGetAttribute matched for all gpus\n");
+  }
+  return testResult;
+}
+
+// Cross-checks the PCI addresses reported by hipDeviceGetPCIBusId against the
+// output of `lspci -D`.
+// Returns true when the number of lspci entries matching a HIP device equals
+// the HIP device count, or when lspci produces no version output (test is
+// skipped). Returns false on a count mismatch or when popen fails.
+bool compareHipDeviceGetPCIBusIdWithLspci() {
+  FILE *fpipe;
+  bool testResult = false;
+
+  {
+    // Check if lspci is installed, if not, don't proceed
+    char const *cmd = "lspci --version";
+    char *lspciCheck;
+    char temp[20];
+    fpipe = popen(cmd, "r");
+
+    if (fpipe == nullptr) {
+      printf("Unable to create command file\n");
+      return testResult;
+    }
+
+    lspciCheck = fgets(temp, 20, fpipe);
+    pclose(fpipe);
+
+    if (!lspciCheck) {
+      printf("lspci not found. Skipping the test\n");
+      return true;
+    }
+  }
+
+  int deviceCount = 0;
+  HIPCHECK(hipGetDeviceCount(&deviceCount));
+  HIPASSERT(deviceCount != 0);
+  printf("No.of gpus in the system: %d\n", deviceCount);
+  // NOTE: runtime-sized array (compiler extension); kept because getPciBusId
+  // takes a char[][MAX_DEVICE_LENGTH] parameter.
+  char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
+
+  getPciBusId(deviceCount, hipDeviceList);
+
+  // Get lspci device list and compare with hip device list
+  char const *command = "lspci -D | grep controller | grep AMD/ATI | "
+                        "cut -d ' ' -f 1";
+  fpipe = popen(command, "r");
+
+  if (fpipe == nullptr) {
+    printf("Unable to create command file\n");
+    return testResult;
+  }
+
+  int deviceMatchCount = 0;
+  // One reusable line buffer: lspci may list more controllers than HIP reports
+  // devices, so storing each line in a deviceCount-sized array could overflow.
+  char pciDevice[MAX_DEVICE_LENGTH];
+
+  while (fgets(pciDevice, sizeof(pciDevice), fpipe)) {
+    // Drop the newline kept by fgets so the log line below stays on one line.
+    pciDevice[strcspn(pciDevice, "\n")] = '\0';
+    bool bMatchFound = false;
+    for (int deviceNo = 0; deviceNo < deviceCount; deviceNo++) {
+      // Only the first 10 characters are compared, so anything after that
+      // prefix in either string is ignored.
+      if (!strncmp(pciDevice, hipDeviceList[deviceNo], 10)) {
+        deviceMatchCount++;
+        bMatchFound = true;
+      }
+    }
+    if (bMatchFound == false) {
+      printf("PCI device: %s is not reported by HIP\n", pciDevice);
+    }
+  }
+
+  pclose(fpipe);
+
+  if (deviceMatchCount == deviceCount) {
+    printf("hip and lspci output for {pciDomainID, pciBusID, pciDeviceID} "
+           "matched for all gpus\n");
+    testResult = true;
+  } else {
+    printf("Mismatch in number GPUs reported by HIP with lspci\n");
+  }
+  return testResult;
+}
+
+// Entry point. p_tests is used as a bitmask: bit 0x1 selects the
+// hipDeviceGetAttribute comparison and bit 0x2 the lspci comparison.
+int main(int argc, char* argv[]) {
+  HipTest::parseStandardArguments(argc, argv, true);
+  bool allPassed = true;
+
+  const bool runAttributeTest = (p_tests & 0x1) != 0;
+  const bool runLspciTest = (p_tests & 0x2) != 0;
+
+  if (runAttributeTest && !comparePciBusIDWithHipDeviceGetAttribute()) {
+    allPassed = false;
+  }
+
+  if (runLspciTest) {
+#ifdef __unix__
+    if (!compareHipDeviceGetPCIBusIdWithLspci()) {
+      allPassed = false;
+    }
+#else
+    printf("Detected non-linux OS. Skipping the test\n");
+#endif
+  }
+
+  if (!allPassed) {
+    failed("one or more tests failed\n");
+  } else {
+    passed();
+  }
 }
@@ -145,5 +145,6 @@ int main(int argc, char* argv[]) {
    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeKernelExecTimeout, props.kernelExecTimeoutEnabled));
    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeCanMapHostMemory, props.canMapHostMemory));
    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeEccEnabled, props.ECCEnabled));
+    CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeAsicRevision, props.asicRevision));
    passed();
 };
@@ -39,6 +39,9 @@ int main() {

    HIPCHECK(hipGetDeviceCount(&numDevices));
    if (numDevices > 1) {
+      int canAccessPeer = 0;
+      hipDeviceCanAccessPeer(&canAccessPeer, 0, 1);
+      if (canAccessPeer) {
        HIPCHECK(hipSetDevice(0));
        unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
        HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
@@ -47,23 +50,21 @@ int main() {
        HIPCHECK(hipMalloc(&Y_d, Nbytes));
        HIPCHECK(hipMalloc(&Z_d, Nbytes));

-
        HIPCHECK(hipSetDevice(0));
        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
        HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
-                        static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
+                           static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
        HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
        HipTest::checkVectorADD(A_h, B_h, C_h, N);

-
        HIPCHECK(hipSetDevice(1));
        HIPCHECK(hipMemcpyDtoD((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes));
        HIPCHECK(hipMemcpyDtoD((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes));

        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
-                        static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
+                          static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
        HIPCHECK(hipMemcpyDtoH(C_h, (hipDeviceptr_t)Z_d, Nbytes));
        HIPCHECK(hipDeviceSynchronize());
        HipTest::checkVectorADD(A_h, B_h, C_h, N);
@@ -72,6 +73,9 @@ int main() {
        HIPCHECK(hipFree(X_d));
        HIPCHECK(hipFree(Y_d));
        HIPCHECK(hipFree(Z_d));
+      } else {
+        std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
+      }
    }

    passed();
@@ -40,6 +40,10 @@ int main() {

    HIPCHECK(hipGetDeviceCount(&numDevices));
    if (numDevices > 1) {
+
+      int canAccessPeer = 0;
+      hipDeviceCanAccessPeer(&canAccessPeer, 0, 1);
+      if (canAccessPeer) {
        HIPCHECK(hipSetDevice(0));
        unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
        HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
@@ -75,6 +79,9 @@ int main() {
        HIPCHECK(hipFree(X_d));
        HIPCHECK(hipFree(Y_d));
        HIPCHECK(hipFree(Z_d));
+      } else {
+        std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
+      }
    }

    passed();
@@ -39,6 +39,9 @@ int main() {

    HIPCHECK(hipGetDeviceCount(&numDevices));
    if (numDevices > 1) {
+      int canAccessPeer = 0;
+      hipDeviceCanAccessPeer(&canAccessPeer, 0, 1);
+      if (canAccessPeer) {
        HIPCHECK(hipSetDevice(0));
        unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
        HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
@@ -72,6 +75,9 @@ int main() {
        HIPCHECK(hipFree(X_d));
        HIPCHECK(hipFree(Y_d));
        HIPCHECK(hipFree(Z_d));
+      } else {
+        std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
+      }
    }
    passed();
 }
@@ -42,6 +42,10 @@ int main() {

    HIPCHECK(hipGetDeviceCount(&numDevices));
    if (numDevices > 1) {
+
+      int canAccessPeer = 0;
+      hipDeviceCanAccessPeer(&canAccessPeer, 0, 1);
+      if (canAccessPeer) {
        HIPCHECK(hipSetDevice(0));
        unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
        HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
@@ -77,6 +81,9 @@ int main() {
        HIPCHECK(hipFree(X_d));
        HIPCHECK(hipFree(Y_d));
        HIPCHECK(hipFree(Z_d));
+      } else {
+        std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
+      }
    }

    passed();
@@ -18,9 +18,9 @@ THE SOFTWARE.
 */

 /*
- * Conformance test for checking functionality of
- * hipError_t hipMemcpyPeer(void* dst, int dstDeviceId, const void* src, int srcDeviceId, size_t
- * sizeBytes);
+ * Different tests for checking the functionality of
+ * hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
+ * hipStream_t stream);
 */

 /* HIT_START
@@ -31,27 +31,481 @@ THE SOFTWARE.

 #include "test_common.h"

-int main() {
-    size_t Nbytes = N * sizeof(int);
-    int numDevices = 0;
-    int *A_d, *B_d, *C_d;
-    int *A_h, *B_h, *C_h;

-    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
-    HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
+// Print a colored per-test status line. The expansion carries no trailing
+// semicolon: call sites supply their own, which keeps the macros usable as a
+// single statement inside if/else.
+#define test_passed(test_name)  printf("%s %s  PASSED!%s\n", KGRN, #test_name, KNRM)
+#define test_failed(test_name)  printf("%s %s  FAILED!%s\n", KRED, #test_name, KNRM)

-    hipStream_t stream;
-    HIPCHECK(hipStreamCreate(&stream));

-    HIPCHECK(hipMemcpyWithStream(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream));
-    HIPCHECK(hipMemcpyWithStream(B_d, B_h, Nbytes, hipMemcpyHostToDevice,stream));
-    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream,
-                        static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
-    HIPCHECK(hipStreamSynchronize(stream));
-    HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
-    HipTest::checkVectorADD(A_h, B_h, C_h, N);
+// Groups the hipMemcpyWithStream scenarios; each member function runs one
+// scenario and checks the resulting data.
+class HipMemcpyWithStreamtests {
+ public:
+  // One stream: copy the inputs with hipMemcpyWithStream, launch the kernel
+  // on that stream and verify the data.
+  void TestwithOnestream();
+  // Two streams: same sequence, with a kernel launched in each stream.
+  void TestwithTwoStream();
+  // One stream per GPU: copy, launch and verify on every device.
+  void TestOnMultiGPUwithOneStream();
+  // Result copied back with kind hipMemcpyDeviceToHost.
+  void TestkindDtoH();
+  // Kind hipMemcpyDeviceToDevice across multiple GPUs.
+  void TestkindDtoD();
+  // Kind hipMemcpyHostToHost.
+  void TestkindHtoH();
+  // Kind hipMemcpyDefault for host/device transfers.
+  void TestkindDefault();
+  // Kind hipMemcpyDefault for the device-to-device transfer case.
+  void TestkindDefaultForDtoD();
+  // Kind hipMemcpyDeviceToDevice with source and destination on one device.
+  void TestDtoDonSameDevice();
+};

-    HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
-    HIPCHECK(hipStreamDestroy(stream));
-    passed();
+// Copies both inputs host-to-device with hipMemcpyWithStream on a single
+// stream, runs vectorADD on that stream and verifies the sum on the host.
+void HipMemcpyWithStreamtests::TestwithOnestream(void) {
+  size_t Nbytes = N * sizeof(int);
+  int *A_d, *B_d, *C_d;
+  int *A_h, *B_h, *C_h;
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
+
+  hipStream_t stream;
+  HIPCHECK(hipStreamCreate(&stream));
+
+  HIPCHECK(hipMemcpyWithStream(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream));
+  HIPCHECK(hipMemcpyWithStream(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream));
+  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream,
+                      static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
+  // Make sure the kernel has finished before the result is read back.
+  HIPCHECK(hipStreamSynchronize(stream));
+  HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
+  HipTest::checkVectorADD(A_h, B_h, C_h, N);
+
+  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
+  HIPCHECK(hipStreamDestroy(stream));
+}
+
+// Runs the copy / vectorADD / verify sequence on two streams, each stream
+// working on its own set of host and device buffers.
+void HipMemcpyWithStreamtests::TestwithTwoStream(void) {
+  size_t Nbytes = N * sizeof(int);
+  // Compile-time constant so the arrays below are ordinary fixed-size arrays
+  // rather than variable-length arrays (a compiler extension in C++).
+  const int noOfstreams = 2;
+  int *A_d[noOfstreams], *B_d[noOfstreams], *C_d[noOfstreams];
+  int *A_h[noOfstreams], *B_h[noOfstreams], *C_h[noOfstreams];
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+
+  for (int i=0; i < noOfstreams; ++i) {
+    HipTest::initArrays(&A_d[i], &B_d[i], &C_d[i], &A_h[i], &B_h[i], &C_h[i], N, false);
+  }
+
+  hipStream_t stream[noOfstreams];
+  for (int i=0; i < noOfstreams; ++i) {
+    HIPCHECK(hipStreamCreate(&stream[i]));
+  }
+
+  // Upload the inputs of each buffer set on its own stream.
+  for (int i=0; i < noOfstreams; ++i) {
+    HIPCHECK(hipMemcpyWithStream(A_d[i], A_h[i], Nbytes, hipMemcpyHostToDevice, stream[i]));
+    HIPCHECK(hipMemcpyWithStream(B_d[i], B_h[i], Nbytes, hipMemcpyHostToDevice, stream[i]));
+  }
+
+  for (int i=0; i < noOfstreams; ++i) {
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i],
+                      static_cast<const int*>(A_d[i]), static_cast<const int*>(B_d[i]), C_d[i], N);
+  }
+
+  // Wait for each stream, then read back and verify its result.
+  for (int i=0; i < noOfstreams; ++i) {
+    HIPCHECK(hipStreamSynchronize(stream[i]));
+    HIPCHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
+    HipTest::checkVectorADD(A_h[i], B_h[i], C_h[i], N);
+  }
+
+  for (int i=0; i < noOfstreams; ++i) {
+    HipTest::freeArrays(A_d[i], B_d[i], C_d[i], A_h[i], B_h[i], C_h[i], false);
+    HIPCHECK(hipStreamDestroy(stream[i]));
+  }
+}
+
+// hipMemcpyDeviceToDevice within device 0: the inputs are uploaded into buffer
+// set 0 on stream[0], copied device-to-device into buffer set 1 on stream[1],
+// and vectorADD is then run and verified on both sets.
+void HipMemcpyWithStreamtests::TestDtoDonSameDevice(void) {
+  size_t Nbytes = N * sizeof(int);
+  // Compile-time constant so the arrays below are ordinary fixed-size arrays
+  // rather than variable-length arrays (a compiler extension in C++).
+  const int noOfstreams = 2;
+  int *A_d[noOfstreams], *B_d[noOfstreams], *C_d[noOfstreams];
+  // Only index 0 of A_h/B_h is used: set 1 receives its inputs from set 0.
+  int *A_h[noOfstreams], *B_h[noOfstreams], *C_h[noOfstreams];
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+
+  // Select device 0 before the first allocation so the test does not depend
+  // on whichever device was current when it was called. Nothing below changes
+  // the device again, so one call is enough.
+  HIPCHECK(hipSetDevice(0));
+  HipTest::initArrays(&A_d[0], &B_d[0], &C_d[0], &A_h[0], &B_h[0], &C_h[0], N, false);
+
+  hipStream_t stream[noOfstreams];
+  for (int i=0; i < noOfstreams; ++i) {
+    HIPCHECK(hipStreamCreate(&stream[i]));
+  }
+
+  // Second buffer set: device buffers plus a host buffer for the result.
+  HIPCHECK(hipMalloc(&A_d[1], Nbytes));
+  HIPCHECK(hipMalloc(&B_d[1], Nbytes));
+  HIPCHECK(hipMalloc(&C_d[1], Nbytes));
+  C_h[1] = reinterpret_cast<int*>(malloc(Nbytes));
+  HIPASSERT(C_h[1] != NULL);
+
+  HIPCHECK(hipMemcpyWithStream(A_d[0], A_h[0], Nbytes, hipMemcpyHostToDevice, stream[0]));
+  HIPCHECK(hipMemcpyWithStream(B_d[0], B_h[0], Nbytes, hipMemcpyHostToDevice, stream[0]));
+
+  HIPCHECK(hipMemcpyWithStream(A_d[1], A_d[0], Nbytes, hipMemcpyDeviceToDevice, stream[1]));
+  HIPCHECK(hipMemcpyWithStream(B_d[1], B_d[0], Nbytes, hipMemcpyDeviceToDevice, stream[1]));
+
+  for (int i=0; i < noOfstreams; ++i) {
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i],
+                      static_cast<const int*>(A_d[i]), static_cast<const int*>(B_d[i]), C_d[i], N);
+  }
+
+  // Both result buffers are checked against the host inputs of set 0.
+  for (int i=0; i < noOfstreams; ++i) {
+    HIPCHECK(hipStreamSynchronize(stream[i]));
+    HIPCHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
+    HipTest::checkVectorADD(A_h[0], B_h[0], C_h[i], N);
+  }
+
+  HipTest::freeArrays(A_d[0], B_d[0], C_d[0], A_h[0], B_h[0], C_h[0], false);
+
+  if (A_d[1]) {
+    HIPCHECK(hipFree(A_d[1]));
+  }
+  if (B_d[1]) {
+    HIPCHECK(hipFree(B_d[1]));
+  }
+  if (C_d[1]) {
+    HIPCHECK(hipFree(C_d[1]));
+  }
+  if (C_h[1]) {
+    free(C_h[1]);
+  }
+
+  for (int i=0; i < noOfstreams; ++i) {
+    HIPCHECK(hipStreamDestroy(stream[i]));
+  }
+}
+
+// Runs the copy / vectorADD / verify sequence once per GPU, each GPU using its
+// own stream and buffers. Returns immediately on a single-GPU machine.
+void HipMemcpyWithStreamtests::TestOnMultiGPUwithOneStream(void) {
+  size_t Nbytes = N * sizeof(int);
+  int numDevices = 0;
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+  HIPCHECK(hipGetDeviceCount(&numDevices));
+  // Nothing to test with a single GPU.
+  if (numDevices <= 1) {
+    return;
+  }
+  // NOTE: runtime-sized arrays (compiler extension).
+  int *A_d[numDevices], *B_d[numDevices], *C_d[numDevices];
+  int *A_h[numDevices], *B_h[numDevices], *C_h[numDevices];
+
+  hipStream_t stream[numDevices];
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipStreamCreate(&stream[i]));
+  }
+
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HipTest::initArrays(&A_d[i], &B_d[i], &C_d[i], &A_h[i], &B_h[i], &C_h[i], N, false);
+  }
+
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipMemcpyWithStream(A_d[i], A_h[i], Nbytes, hipMemcpyHostToDevice, stream[i]));
+    HIPCHECK(hipMemcpyWithStream(B_d[i], B_h[i], Nbytes, hipMemcpyHostToDevice, stream[i]));
+  }
+
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i],
+                      static_cast<const int*>(A_d[i]), static_cast<const int*>(B_d[i]), C_d[i], N);
+  }
+
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipStreamSynchronize(stream[i]));
+    HIPCHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
+    HipTest::checkVectorADD(A_h[i], B_h[i], C_h[i], N);
+  }
+
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HipTest::freeArrays(A_d[i], B_d[i], C_d[i], A_h[i], B_h[i], C_h[i], false);
+    HIPCHECK(hipStreamDestroy(stream[i]));
+  }
+
+  // Restore device 0 so later tests do not inherit the last GPU as the
+  // current device.
+  HIPCHECK(hipSetDevice(0));
+}
+
+// Same sequence as the single-stream test, except that the result is copied
+// back with hipMemcpyWithStream using kind hipMemcpyDeviceToHost.
+void HipMemcpyWithStreamtests::TestkindDtoH(void) {
+  size_t Nbytes = N * sizeof(int);
+  int *A_d, *B_d, *C_d;
+  int *A_h, *B_h, *C_h;
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
+
+  hipStream_t stream;
+  HIPCHECK(hipStreamCreate(&stream));
+
+  HIPCHECK(hipMemcpyWithStream(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream));
+  HIPCHECK(hipMemcpyWithStream(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream));
+  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream,
+                      static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
+  HIPCHECK(hipStreamSynchronize(stream));
+  // The copy under test: device-to-host through hipMemcpyWithStream.
+  HIPCHECK(hipMemcpyWithStream(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream));
+  HipTest::checkVectorADD(A_h, B_h, C_h, N);
+
+  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
+  HIPCHECK(hipStreamDestroy(stream));
+}
+
+
+// Multi-GPU hipMemcpyDeviceToDevice: the inputs are uploaded to device 0 and
+// then copied device-to-device to every other GPU; vectorADD is run and
+// verified on each GPU. Returns early on a single-GPU machine or when device 0
+// reports no peer access to device 1.
+void HipMemcpyWithStreamtests::TestkindDtoD(void) {
+  size_t Nbytes = N * sizeof(int);
+  int numDevices = 0;
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+  HIPCHECK(hipGetDeviceCount(&numDevices));
+  // Nothing to test with a single GPU.
+  if (numDevices <= 1) {
+    return;
+  }
+
+  int canAccessPeer = 0;
+  // Treat a failed query like "no peer access" instead of ignoring the status.
+  if (hipDeviceCanAccessPeer(&canAccessPeer, 0, 1) != hipSuccess) {
+    canAccessPeer = 0;
+  }
+  if (!canAccessPeer) {
+    std::cout<<"Machine does not seem to have P2P Capabilities"<<std::endl;
+    return;
+  }
+
+  // NOTE: runtime-sized arrays (compiler extension).
+  int *A_d[numDevices], *B_d[numDevices], *C_d[numDevices];
+  // Only index 0 of A_h/B_h is used: the other GPUs get their inputs from device 0.
+  int *A_h[numDevices], *B_h[numDevices], *C_h[numDevices];
+
+  hipStream_t stream[numDevices];
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipStreamCreate(&stream[i]));
+  }
+
+  // Initialize and create the host and device elements for first device
+  HIPCHECK(hipSetDevice(0));
+  HipTest::initArrays(&A_d[0], &B_d[0], &C_d[0], &A_h[0], &B_h[0], &C_h[0], N, false);
+
+  // Device buffers and a host result buffer for every other GPU.
+  for (int i=1; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipMalloc(&A_d[i], Nbytes));
+    HIPCHECK(hipMalloc(&B_d[i], Nbytes));
+    HIPCHECK(hipMalloc(&C_d[i], Nbytes));
+    C_h[i] = reinterpret_cast<int*>(malloc(Nbytes));
+    HIPASSERT(C_h[i] != NULL);
+  }
+
+  HIPCHECK(hipSetDevice(0));
+  HIPCHECK(hipMemcpyWithStream(A_d[0], A_h[0], Nbytes, hipMemcpyHostToDevice, stream[0]));
+  HIPCHECK(hipMemcpyWithStream(B_d[0], B_h[0], Nbytes, hipMemcpyHostToDevice, stream[0]));
+
+  // The copies under test: device 0 -> device i.
+  for (int i=1; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipMemcpyWithStream(A_d[i], A_d[0], Nbytes, hipMemcpyDeviceToDevice, stream[i]));
+    HIPCHECK(hipMemcpyWithStream(B_d[i], B_d[0], Nbytes, hipMemcpyDeviceToDevice, stream[i]));
+  }
+
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i],
+                      static_cast<const int*>(A_d[i]), static_cast<const int*>(B_d[i]), C_d[i], N);
+  }
+
+  // Every result is checked against the host inputs of device 0.
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipStreamSynchronize(stream[i]));
+    HIPCHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
+    HipTest::checkVectorADD(A_h[0], B_h[0], C_h[i], N);
+  }
+
+  HipTest::freeArrays(A_d[0], B_d[0], C_d[0], A_h[0], B_h[0], C_h[0], false);
+  HIPCHECK(hipStreamDestroy(stream[0]));
+
+  for (int i=1; i < numDevices; ++i) {
+    if (A_d[i]) {
+      HIPCHECK(hipFree(A_d[i]));
+    }
+    if (B_d[i]) {
+      HIPCHECK(hipFree(B_d[i]));
+    }
+    if (C_d[i]) {
+      HIPCHECK(hipFree(C_d[i]));
+    }
+    if (C_h[i]) {
+      free(C_h[i]);
+    }
+    HIPCHECK(hipStreamDestroy(stream[i]));
+  }
+
+  // Restore device 0 so later tests do not inherit the last GPU as the
+  // current device.
+  HIPCHECK(hipSetDevice(0));
+}
+
+// Same sequence as the single-stream test, with every hipMemcpyWithStream call
+// (uploads and result download) using kind hipMemcpyDefault.
+void HipMemcpyWithStreamtests::TestkindDefault(void) {
+  size_t Nbytes = N * sizeof(int);
+  int *A_d, *B_d, *C_d;
+  int *A_h, *B_h, *C_h;
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
+
+  hipStream_t stream;
+  HIPCHECK(hipStreamCreate(&stream));
+
+  HIPCHECK(hipMemcpyWithStream(A_d, A_h, Nbytes, hipMemcpyDefault, stream));
+  HIPCHECK(hipMemcpyWithStream(B_d, B_h, Nbytes, hipMemcpyDefault, stream));
+  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream,
+                      static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
+  HIPCHECK(hipStreamSynchronize(stream));
+  HIPCHECK(hipMemcpyWithStream(C_h, C_d, Nbytes, hipMemcpyDefault, stream));
+  HipTest::checkVectorADD(A_h, B_h, C_h, N);
+
+  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
+  HIPCHECK(hipStreamDestroy(stream));
+}
+
+// Multi-GPU variant using hipMemcpyDefault: the inputs are uploaded to device 0
+// and then copied to every other GPU with kind hipMemcpyDefault; vectorADD is
+// run and verified on each GPU. Returns early on a single-GPU machine or when
+// device 0 reports no peer access to device 1.
+void HipMemcpyWithStreamtests::TestkindDefaultForDtoD(void) {
+  size_t Nbytes = N * sizeof(int);
+  int numDevices = 0;
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+  HIPCHECK(hipGetDeviceCount(&numDevices));
+  // Nothing to test with a single GPU.
+  if (numDevices <= 1) {
+    return;
+  }
+
+  // Same peer-access guard as TestkindDtoD; a failed query counts as "no access".
+  int canAccessPeer = 0;
+  if (hipDeviceCanAccessPeer(&canAccessPeer, 0, 1) != hipSuccess) {
+    canAccessPeer = 0;
+  }
+  if (!canAccessPeer) {
+    std::cout<<"Machine does not seem to have P2P Capabilities"<<std::endl;
+    return;
+  }
+
+  // NOTE: runtime-sized arrays (compiler extension).
+  int *A_d[numDevices], *B_d[numDevices], *C_d[numDevices];
+  // Only index 0 of A_h/B_h is used: the other GPUs get their inputs from device 0.
+  int *A_h[numDevices], *B_h[numDevices], *C_h[numDevices];
+
+  // Initialize and create the host and device elements for first device.
+  // Device 0 is selected explicitly so the buffers do not land on whichever
+  // device was current when this test was called.
+  HIPCHECK(hipSetDevice(0));
+  HipTest::initArrays(&A_d[0], &B_d[0], &C_d[0], &A_h[0], &B_h[0], &C_h[0], N, false);
+
+  // Allocate the buffers of GPU i while GPU i is the current device.
+  for (int i=1; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipMalloc(&A_d[i], Nbytes));
+    HIPCHECK(hipMalloc(&B_d[i], Nbytes));
+    HIPCHECK(hipMalloc(&C_d[i], Nbytes));
+    C_h[i] = reinterpret_cast<int*>(malloc(Nbytes));
+    HIPASSERT(C_h[i] != NULL);
+  }
+
+  // Likewise create stream i while GPU i is the current device.
+  hipStream_t stream[numDevices];
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipStreamCreate(&stream[i]));
+  }
+
+  HIPCHECK(hipSetDevice(0));
+  HIPCHECK(hipMemcpyWithStream(A_d[0], A_h[0], Nbytes, hipMemcpyHostToDevice, stream[0]));
+  HIPCHECK(hipMemcpyWithStream(B_d[0], B_h[0], Nbytes, hipMemcpyHostToDevice, stream[0]));
+
+  // The copies under test: device 0 -> device i with kind hipMemcpyDefault.
+  for (int i=1; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipMemcpyWithStream(A_d[i], A_d[0], Nbytes, hipMemcpyDefault, stream[i]));
+    HIPCHECK(hipMemcpyWithStream(B_d[i], B_d[0], Nbytes, hipMemcpyDefault, stream[i]));
+  }
+
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i],
+                      static_cast<const int*>(A_d[i]), static_cast<const int*>(B_d[i]), C_d[i], N);
+  }
+
+  // Every result is checked against the host inputs of device 0.
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipStreamSynchronize(stream[i]));
+    HIPCHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
+    HipTest::checkVectorADD(A_h[0], B_h[0], C_h[i], N);
+  }
+
+  HipTest::freeArrays(A_d[0], B_d[0], C_d[0], A_h[0], B_h[0], C_h[0], false);
+  HIPCHECK(hipStreamDestroy(stream[0]));
+
+  for (int i=1; i < numDevices; ++i) {
+    if (A_d[i]) {
+      HIPCHECK(hipFree(A_d[i]));
+    }
+    if (B_d[i]) {
+      HIPCHECK(hipFree(B_d[i]));
+    }
+    if (C_d[i]) {
+      HIPCHECK(hipFree(C_d[i]));
+    }
+    if (C_h[i]) {
+      free(C_h[i]);
+    }
+    HIPCHECK(hipStreamDestroy(stream[i]));
+  }
+
+  // Restore device 0 so later tests do not inherit the last GPU as the
+  // current device.
+  HIPCHECK(hipSetDevice(0));
+}
+
+// Host-to-host copy: fills A_h with a known pattern, copies it into B_h with
+// hipMemcpyWithStream (kind hipMemcpyHostToHost) and compares both buffers
+// element by element.
+void HipMemcpyWithStreamtests::TestkindHtoH(void) {
+  size_t Nbytes = N * sizeof(int);
+  int *A_h, *B_h;
+
+  // Allocate memory to A_h and B_h
+  A_h = static_cast<int*>(malloc(Nbytes));
+  HIPASSERT(A_h != NULL);
+  B_h = static_cast<int*>(malloc(Nbytes));
+  HIPASSERT(B_h != NULL);
+
+  // Integer pattern 3, 4, 5, ...; the buffers hold int, so no floating-point
+  // seed value is involved.
+  for (size_t i = 0; i < N; ++i) {
+    A_h[i] = static_cast<int>(i) + 3;
+  }
+
+  hipStream_t stream;
+  HIPCHECK(hipStreamCreate(&stream));
+
+  HIPCHECK(hipMemcpyWithStream(B_h, A_h, Nbytes, hipMemcpyHostToHost, stream));
+  HIPCHECK(hipStreamSynchronize(stream));
+
+  for (size_t i = 0; i < N; i++) {
+    HIPASSERT(A_h[i] == B_h[i]);
+  }
+
+  free(A_h);
+  free(B_h);
+  HIPCHECK(hipStreamDestroy(stream));
+}
+
+// Runs every hipMemcpyWithStream scenario in sequence and prints a status line
+// after each one returns; the label passed to test_passed is the name of the
+// test that just ran.
+int main() {
+  HipMemcpyWithStreamtests tests;
+  tests.TestwithOnestream();
+  test_passed(TestwithOnestream);
+  tests.TestwithTwoStream();
+  test_passed(TestwithTwoStream);
+  tests.TestkindDtoH();
+  test_passed(TestkindDtoH);
+  tests.TestkindDefault();
+  test_passed(TestkindDefault);
+  tests.TestDtoDonSameDevice();
+  test_passed(TestDtoDonSameDevice);
+  tests.TestOnMultiGPUwithOneStream();
+  test_passed(TestOnMultiGPUwithOneStream);
+  tests.TestkindDtoD();
+  test_passed(TestkindDtoD);
+  tests.TestkindDefaultForDtoD();
+  test_passed(TestkindDefaultForDtoD);
+  tests.TestkindHtoH();
+  test_passed(TestkindHtoH);
+  // Finish the same way as the other tests in this suite.
+  passed();
 }
@@ -0,0 +1,659 @@
+/*
+Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+ * Different test for checking functionality of
+ * hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes,
+ * hipMemcpyKind kind, hipStream_t stream);
+ */
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <vector>
+#include <thread>
+#include <chrono>
+#include "test_common.h"
+
+#define LEN 64
+// Parenthesised so the shift cannot re-associate with neighbouring operators
+// at the expansion site (e.g. `SIZE * 2` would otherwise be `LEN << 2 * 2`).
+#define SIZE (LEN << 2)
+// Worker threads launched per hardware thread, and the overall cap, used by
+// TestwithMultiThreaded().
+#define THREADS 2
+#define MAX_THREADS 16
+
+
+// Print "<name>  PASSED!" / "<name>  FAILED!" for the stringified argument,
+// wrapped in the KGRN/KRED and KNRM markers.
+#define test_passed(test_name)  printf("%s %s  PASSED!%s\n", \
+                    KGRN, #test_name, KNRM);
+#define test_failed(test_name)  printf("%s %s  FAILED!%s\n", \
+                    KRED, #test_name, KNRM);
+
+// Selects which scenario TestwithMultiThreaded() runs. END_OF_LIST is a
+// sentinel one past the last real entry; main() iterates up to it.
+enum class ops {
+  TestwithOnestream,
+  TestwithTwoStream,
+  TestOnMultiGPUwithOneStream,
+  TestkindDtoH,
+  TestkindDtoD,
+  TestkindHtoH,
+  TestkindDefault,
+  TestkindDefaultForDtoD,
+  TestDtoDonSameDevice,
+  END_OF_LIST
+};
+
+
+// Multi-threaded hipMemcpyWithStream test suite. Every scenario is a private
+// member; callers pick one through TestwithMultiThreaded().
+class HipMemcpyWithStreamMultiThreadtests {
+ public:
+  // Runs the scenario selected by `op` on several threads at once.
+  void TestwithMultiThreaded(ops op);
+
+ private:
+  // One stream: copy inputs with hipMemcpyWithStream, launch the kernel in
+  // that stream and verify the data.
+  void TestwithOnestream(void);
+  // Two streams: same flow, one kernel per stream, verify the data.
+  void TestwithTwoStream(void);
+  // One stream per GPU: copy and launch on each device, verify the data.
+  void TestOnMultiGPUwithOneStream(void);
+
+  // Copy kinds exercised individually:
+  // device to host (hipMemcpyDeviceToHost).
+  void TestkindDtoH(void);
+  // hipMemcpyDeviceToDevice across multiple GPUs.
+  void TestkindDtoD(void);
+  // hipMemcpyHostToHost.
+  void TestkindHtoH(void);
+  // hipMemcpyDefault.
+  void TestkindDefault(void);
+  // hipMemcpyDefault used for a device to device transfer.
+  void TestkindDefaultForDtoD(void);
+  // hipMemcpyDeviceToDevice within a single device.
+  void TestDtoDonSameDevice(void);
+};
+
+// std::thread that joins itself on destruction, so a container of these
+// waits for all of its threads when it goes out of scope.
+struct joinable_thread : std::thread {
+  // Forwards every argument to the std::thread constructor.
+  template <class... Xs>
+  explicit joinable_thread(Xs&&... xs)
+      : std::thread(std::forward<Xs>(xs)...) {}  // NOLINT
+
+  // Move-only, like std::thread itself.
+  joinable_thread(joinable_thread&& other) = default;
+  joinable_thread& operator=(joinable_thread&& other) = default;
+
+  ~joinable_thread() {
+    if (!joinable()) {
+      return;
+    }
+    join();
+  }
+};
+
+// Single-stream scenario: upload both inputs with hipMemcpyWithStream, run
+// the vectorADD kernel on the same stream, then copy the result back and
+// validate it on the host.
+void HipMemcpyWithStreamMultiThreadtests::TestwithOnestream(void) {
+  const size_t Nbytes = N * sizeof(int);
+  int numDevices = 0;
+  int *A_d = nullptr, *B_d = nullptr, *C_d = nullptr;
+  int *A_h = nullptr, *B_h = nullptr, *C_h = nullptr;
+
+  const unsigned blocks =
+      HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
+
+  hipStream_t stream;
+  HIPCHECK(hipStreamCreate(&stream));
+
+  // Host -> device uploads issued on the test stream.
+  HIPCHECK(hipMemcpyWithStream(A_d, A_h, Nbytes,
+                               hipMemcpyHostToDevice, stream));
+  HIPCHECK(hipMemcpyWithStream(B_d, B_h, Nbytes,
+                               hipMemcpyHostToDevice, stream));
+
+  // Kernel is queued on the same stream as the uploads.
+  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
+                     0, stream, static_cast<const int*>(A_d),
+                     static_cast<const int*>(B_d), C_d, N);
+  HIPCHECK(hipStreamSynchronize(stream));
+
+  // The result is read back with a plain hipMemcpy and checked on the host.
+  HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
+  HipTest::checkVectorADD(A_h, B_h, C_h, N);
+
+  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
+  HIPCHECK(hipStreamDestroy(stream));
+}
+
+// Two-stream scenario: each stream gets its own set of host/device buffers,
+// uploads its inputs with hipMemcpyWithStream, runs vectorADD and has its
+// result validated independently.
+void HipMemcpyWithStreamMultiThreadtests::TestwithTwoStream(void) {
+  const size_t Nbytes = N * sizeof(int);
+  // Compile-time constant so the arrays below are ordinary fixed-size arrays
+  // rather than variable-length arrays (a compiler extension in C++).
+  constexpr int noOfstreams = 2;
+  int *A_d[noOfstreams], *B_d[noOfstreams], *C_d[noOfstreams];
+  int *A_h[noOfstreams], *B_h[noOfstreams], *C_h[noOfstreams];
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+
+  for (int i=0; i < noOfstreams; ++i) {
+    HipTest::initArrays(&A_d[i], &B_d[i], &C_d[i],
+                        &A_h[i], &B_h[i], &C_h[i], N, false);
+  }
+
+  hipStream_t stream[noOfstreams];
+  for (int i=0; i < noOfstreams; ++i) {
+    HIPCHECK(hipStreamCreate(&stream[i]));
+  }
+
+  // Upload the inputs of every stream before launching any kernel.
+  for (int i=0; i < noOfstreams; ++i) {
+    HIPCHECK(hipMemcpyWithStream(A_d[i], A_h[i], Nbytes,
+             hipMemcpyHostToDevice, stream[i]));
+    HIPCHECK(hipMemcpyWithStream(B_d[i], B_h[i], Nbytes,
+             hipMemcpyHostToDevice, stream[i]));
+  }
+
+  for (int i=0; i < noOfstreams; ++i) {
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
+                       0, stream[i], static_cast<const int*>(A_d[i]),
+                       static_cast<const int*>(B_d[i]), C_d[i], N);
+  }
+
+  // Wait for each stream, read its result back and validate it.
+  for (int i=0; i < noOfstreams; ++i) {
+    HIPCHECK(hipStreamSynchronize(stream[i]));
+    HIPCHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
+    HipTest::checkVectorADD(A_h[i], B_h[i], C_h[i], N);
+  }
+
+  for (int i=0; i < noOfstreams; ++i) {
+    HipTest::freeArrays(A_d[i], B_d[i], C_d[i], A_h[i], B_h[i], C_h[i], false);
+    HIPCHECK(hipStreamDestroy(stream[i]));
+  }
+}
+
+// Same-device device-to-device scenario: inputs are uploaded into buffer set
+// 0 on stream 0, copied into buffer set 1 with hipMemcpyDeviceToDevice on
+// stream 1, and vectorADD is then run and validated for both sets.
+void HipMemcpyWithStreamMultiThreadtests::TestDtoDonSameDevice(void) {
+  const size_t Nbytes = N * sizeof(int);
+  // Compile-time constant so the arrays below are ordinary fixed-size arrays
+  // rather than variable-length arrays (a compiler extension in C++).
+  constexpr int noOfstreams = 2;
+  int *A_d[noOfstreams], *B_d[noOfstreams], *C_d[noOfstreams];
+  // Only slot 0 of A_h/B_h is populated; both results are validated against
+  // those host inputs.
+  int *A_h[noOfstreams], *B_h[noOfstreams], *C_h[noOfstreams];
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+
+  HipTest::initArrays(&A_d[0], &B_d[0], &C_d[0],
+                      &A_h[0], &B_h[0], &C_h[0], N, false);
+
+
+  hipStream_t stream[noOfstreams];
+  for (int i=0; i < noOfstreams; ++i) {
+    HIPCHECK(hipSetDevice(0));
+    HIPCHECK(hipStreamCreate(&stream[i]));
+  }
+
+  // Second buffer set, allocated on the same device as the first.
+  HIPCHECK(hipSetDevice(0));
+  HIPCHECK(hipMalloc(&A_d[1], Nbytes));
+  HIPCHECK(hipMalloc(&B_d[1], Nbytes));
+  HIPCHECK(hipMalloc(&C_d[1], Nbytes));
+  C_h[1] = reinterpret_cast<int*>(malloc(Nbytes));
+  HIPASSERT(C_h[1] != NULL);
+
+  HIPCHECK(hipMemcpyWithStream(A_d[0], A_h[0], Nbytes,
+                               hipMemcpyHostToDevice, stream[0]));
+  HIPCHECK(hipMemcpyWithStream(B_d[0], B_h[0], Nbytes,
+                               hipMemcpyHostToDevice, stream[0]));
+
+  // Device-to-device copies from set 0 into set 1, issued on stream 1.
+  HIPCHECK(hipMemcpyWithStream(A_d[1], A_d[0], Nbytes,
+                               hipMemcpyDeviceToDevice, stream[1]));
+  HIPCHECK(hipMemcpyWithStream(B_d[1], B_d[0], Nbytes,
+                               hipMemcpyDeviceToDevice, stream[1]));
+
+
+  for (int i=0; i < noOfstreams; ++i) {
+    HIPCHECK(hipSetDevice(0));
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
+                       0, stream[i], static_cast<const int*>(A_d[i]),
+                       static_cast<const int*>(B_d[i]), C_d[i], N);
+  }
+
+  for (int i=0; i < noOfstreams; ++i) {
+    HIPCHECK(hipSetDevice(0));
+    HIPCHECK(hipStreamSynchronize(stream[i]));
+    HIPCHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
+    HipTest::checkVectorADD(A_h[0], B_h[0], C_h[i], N);
+  }
+
+
+  HipTest::freeArrays(A_d[0], B_d[0], C_d[0], A_h[0], B_h[0], C_h[0], false);
+
+  // Set 1 was allocated by hand above, so it is released by hand as well.
+  if (A_d[1]) {
+    HIPCHECK(hipFree(A_d[1]));
+  }
+  if (B_d[1]) {
+    HIPCHECK(hipFree(B_d[1]));
+  }
+  if (C_d[1]) {
+    HIPCHECK(hipFree(C_d[1]));
+  }
+  if (C_h[1]) {
+    free(C_h[1]);
+  }
+
+
+  for (int i=0; i < noOfstreams; ++i) {
+    HIPCHECK(hipStreamDestroy(stream[i]));
+  }
+}
+
+// Multi-GPU scenario with one stream per device: every device gets its own
+// buffers, uploads its inputs with hipMemcpyWithStream, runs vectorADD on its
+// stream and has its result validated. Returns without doing anything when
+// fewer than two devices are present.
+void HipMemcpyWithStreamMultiThreadtests::TestOnMultiGPUwithOneStream(void) {
+  const size_t Nbytes = N * sizeof(int);
+  int numDevices = 0;
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+  HIPCHECK(hipGetDeviceCount(&numDevices));
+  // On a single-GPU machine there is nothing to test, so return.
+  if (numDevices <= 1) {
+    return;
+  }
+  // Sized at run time, so use std::vector instead of variable-length arrays
+  // (which are a compiler extension in C++).
+  std::vector<int*> A_d(numDevices), B_d(numDevices), C_d(numDevices);
+  std::vector<int*> A_h(numDevices), B_h(numDevices), C_h(numDevices);
+
+  std::vector<hipStream_t> stream(numDevices);
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipStreamCreate(&stream[i]));
+  }
+
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HipTest::initArrays(&A_d[i], &B_d[i], &C_d[i],
+                        &A_h[i], &B_h[i], &C_h[i], N, false);
+  }
+
+
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipMemcpyWithStream(A_d[i], A_h[i], Nbytes,
+             hipMemcpyHostToDevice, stream[i]));
+    HIPCHECK(hipMemcpyWithStream(B_d[i], B_h[i], Nbytes,
+             hipMemcpyHostToDevice, stream[i]));
+  }
+
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
+                       0, stream[i], static_cast<const int*>(A_d[i]),
+                       static_cast<const int*>(B_d[i]), C_d[i], N);
+  }
+
+  // Wait for each device's stream, read its result back and validate it.
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipStreamSynchronize(stream[i]));
+    HIPCHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
+    HipTest::checkVectorADD(A_h[i], B_h[i], C_h[i], N);
+  }
+
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HipTest::freeArrays(A_d[i], B_d[i], C_d[i], A_h[i], B_h[i], C_h[i], false);
+    HIPCHECK(hipStreamDestroy(stream[i]));
+  }
+}
+
+// Device-to-host scenario: after the usual upload and vectorADD launch, the
+// result is read back with hipMemcpyWithStream(hipMemcpyDeviceToHost)
+// instead of a plain hipMemcpy, then validated on the host.
+void HipMemcpyWithStreamMultiThreadtests::TestkindDtoH(void) {
+  const size_t Nbytes = N * sizeof(int);
+  int numDevices = 0;
+  int *A_d = nullptr, *B_d = nullptr, *C_d = nullptr;
+  int *A_h = nullptr, *B_h = nullptr, *C_h = nullptr;
+
+  const unsigned blocks =
+      HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
+
+  hipStream_t stream;
+  HIPCHECK(hipStreamCreate(&stream));
+
+  // Inputs go up on the test stream.
+  HIPCHECK(hipMemcpyWithStream(A_d, A_h, Nbytes,
+                               hipMemcpyHostToDevice, stream));
+  HIPCHECK(hipMemcpyWithStream(B_d, B_h, Nbytes,
+                               hipMemcpyHostToDevice, stream));
+
+  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
+                     0, stream, static_cast<const int*>(A_d),
+                     static_cast<const int*>(B_d), C_d, N);
+  HIPCHECK(hipStreamSynchronize(stream));
+
+  // The copy under test: device result back to the host on the same stream.
+  HIPCHECK(hipMemcpyWithStream(C_h, C_d, Nbytes,
+                               hipMemcpyDeviceToHost, stream));
+  HipTest::checkVectorADD(A_h, B_h, C_h, N);
+
+  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
+  HIPCHECK(hipStreamDestroy(stream));
+}
+
+
+// Multi-GPU device-to-device scenario: inputs are uploaded to device 0, then
+// copied from device 0 to every other device with hipMemcpyDeviceToDevice.
+// vectorADD runs on every device and each result is validated against the
+// host inputs of device 0. Returns without doing anything when fewer than
+// two devices are present.
+void HipMemcpyWithStreamMultiThreadtests::TestkindDtoD(void) {
+  const size_t Nbytes = N * sizeof(int);
+  int numDevices = 0;
+
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+  HIPCHECK(hipGetDeviceCount(&numDevices));
+  // On a single-GPU machine there is nothing to test, so return.
+  if (numDevices <= 1) {
+    return;
+  }
+
+  // Sized at run time, so use std::vector instead of variable-length arrays
+  // (which are a compiler extension in C++). Entries start out as null.
+  std::vector<int*> A_d(numDevices), B_d(numDevices), C_d(numDevices);
+  std::vector<int*> A_h(numDevices), B_h(numDevices), C_h(numDevices);
+
+  std::vector<hipStream_t> stream(numDevices);
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipStreamCreate(&stream[i]));
+  }
+
+  // Initialize and create the host and device elements for first device
+  HIPCHECK(hipSetDevice(0));
+  HipTest::initArrays(&A_d[0], &B_d[0], &C_d[0],
+                      &A_h[0], &B_h[0], &C_h[0], N, false);
+
+  // Remaining devices only need device buffers plus a host result buffer.
+  for (int i=1; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipMalloc(&A_d[i], Nbytes));
+    HIPCHECK(hipMalloc(&B_d[i], Nbytes));
+    HIPCHECK(hipMalloc(&C_d[i], Nbytes));
+    C_h[i] = reinterpret_cast<int*>(malloc(Nbytes));
+    HIPASSERT(C_h[i] != NULL);
+  }
+
+
+
+  HIPCHECK(hipSetDevice(0));
+  HIPCHECK(hipMemcpyWithStream(A_d[0], A_h[0], Nbytes,
+           hipMemcpyHostToDevice, stream[0]));
+  HIPCHECK(hipMemcpyWithStream(B_d[0], B_h[0], Nbytes,
+           hipMemcpyHostToDevice, stream[0]));
+
+  // Copy the device data from the first GPU (index 0) to each of the other
+  // GPUs (indices 1 .. numDevices-1).
+  for (int i=1; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipMemcpyWithStream(A_d[i], A_d[0], Nbytes,
+             hipMemcpyDeviceToDevice, stream[i]));
+    HIPCHECK(hipMemcpyWithStream(B_d[i], B_d[0], Nbytes,
+             hipMemcpyDeviceToDevice, stream[i]));
+  }
+
+
+  // Launch the kernel on every GPU, including the first one.
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
+                       0, stream[i], static_cast<const int*>(A_d[i]),
+                       static_cast<const int*>(B_d[i]), C_d[i], N);
+  }
+
+  // Every GPU's output is validated against the inputs of the first GPU.
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipStreamSynchronize(stream[i]));
+    HIPCHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
+    HipTest::checkVectorADD(A_h[0], B_h[0], C_h[i], N);
+  }
+
+  HipTest::freeArrays(A_d[0], B_d[0], C_d[0], A_h[0], B_h[0], C_h[0], false);
+  HIPCHECK(hipStreamDestroy(stream[0]));
+
+  for (int i=1; i < numDevices; ++i) {
+    if (A_d[i]) {
+      HIPCHECK(hipFree(A_d[i]));
+    }
+    if (B_d[i]) {
+      HIPCHECK(hipFree(B_d[i]));
+    }
+    if (C_d[i]) {
+      HIPCHECK(hipFree(C_d[i]));
+    }
+    if (C_h[i]) {
+      free(C_h[i]);
+    }
+    HIPCHECK(hipStreamDestroy(stream[i]));
+  }
+}
+
+// hipMemcpyDefault scenario: both uploads and the read-back go through
+// hipMemcpyWithStream with kind hipMemcpyDefault; the vectorADD result is
+// then validated on the host.
+void HipMemcpyWithStreamMultiThreadtests::TestkindDefault(void) {
+  const size_t Nbytes = N * sizeof(int);
+  int numDevices = 0;
+  int *A_d = nullptr, *B_d = nullptr, *C_d = nullptr;
+  int *A_h = nullptr, *B_h = nullptr, *C_h = nullptr;
+
+  const unsigned blocks =
+      HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
+
+  hipStream_t stream;
+  HIPCHECK(hipStreamCreate(&stream));
+
+  // Uploads: the copy direction is left to hipMemcpyDefault.
+  HIPCHECK(hipMemcpyWithStream(A_d, A_h, Nbytes, hipMemcpyDefault, stream));
+  HIPCHECK(hipMemcpyWithStream(B_d, B_h, Nbytes, hipMemcpyDefault, stream));
+
+  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
+                     0, stream, static_cast<const int*>(A_d),
+                     static_cast<const int*>(B_d), C_d, N);
+  HIPCHECK(hipStreamSynchronize(stream));
+
+  // Read-back, again with hipMemcpyDefault.
+  HIPCHECK(hipMemcpyWithStream(C_h, C_d, Nbytes, hipMemcpyDefault, stream));
+  HipTest::checkVectorADD(A_h, B_h, C_h, N);
+
+  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
+  HIPCHECK(hipStreamDestroy(stream));
+}
+
+// Multi-GPU device-to-device scenario using hipMemcpyDefault: inputs are
+// uploaded to device 0, copied from device 0 to every other device with kind
+// hipMemcpyDefault, and vectorADD is run and validated on every device.
+// Returns without doing anything when fewer than two devices are present.
+void HipMemcpyWithStreamMultiThreadtests::TestkindDefaultForDtoD(void) {
+  const size_t Nbytes = N * sizeof(int);
+  int numDevices = 0;
+
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+  HIPCHECK(hipGetDeviceCount(&numDevices));
+  // Test case will not run on single GPU setup.
+  if (numDevices <= 1) {
+    return;
+  }
+
+  // Sized at run time, so use std::vector instead of variable-length arrays
+  // (which are a compiler extension in C++). Entries start out as null.
+  std::vector<int*> A_d(numDevices), B_d(numDevices), C_d(numDevices);
+  std::vector<int*> A_h(numDevices), B_h(numDevices), C_h(numDevices);
+
+  // Initialize and create the host and device elements for first device
+  HIPCHECK(hipSetDevice(0));
+  HipTest::initArrays(&A_d[0], &B_d[0], &C_d[0],
+                      &A_h[0], &B_h[0], &C_h[0], N, false);
+
+  // Select device i before allocating so that buffer set i really lives on
+  // GPU i (mirrors TestkindDtoD); otherwise every buffer would be created on
+  // whichever device happened to be current.
+  for (int i=1; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipMalloc(&A_d[i], Nbytes));
+    HIPCHECK(hipMalloc(&B_d[i], Nbytes));
+    HIPCHECK(hipMalloc(&C_d[i], Nbytes));
+    C_h[i] = reinterpret_cast<int*>(malloc(Nbytes));
+    HIPASSERT(C_h[i] != NULL);
+  }
+
+  // Likewise, create stream i while device i is current.
+  std::vector<hipStream_t> stream(numDevices);
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipStreamCreate(&stream[i]));
+  }
+
+  HIPCHECK(hipSetDevice(0));
+  HIPCHECK(hipMemcpyWithStream(A_d[0], A_h[0], Nbytes,
+           hipMemcpyHostToDevice, stream[0]));
+  HIPCHECK(hipMemcpyWithStream(B_d[0], B_h[0], Nbytes,
+           hipMemcpyHostToDevice, stream[0]));
+
+  // Copy the device data from the first GPU (index 0) to each of the other
+  // GPUs (indices 1 .. numDevices-1) using the hipMemcpyDefault kind.
+  for (int i=1; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipMemcpyWithStream(A_d[i], A_d[0], Nbytes,
+             hipMemcpyDefault, stream[i]));
+    HIPCHECK(hipMemcpyWithStream(B_d[i], B_d[0], Nbytes,
+             hipMemcpyDefault, stream[i]));
+  }
+
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
+                       0, stream[i], static_cast<const int*>(A_d[i]),
+                       static_cast<const int*>(B_d[i]), C_d[i], N);
+  }
+
+  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
+    HIPCHECK(hipStreamSynchronize(stream[i]));
+    HIPCHECK(hipMemcpy(C_h[i], C_d[i], Nbytes, hipMemcpyDeviceToHost));
+    // Output of each GPU is getting validated with input of 1st GPU.
+    HipTest::checkVectorADD(A_h[0], B_h[0], C_h[i], N);
+  }
+
+  HipTest::freeArrays(A_d[0], B_d[0], C_d[0], A_h[0], B_h[0], C_h[0], false);
+  HIPCHECK(hipStreamDestroy(stream[0]));
+
+  for (int i=1; i < numDevices; ++i) {
+    if (A_d[i]) {
+      HIPCHECK(hipFree(A_d[i]));
+    }
+    if (B_d[i]) {
+      HIPCHECK(hipFree(B_d[i]));
+    }
+    if (C_d[i]) {
+      HIPCHECK(hipFree(C_d[i]));
+    }
+    if (C_h[i]) {
+      free(C_h[i]);
+    }
+    HIPCHECK(hipStreamDestroy(stream[i]));
+  }
+}
+
+// Host-to-host scenario: fill one host buffer, copy it into a second host
+// buffer with hipMemcpyWithStream(hipMemcpyHostToHost) and check that the
+// two buffers match element for element.
+void HipMemcpyWithStreamMultiThreadtests::TestkindHtoH(void) {
+  const size_t Nbytes = N * sizeof(int);
+  int numDevices = 0;
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+
+  // A_h is the source, B_h the destination; both are plain malloc buffers.
+  int* A_h = static_cast<int*>(malloc(Nbytes));
+  HIPASSERT(A_h != NULL);
+  int* B_h = static_cast<int*>(malloc(Nbytes));
+  HIPASSERT(B_h != NULL);
+
+  // Seed the source; the float sum is converted to int when stored.
+  if (A_h) {
+    for (size_t i = 0; i < N; ++i) {
+      A_h[i] = 3.146f + i;
+    }
+  }
+
+  hipStream_t stream;
+  HIPCHECK(hipStreamCreate(&stream));
+
+  HIPCHECK(hipMemcpyWithStream(B_h, A_h, Nbytes, hipMemcpyHostToHost, stream));
+  HIPCHECK(hipStreamSynchronize(stream));
+
+  // Destination must now be an exact copy of the source.
+  for (size_t i = 0; i < N; i++) {
+    HIPASSERT(A_h[i] == B_h[i]);
+  }
+
+  // free(NULL) is a no-op, so the buffers can be released unconditionally.
+  free(A_h);
+  free(B_h);
+  HIPCHECK(hipStreamDestroy(stream));
+}
+
+
+// Runs the scenario selected by `op` concurrently on
+// min(THREADS * hardware threads, MAX_THREADS) worker threads and returns
+// once all of them have finished.
+void HipMemcpyWithStreamMultiThreadtests::TestwithMultiThreaded(ops op) {
+  // hardware_concurrency() may return 0 when the value cannot be determined;
+  // fall back to 1 so the scenario is never skipped silently.
+  unsigned hwThreads = std::thread::hardware_concurrency();
+  if (hwThreads == 0) {
+    hwThreads = 1;
+  }
+  unsigned n = THREADS * hwThreads;
+  if (n > MAX_THREADS) {
+    n = MAX_THREADS;
+  }
+
+  std::vector<joinable_thread> threads;
+  threads.reserve(n);
+
+  for (unsigned i = 0; i < n; i++) {
+    // Capture by value so the worker does not refer to this frame's locals.
+    threads.emplace_back([this, op] {
+      switch ( op ) {
+        case ops::TestwithOnestream:
+          TestwithOnestream();
+          break;
+        case ops::TestwithTwoStream:
+          TestwithTwoStream();
+          break;
+        case ops::TestkindDtoH:
+          TestkindDtoH();
+          break;
+        case ops::TestkindHtoH:
+          TestkindHtoH();
+          break;
+        case ops::TestkindDtoD:
+          TestkindDtoD();
+          break;
+        case ops::TestOnMultiGPUwithOneStream:
+          TestOnMultiGPUwithOneStream();
+          break;
+        case ops::TestkindDefault:
+          TestkindDefault();
+          break;
+        case ops::TestkindDefaultForDtoD:
+          TestkindDefaultForDtoD();
+          break;
+        case ops::TestDtoDonSameDevice:
+          TestDtoDonSameDevice();
+          break;
+        default:{}
+      }
+    });
+  }
+  // joinable_thread joins in its destructor, so destroying `threads` at the
+  // end of this scope waits for every worker.
+}
+
+
+int main() {  // Runs every ops entry through the multi-threaded driver and prints one PASSED line per entry.
+  HipMemcpyWithStreamMultiThreadtests tests;
+  for (int op = static_cast<int>(ops::TestwithOnestream);
+           op < static_cast<int>(ops::END_OF_LIST); ++op) {  // END_OF_LIST is the exclusive upper bound
+    tests.TestwithMultiThreaded(static_cast<ops>(op));  // returns only after its worker threads have been joined
+    switch ( static_cast<ops>(op) ) {  // report the scenario that just ran; test_passed stringifies its argument
+      case ops::TestwithOnestream:
+        test_passed(HipMemcpyWithStreamMultiThreadtests
+                    ::TestwithOnestream);
+        break;
+      case ops::TestwithTwoStream:
+        test_passed(HipMemcpyWithStreamMultiThreadtests
+                    ::TestwithTwoStream);
+        break;
+      case ops::TestkindDtoH:
+        test_passed(HipMemcpyWithStreamMultiThreadtests
+                    ::TestkindDtoH);
+        break;
+      case ops::TestkindHtoH:
+        test_passed(HipMemcpyWithStreamMultiThreadtests
+                    ::TestkindHtoH);
+        break;
+      case ops::TestkindDtoD:
+        test_passed(HipMemcpyWithStreamMultiThreadtests
+                    ::TestkindDtoD);
+        break;
+      case ops::TestOnMultiGPUwithOneStream:
+        test_passed(HipMemcpyWithStreamMultiThreadtests
+                    ::TestOnMultiGPUwithOneStream);
+        break;
+      case ops::TestkindDefault:
+        test_passed(HipMemcpyWithStreamMultiThreadtests
+                    ::TestkindDefault);
+        break;
+      case ops::TestkindDefaultForDtoD:
+        test_passed(HipMemcpyWithStreamMultiThreadtests
+                    ::TestkindDefaultForDtoD);
+        break;
+      case ops::TestDtoDonSameDevice:
+        test_passed(HipMemcpyWithStreamMultiThreadtests
+                    ::TestDtoDonSameDevice);
+        break;
+      default: { test_failed("No Operation to done with API"); }  // every enumerator below END_OF_LIST has a case above
+    }
+  }
+}
@@ -1,119 +1,234 @@
 /*
-Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */

-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-// Simple test for memset.
-// Also serves as a template for other tests.
+// Test for hipMemset2D functionality for different width and height values

 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
- * TEST: %t
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
+ * TEST_NAMED: %t hipMemset2D-basic
+ * TEST_NAMED: %t hipMemset2D-dim1 --width2D 10 --height2D 10 --memsetWidth 4 --memsetHeight 4
+ * TEST_NAMED: %t hipMemset2D-dim2 --width2D 100 --height2D 100 --memsetWidth 20 --memsetHeight 40
+ * TEST_NAMED: %t hipMemset2D-dim3 --width2D 256 --height2D 256 --memsetWidth 39 --memsetHeight 19
+ * TEST_NAMED: %t hipMemset2D-zeroH --width2D 100 --height2D 100 --memsetWidth 20 --memsetHeight 0
+ * TEST_NAMED: %t hipMemset2D-zeroW --width2D 100 --height2D 100 --memsetWidth 0 --memsetHeight 20
+ * TEST_NAMED: %t hipMemset2D-zeroW*H --width2D 100 --height2D 100 --memsetWidth 0 --memsetHeight 0
 * HIT_END
 */

-#include "hip/hip_runtime.h"
 #include "test_common.h"

-bool testhipMemset2D(int memsetval,int p_gpuDevice)
-{
-    size_t numH = 256;
-    size_t numW = 256;
-    size_t pitch_A;
-    size_t width = numW * sizeof(char);
-    size_t sizeElements = width * numH;
-    size_t elements = numW* numH;
+// Check hipMemset2D: memset a full 256x256 pitched allocation, copy it back and compare every byte with memsetval.
+bool testhipMemset2D(int memsetval, int p_gpuDevice) {
+  bool testResult = true;
+  size_t numH = 256;
+  size_t numW = 256;
+  size_t pitch_A;
+  size_t width = numW * sizeof(char);  // row width in bytes
+  size_t sizeElements = width * numH;  // size of the host buffer in bytes
+  size_t elements = numW* numH;
+  printf("testhipMemset2D memsetval=%2x device=%d\n", memsetval, p_gpuDevice);
+  char *A_d;
+  char *A_h;

+  HIPCHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d), &pitch_A, width ,
+                          numH));
+  A_h = reinterpret_cast<char*>(malloc(sizeElements));
+  HIPASSERT(A_h != NULL);

-    printf ("testhipMemset2D memsetval=%2x device=%d\n", memsetval, p_gpuDevice);
-    char *A_d;
-    char *A_h;
-    bool testResult = true;
-    HIPCHECK (hipMallocPitch((void**)&A_d, &pitch_A, width , numH));
-    A_h = (char*)malloc(sizeElements);
-    HIPASSERT(A_h != NULL);
-    for (size_t i=0; i<elements; i++) {
-        A_h[i] = 1;
+  for (size_t i=0; i < elements; i++) {  // pre-fill the host buffer with 1 before the copy-back
+    A_h[i] = 1;
+  }
+
+  HIPCHECK(hipMemset2D(A_d, pitch_A, memsetval, numW, numH));
+  HIPCHECK(hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH,
+                       hipMemcpyDeviceToHost));
+
+  for (int i=0; i < elements; i++) {  // stops at the first mismatching byte
+    if (A_h[i] != memsetval) {
+      testResult = false;
+      printf("testhipMemset2D mismatch at index:%d computed:%02x, memsetval:"
+             "%02x\n", i, static_cast<int>(A_h[i]), static_cast<int>(memsetval));
+      break;
     }
-    HIPCHECK ( hipMemset2D(A_d, pitch_A, memsetval, numW, numH) );
-    HIPCHECK ( hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH, hipMemcpyDeviceToHost));
+  }

-    for (int i=0; i<elements; i++) {
-        if (A_h[i] != memsetval) {
-            testResult = false;
-            printf("testhipMemset2D mismatch at index:%d computed:%02x, memsetval:%02x\n", i, (int)A_h[i], (int)memsetval);
-            break;
-        }
-    }
-    hipFree(A_d);
-    free(A_h);
-    return testResult;
+  hipFree(A_d);  // return value is not checked
+  free(A_h);
+  return testResult;
 }

-bool testhipMemset2DAsync(int memsetval,int p_gpuDevice)
-{
-    size_t numH = 256;
-    size_t numW = 256;
-    size_t pitch_A;
-    size_t width = numW * sizeof(char);
-    size_t sizeElements = width * numH;
-    size_t elements = numW* numH;
+// Check hipMemset2DAsync: memset a full 256x256 pitched allocation on a stream, synchronize, copy back and compare with memsetval.
+bool testhipMemset2DAsync(int memsetval, int p_gpuDevice) {
+  size_t numH = 256;
+  size_t numW = 256;
+  size_t pitch_A;
+  size_t width = numW * sizeof(char);  // row width in bytes
+  size_t sizeElements = width * numH;  // size of the host buffer in bytes
+  size_t elements = numW * numH;
+  printf("testhipMemset2DAsync memsetval=%2x device=%d\n", memsetval,
+          p_gpuDevice);
+  char *A_d;
+  char *A_h;
+  bool testResult = true;

+  HIPCHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d), &pitch_A,
+                          width , numH));
+  A_h = reinterpret_cast<char*>(malloc(sizeElements));
+  HIPASSERT(A_h != NULL);

-    printf ("testhipMemset2DAsync memsetval=%2x device=%d\n", memsetval, p_gpuDevice);
-    char *A_d;
-    char *A_h;
-    bool testResult = true;
+  for (size_t i=0; i < elements; i++) {  // pre-fill the host buffer with 1 before the copy-back
+      A_h[i] = 1;
+  }

-    HIPCHECK (hipMallocPitch((void**)&A_d, &pitch_A, width , numH));
-    A_h = (char*)malloc(sizeElements);
-    HIPASSERT(A_h != NULL);
-    for (size_t i=0; i<elements; i++) {
-        A_h[i] = 1;
+  hipStream_t stream;
+  HIPCHECK(hipStreamCreate(&stream));
+  HIPCHECK(hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, stream));
+  HIPCHECK(hipStreamSynchronize(stream));  // wait for the memset before copying back
+  HIPCHECK(hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH,
+                       hipMemcpyDeviceToHost));
+
+  for (int i=0; i < elements; i++) {  // stops at the first mismatching byte
+    if (A_h[i] != memsetval) {
+      testResult = false;
+      printf("testhipMemset2DAsync mismatch at index:%d computed:%02x, memsetval:"
+             "%02x\n", i, static_cast<int>(A_h[i]), static_cast<int>(memsetval));
+      break;
     }
-    hipStream_t stream;
-    HIPCHECK(hipStreamCreate(&stream));
-    HIPCHECK(hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, stream) );
-    HIPCHECK(hipStreamSynchronize(stream));
-    HIPCHECK(hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH, hipMemcpyDeviceToHost));
+  }

-    for (int i=0; i<elements; i++) {
-        if (A_h[i] != memsetval) {
-            testResult = false;
-            printf("testhipMemset2DAsync mismatch at index:%d computed:%02x, memsetval:%02x\n", i, (int)A_h[i], (int)memsetval);
-            break;
-        }
-    }
-    hipFree(A_d);
-    HIPCHECK(hipStreamDestroy(stream));
-    free(A_h);
-    return testResult;
+  hipFree(A_d);  // return value is not checked
+  HIPCHECK(hipStreamDestroy(stream));
+  free(A_h);
+  return testResult;
 }

-int main(int argc, char *argv[])
-{
-    HipTest::parseStandardArguments(argc, argv, true);
-    HIPCHECK(hipSetDevice(p_gpuDevice));
-    bool testResult = true;
+int width2D = 20;  // --width2D: allocation width used by testMemset2DPartial
+int height2D = 20;  // --height2D: allocation height used by testMemset2DPartial
+int memsetWidth = 20;  // --memsetWidth: width passed to hipMemset2D
+int memsetHeight = 20;  // --memsetHeight: height passed to hipMemset2D
+
+// Parses the test-specific options --width2D, --height2D, --memsetWidth and
+// --memsetHeight (each followed by an integer value) into the matching
+// globals. Any other argument, a missing value or an unparsable value is
+// reported through failed(). Returns the index reached when scanning stops.
+int parseExtraArguments(int argc, char* argv[]) {
+  int i = 1;
+  while (i < argc) {
+    const char* arg = argv[i];
+    if (strcmp(arg, " ") == 0) {
+      // skip NULL args.
+    } else if (strcmp(arg, "--width2D") == 0) {
+      ++i;  // step onto the option's value
+      if (i >= argc || !HipTest::parseInt(argv[i], &width2D)) {
+        failed("Bad width2D argument");
+      }
+    } else if (strcmp(arg, "--height2D") == 0) {
+      ++i;
+      if (i >= argc || !HipTest::parseInt(argv[i], &height2D)) {
+        failed("Bad height2D argument");
+      }
+    } else if (strcmp(arg, "--memsetWidth") == 0) {
+      ++i;
+      if (i >= argc || !HipTest::parseInt(argv[i], &memsetWidth)) {
+        failed("Bad memsetWidth argument");
+      }
+    } else if (strcmp(arg, "--memsetHeight") == 0) {
+      ++i;
+      if (i >= argc || !HipTest::parseInt(argv[i], &memsetHeight)) {
+        failed("Bad memsetHeight argument");
+      }
+    } else {
+      failed("Bad argument");
+    }
+    ++i;
+  }
+  return i;
+}
+
+// Memsets only a memsetWidth x memsetHeight sub-rectangle of a
+// width2D x height2D pitched allocation, copies the whole allocation back to
+// the host and checks that every byte inside the sub-rectangle equals
+// memsetval. Bytes outside the sub-rectangle are not inspected.
+// Returns true when all checked bytes match.
+bool testMemset2DPartial(int memsetval, int p_gpuDevice) {
+  bool testResult = true;
+  const size_t NUM_H = height2D;
+  const size_t NUM_W = width2D;
+  size_t pitch_A;
+  const size_t width = NUM_W * sizeof(char);  // host row width in bytes
+  const size_t sizeElements = width * NUM_H;
+  char *A_d;
+  char *A_h;
+  printf("testhipMemset2DPartial memsetval=%2x device=%d\n", memsetval,
+          p_gpuDevice);
+
+  HIPCHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d), &pitch_A,
+                          width, NUM_H));
+
+  A_h = reinterpret_cast<char*>(malloc(sizeElements));
+  HIPASSERT(A_h != NULL);
+
+  // Pre-fill the whole host buffer (previously only element 0 was written
+  // on every iteration).
+  for (size_t index = 0; index < sizeElements; index++) {
+    A_h[index] = 'c';
+  }
+
+  printf("2D Dimension: %zuX%zu, MemsetWidth:%d, memsetHeight:%d\n",
+         NUM_W, NUM_H, memsetWidth, memsetHeight);
+  HIPCHECK(hipMemset2D(A_d, pitch_A, memsetval, memsetWidth, memsetHeight));
+
+  HIPCHECK(hipMemcpy2D(A_h, width, A_d, pitch_A, NUM_W, NUM_H,
+                       hipMemcpyDeviceToHost));
+
+  // Host rows are tightly packed, so element (row, column) sits at
+  // row * width + column.
+  for (int row = 0; row < memsetHeight; row++) {
+    for (int column = 0; column < memsetWidth; column++) {
+      if (A_h[(row * width) + column] != memsetval) {
+        printf("A_h[%d][%d] did not match %d\n", row, column, memsetval);
+        testResult = false;
+      }
+    }
+  }
+  HIPCHECK(hipFree(A_d));
+  free(A_h);
+  return testResult;
+}
+
+// Entry point. With no test-specific arguments it runs the basic
+// hipMemset2D and hipMemset2DAsync checks; with the four
+// --width2D/--height2D/--memsetWidth/--memsetHeight options it runs the
+// partial-memset check instead.
+int main(int argc, char *argv[]) {
+  int extraArgs = 0;
+  bool testResult = true;
+
+  // Parse before selecting the device, as the previous version of this test
+  // did. NOTE(review): p_gpuDevice is presumably filled in by
+  // parseStandardArguments, so selecting the device first would ignore it.
+  extraArgs = HipTest::parseStandardArguments(argc, argv, false);
+  HIPCHECK(hipSetDevice(p_gpuDevice));
+  parseExtraArguments(extraArgs, argv);
+
+  if (extraArgs == 1) {
     testResult &= testhipMemset2D(memsetval, p_gpuDevice);
-    testResult &= testhipMemset2DAsync(memsetval, p_gpuDevice);
-    if(testResult){
-       passed();
+    if (!(testResult)) {
+      printf("hipMemset2D failed\n");
     }
+    // Checked on its own result so this message is printed only when the
+    // async test itself fails.
+    if (!testhipMemset2DAsync(memsetval, p_gpuDevice)) {
+      printf("hipMemset2DAsync failed\n");
+      testResult = false;
+    }
+  } else if (extraArgs == 9) {
+    // Presumably the program name plus four option/value pairs.
+    testResult &= testMemset2DPartial(memsetval, p_gpuDevice);
+    if (!(testResult)) {
+      printf("hipMemset2D at random dimensions failed\n");
+    }
+  } else {
+    failed("Wrong Arguments for test\n");
+  }
+
+  if (testResult) {
+    passed();
+  } else {
+    failed("one or more hipMemset2D tests failed");
+  }
 }
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+*/
+
+//
+// Test to verify
+// a) Order of execution of device kernel and hipMemset2DAsync api
+// b) hipMemSet2DAsync execution in multiple threads
+//
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+// Number of std::thread workers started per iteration of the multi-thread test.
+#define NUM_THREADS 1000
+// Number of rounds each test repeats its launch/validate cycle.
+#define ITER 100
+// Height (rows) and width (bytes per row) of the 2D buffers.
+#define NUM_H 256
+#define NUM_W 256
+
+// Grid size used when launching vector_square.
+unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+// Stream shared by every asynchronous call in this file.
+hipStream_t stream;
+
+bool testResult = true;
+// Device buffers carry the _d suffix, host buffers the _h suffix.
+char *A_d;
+char *A_h;
+char *B_d;
+char *B_h;
+char *C_d;
+// Count of host elements found equal to memsetval across all iterations.
+int validateCount;
+
+// Row pitches reported by hipMallocPitch for A_d, B_d and C_d.
+size_t pitch_A;
+size_t pitch_B;
+size_t pitch_C;
+// Bytes per row of the tightly packed host buffers.
+size_t width = NUM_W * sizeof(char);
+// Total bytes of one host buffer.
+size_t sizeElements = width * NUM_H;
+// Total number of char elements in one buffer.
+size_t elements = NUM_W * NUM_H;
+
+/*
+ * Square each element in the array B and write to array C.
+ */
+
+__global__ void
+vector_square(char* B_d, char* C_d, size_t elements) {
+  // No thread index is consulted: every launched thread walks the whole
+  // range and writes the same squared values.
+  int idx = 0;
+  while (idx < elements) {
+    C_d[idx] = B_d[idx] * B_d[idx];
+    ++idx;
+  }
+}
+
+// Allocates the pitched device buffers (A_d, B_d, C_d) and the host buffers
+// (A_h, B_h), fills B_h with the element index narrowed to char, uploads it
+// to B_d and creates the stream shared by the tests in this file.
+void memAllocate() {
+  HIPCHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d), &pitch_A, width, NUM_H));
+  HIPCHECK(hipMallocPitch(reinterpret_cast<void**>(&B_d), &pitch_B, width, NUM_H));
+  A_h = reinterpret_cast<char*>(malloc(sizeElements));
+  HIPASSERT(A_h != NULL);
+  B_h = reinterpret_cast<char*>(malloc(sizeElements));
+  HIPASSERT(B_h != NULL);
+  HIPCHECK(hipMallocPitch(reinterpret_cast<void**>(&C_d), &pitch_C, width, NUM_H));
+
+  for (int i = 0 ; i < elements ; i++) {
+    B_h[i] = i;
+  }
+  // hipMemcpy2D takes (dst, dpitch, src, spitch, ...): the device destination
+  // is addressed with its allocation pitch, while the packed host source is
+  // addressed with width. Using pitch_B for the host side could read past
+  // the end of B_h whenever pitch_B is larger than width.
+  HIPCHECK(hipMemcpy2D(B_d, pitch_B, B_h, width, NUM_W, NUM_H,
+                       hipMemcpyHostToDevice));
+  HIPCHECK(hipStreamCreate(&stream));
+}
+
+// Releases everything memAllocate() created: the three device buffers, the
+// two host buffers and finally the shared stream.
+void memDeallocate() {
+  HIPCHECK(hipFree(A_d));
+  HIPCHECK(hipFree(B_d));
+  HIPCHECK(hipFree(C_d));
+
+  free(A_h);
+  free(B_h);
+
+  HIPCHECK(hipStreamDestroy(stream));
+}
+
+// Thread body for the multi-thread test: queues a 2D memset of the device
+// buffer followed by a device-to-host 2D copy, both on the file-scope stream.
+//   A_d   - device buffer to fill with memsetval
+//   A_h   - host buffer that receives the copy
+//   pitch - row pitch of A_d
+//   width - row pitch used for the host buffer
+// The parameters shadow the file-scope variables of the same names.
+void queueJobsForhipMemset2DAsync(char* A_d, char* A_h, size_t pitch,
+                                  size_t width) {
+  HIPCHECK(hipMemset2DAsync(A_d, pitch, memsetval, NUM_W, NUM_H, stream));
+  HIPCHECK(hipMemcpy2DAsync(A_h, width, A_d, pitch, NUM_W, NUM_H,
+                            hipMemcpyDeviceToHost, stream));
+}
+
+// Queues vector_square and then hipMemset2DAsync on the same stream ITER
+// times. After each round C_d is copied back and every element equal to
+// memsetval is counted; the test passes only if all ITER * elements values
+// matched, i.e. the memset result was what remained in C_d each time.
+bool testhipMemset2DAsyncWithKernel() {
+  validateCount = 0;
+  memAllocate();
+  printf("info: Launching vector_square kernel and hipMemset2DAsync "
+         "simultaneously\n");
+
+  int round = 0;
+  while (round < ITER) {
+    hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0,
+                       stream, B_d, C_d, elements);
+
+    HIPCHECK(hipMemset2DAsync(C_d, pitch_C, memsetval, NUM_W, NUM_H, stream));
+    HIPCHECK(hipStreamSynchronize(stream));
+    HIPCHECK(hipMemcpy2D(A_h, width, C_d, pitch_C, NUM_W, NUM_H,
+                         hipMemcpyDeviceToHost));
+
+    // Tally the elements that hold the memset value.
+    for (int p = 0; p < elements; ++p) {
+      validateCount += (A_h[p] == memsetval) ? 1 : 0;
+    }
+    ++round;
+  }
+
+  testResult = (validateCount == (ITER * elements));
+  memDeallocate();
+  return testResult;
+}
+
+// Starts NUM_THREADS threads per round, each queueing a memset + copy-back
+// on the shared stream. Odd-indexed threads copy into A_h, even-indexed ones
+// into B_h. After joining and synchronizing, an element counts as valid only
+// when both host buffers hold memsetval at that position.
+bool testhipMemset2DAsyncMultiThread() {
+  validateCount = 0;
+  std::thread t[NUM_THREADS];
+
+  memAllocate();
+
+  printf("info: Queueing up hipMemset2DAsync jobs over multiple threads\n");
+  for (int i = 0; i < ITER; ++i) {
+    for (int k = 0; k < NUM_THREADS; ++k) {
+      char* hostDst = (k % 2) ? A_h : B_h;
+      t[k] = std::thread(queueJobsForhipMemset2DAsync, A_d, hostDst, pitch_A,
+                         width);
+    }
+    for (auto& worker : t) {
+      worker.join();
+    }
+
+    HIPCHECK(hipStreamSynchronize(stream));
+    for (int k = 0; k < elements; ++k) {
+      validateCount +=
+          ((A_h[k] == memsetval) && (B_h[k] == memsetval)) ? 1 : 0;
+    }
+  }
+  memDeallocate();
+  testResult = (validateCount == (ITER * elements));
+  return testResult;
+}
+
+// Runs both hipMemset2DAsync scenarios and reports one overall verdict.
+// Each scenario keeps its own result so that its diagnostic reflects only
+// that scenario, and passed() is invoked exactly once on the success path.
+int main() {
+  const bool kernelOrderOk = testhipMemset2DAsyncWithKernel();
+  if (kernelOrderOk) {
+    printf("Kernel and hipMemset2DAsync executed in correct order!\n");
+  } else {
+    printf("Kernel and hipMemset2DAsync order of execution failed\n");
+  }
+
+  const bool multiThreadOk = testhipMemset2DAsyncMultiThread();
+  if (multiThreadOk) {
+    printf("hipMemset2DAsync jobs on all threads finished successfully!\n");
+  } else {
+    printf("hipMemset2DAsync failed in multi thread scenario\n");
+  }
+
+  if (kernelOrderOk && multiThreadOk) {
+    passed();
+  } else {
+    failed("One or more tests failed\n");
+  }
+}
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+*/
+
+/*
+ * Test for checking order of execution of device kernel and
+ * hipMemsetAsync apis on all gpus
+ */
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+// Number of kernel + memset rounds each test performs.
+#define ITER 10
+// Element count of every buffer. Parenthesized so the expansion stays a
+// single operand wherever it is used (e.g. in a division or a cast).
+#define N (1024 * 1024)
+
+// Grid size used when launching vector_square.
+unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+
+// Writes the square of each of the first M elements of B_d into C_d.
+// No thread index is consulted, so every launched thread performs the same
+// M writes.
+template <typename T>
+__global__ void vector_square(T* B_d, T* C_d, size_t M) {
+  int idx = 0;
+  while (idx < M) {
+    C_d[idx] = B_d[idx] * B_d[idx];
+    ++idx;
+  }
+}
+
+// Fixture shared by the kernel-ordering tests: owns the host/device buffers
+// and the stream, and accumulates how many copied-back elements matched the
+// memset value over all iterations.
+template <typename T>
+class MemSetTest {
+ public:
+  T *A_h;  // host buffer receiving the copy of C_d
+  T *B_d;  // device input of vector_square
+  T *B_h;  // host staging buffer used to initialize B_d
+  T *C_d;  // device output of vector_square and target of the memset
+  T memSetVal;
+  size_t Nbytes;
+  bool testResult = true;
+  int validateCount = 0;
+  hipStream_t stream;
+
+  // Records the memset value, allocates all buffers, uploads B_h (filled
+  // with the index converted to T) to B_d and creates the stream.
+  void memAllocate(T memSetValue) {
+    memSetVal = memSetValue;
+    Nbytes = N * sizeof(T);
+
+    A_h = reinterpret_cast<T*>(malloc(Nbytes));
+    HIPASSERT(A_h != NULL);
+    HIPCHECK(hipMalloc(&B_d , Nbytes));
+    B_h = reinterpret_cast<T*>(malloc(Nbytes));
+    HIPASSERT(B_h != NULL);
+    HIPCHECK(hipMalloc(&C_d , Nbytes));
+
+    int idx = 0;
+    while (idx < N) {
+      B_h[idx] = idx;
+      ++idx;
+    }
+    HIPCHECK(hipMemcpy(B_d , B_h , Nbytes , hipMemcpyHostToDevice));
+    HIPCHECK(hipStreamCreate(&stream));
+  }
+
+  // Frees the device buffers, the host buffers and then the stream.
+  void memDeallocate() {
+    HIPCHECK(hipFree(B_d));
+    HIPCHECK(hipFree(C_d));
+    free(B_h);
+    free(A_h);
+    HIPCHECK(hipStreamDestroy(stream));
+  }
+
+  // Adds to validateCount the number of elements of A_h equal to memSetVal.
+  void validateExecutionOrder() {
+    for (int p = 0; p < N; ++p) {
+      validateCount += (A_h[p] == memSetVal) ? 1 : 0;
+    }
+  }
+
+  // Succeeds only if every element matched in every iteration; releases all
+  // resources before returning.
+  bool resultAfterAllIterations() {
+    testResult = (validateCount == (ITER * N));
+    memDeallocate();
+    return testResult;
+  }
+};
+
+// Queues vector_square followed by hipMemsetAsync on the same stream ITER
+// times and checks, after each round, that the memset value is what is read
+// back from C_d.
+bool testhipMemsetAsyncWithKernel() {
+  MemSetTest <char> obj;
+  obj.memAllocate(memsetval);
+
+  int round = 0;
+  while (round < ITER) {
+    hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0,
+                       obj.stream, obj.B_d, obj.C_d, N);
+    HIPCHECK(hipMemsetAsync(obj.C_d , obj.memSetVal , N , obj.stream));
+    HIPCHECK(hipStreamSynchronize(obj.stream));
+    HIPCHECK(hipMemcpy(obj.A_h , obj.C_d , obj.Nbytes , hipMemcpyDeviceToHost));
+
+    obj.validateExecutionOrder();
+    ++round;
+  }
+  return obj.resultAfterAllIterations();
+}
+
+// Same scenario as the hipMemsetAsync test, using int32_t buffers and
+// hipMemsetD32Async with memsetD32val.
+bool testhipMemsetD32AsyncWithKernel() {
+  MemSetTest <int32_t> obj;
+  obj.memAllocate(memsetD32val);
+
+  int round = 0;
+  while (round < ITER) {
+    hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0,
+                       obj.stream, obj.B_d, obj.C_d, N);
+    HIPCHECK(hipMemsetD32Async(obj.C_d , obj.memSetVal , N , obj.stream));
+    HIPCHECK(hipStreamSynchronize(obj.stream));
+    HIPCHECK(hipMemcpy(obj.A_h, obj.C_d, obj.Nbytes, hipMemcpyDeviceToHost));
+
+    obj.validateExecutionOrder();
+    ++round;
+  }
+  return obj.resultAfterAllIterations();
+}
+
+// Same scenario as the hipMemsetAsync test, using int16_t buffers and
+// hipMemsetD16Async with memsetD16val.
+bool testhipMemsetD16AsyncWithKernel() {
+  MemSetTest <int16_t> obj;
+  obj.memAllocate(memsetD16val);
+
+  int round = 0;
+  while (round < ITER) {
+    hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0,
+                       obj.stream, obj.B_d, obj.C_d, N);
+    HIPCHECK(hipMemsetD16Async(obj.C_d , obj.memSetVal , N , obj.stream));
+    HIPCHECK(hipStreamSynchronize(obj.stream));
+    HIPCHECK(hipMemcpy(obj.A_h , obj.C_d, obj.Nbytes , hipMemcpyDeviceToHost));
+
+    obj.validateExecutionOrder();
+    ++round;
+  }
+  return obj.resultAfterAllIterations();
+}
+
+// Same scenario as the hipMemsetAsync test, using char buffers and
+// hipMemsetD8Async with memsetD8val.
+bool testhipMemsetD8AsyncWithKernel() {
+  MemSetTest <char> obj;
+  obj.memAllocate(memsetD8val);
+
+  int round = 0;
+  while (round < ITER) {
+    hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0,
+                       obj.stream, obj.B_d, obj.C_d, N);
+    HIPCHECK(hipMemsetD8Async(obj.C_d, obj.memSetVal, N, obj.stream));
+    HIPCHECK(hipStreamSynchronize(obj.stream));
+    HIPCHECK(hipMemcpy(obj.A_h, obj.C_d, obj.Nbytes, hipMemcpyDeviceToHost));
+
+    obj.validateExecutionOrder();
+    ++round;
+  }
+  return obj.resultAfterAllIterations();
+}
+
+// Runs the four kernel-vs-memset ordering tests on every visible device.
+// Each test's own return value decides whether its mismatch message is
+// printed, so one failure does not trigger the messages of later tests; the
+// overall verdict still fails if any test on any device failed.
+int main() {
+  bool testResult = true;
+  int numDevices = 0;
+  HIPCHECK(hipGetDeviceCount(&numDevices));
+  printf("total number of gpus in the system: %d\n", numDevices);
+
+  for (int i = 0; i < numDevices; i++) {
+    HIPCHECK(hipSetDevice(i));
+    printf("test running on gpu %d\n", i);
+
+    if (!testhipMemsetAsyncWithKernel()) {
+      printf("Mismatch in order of execution of hipMemsetAsync and kernel\n");
+      testResult = false;
+    }
+
+    if (!testhipMemsetD32AsyncWithKernel()) {
+      printf("Mismatch in order of execution of hipMemsetD32Async and kernel\n");
+      testResult = false;
+    }
+
+    if (!testhipMemsetD16AsyncWithKernel()) {
+      printf("Mismatch in order of execution of hipMemsetD16Async and kernel\n");
+      testResult = false;
+    }
+
+    if (!testhipMemsetD8AsyncWithKernel()) {
+      printf("Mismatch in order of execution of hipMemsetD8Async and kernel\n");
+      testResult = false;
+    }
+  }
+
+  if (testResult) {
+    printf("Execution order of Kernel and hipMemsetAsync apis on "
+           "all gpus is correct!\n");
+    passed();
+  } else {
+    failed("One or more hipMemsetAsync tests failed\n");
+  }
+}
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+*/
+
+/*
+ * Test that validates functionality of hipmemsetAsync apis over multi threads
+ */
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+// Number of std::thread workers started per iteration of each test.
+#define NUM_THREADS 50
+// Number of launch/join/validate rounds each test performs.
+#define ITER 50
+
+// NOTE(review): blocks is not referenced by any function visible in this
+// test file - confirm whether it is still needed.
+unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+
+// Fixture shared by the multi-thread tests: owns one device buffer, two host
+// buffers and the stream, and accumulates how many positions held the memset
+// value in both host buffers over all iterations.
+template <typename T>
+class MemSetTest {
+ public:
+  T *A_h;  // host buffer filled by the odd-indexed threads
+  T *A_d;  // device buffer targeted by every memset
+  T *B_h;  // host buffer filled by the even-indexed threads
+  T memSetVal;
+  size_t Nbytes;
+  bool testResult = true;
+  int validateCount = 0;
+  hipStream_t stream;
+
+  // Records the memset value, allocates the buffers and creates the stream.
+  void memAllocate(T memSetValue) {
+    memSetVal = memSetValue;
+    Nbytes = N * sizeof(T);
+
+    A_h = reinterpret_cast<T*>(malloc(Nbytes));
+    HIPASSERT(A_h != NULL);
+
+    HIPCHECK(hipMalloc(&A_d, Nbytes));
+    B_h = reinterpret_cast<T*>(malloc(Nbytes));
+    HIPASSERT(B_h != NULL);
+
+    HIPCHECK(hipStreamCreate(&stream));
+  }
+
+  // Counts the positions at which both A_h and B_h equal memSetVal.
+  void threadCompleteStatus() {
+    int k = 0;
+    while (k < N) {
+      validateCount +=
+          ((A_h[k] == memSetVal) && (B_h[k] == memSetVal)) ? 1 : 0;
+      ++k;
+    }
+  }
+
+  // Releases all resources, then succeeds only if every position matched in
+  // every iteration.
+  bool resultAfterAllIterations() {
+    memDeallocate();
+    testResult = (validateCount == (ITER * N));
+    return testResult;
+  }
+
+  // Frees the device buffer, both host buffers and the stream.
+  void memDeallocate() {
+    HIPCHECK(hipFree(A_d));
+    free(A_h);
+    free(B_h);
+    HIPCHECK(hipStreamDestroy(stream));
+  }
+};
+
+// Thread body: queues hipMemsetAsync on A_d followed by an asynchronous
+// device-to-host copy into A_h, both on the given stream.
+// The memset count is the file-scope N; Nbytes is used only for the copy.
+template <typename T>
+void queueJobsForhipMemsetAsync(T* A_d, T* A_h, T memSetVal, size_t Nbytes,
+                                hipStream_t stream) {
+  HIPCHECK(hipMemsetAsync(A_d, memSetVal, N, stream));
+  HIPCHECK(hipMemcpyAsync(A_h, A_d, Nbytes, hipMemcpyDeviceToHost, stream));
+}
+
+// Thread body: queues hipMemsetD32Async on A_d followed by an asynchronous
+// device-to-host copy into A_h, both on the given stream.
+// The memset count is the file-scope N; Nbytes is used only for the copy.
+template <typename T>
+void queueJobsForhipMemsetD32Async(T* A_d, T* A_h, T memSetVal, size_t Nbytes,
+                                   hipStream_t stream) {
+  HIPCHECK(hipMemsetD32Async(A_d, memSetVal, N, stream));
+  HIPCHECK(hipMemcpyAsync(A_h, A_d, Nbytes, hipMemcpyDeviceToHost, stream));
+}
+
+// Thread body: queues hipMemsetD16Async on A_d followed by an asynchronous
+// device-to-host copy into A_h, both on the given stream.
+// The memset count is the file-scope N; Nbytes is used only for the copy.
+template <typename T>
+void queueJobsForhipMemsetD16Async(T* A_d, T* A_h, T memSetVal, size_t Nbytes,
+                                   hipStream_t stream) {
+  HIPCHECK(hipMemsetD16Async(A_d, memSetVal, N, stream));
+  HIPCHECK(hipMemcpyAsync(A_h, A_d, Nbytes, hipMemcpyDeviceToHost, stream));
+}
+
+// Thread body: queues hipMemsetD8Async on A_d followed by an asynchronous
+// device-to-host copy into A_h, both on the given stream.
+// The memset count is the file-scope N; Nbytes is used only for the copy.
+template <typename T>
+void queueJobsForhipMemsetD8Async(T* A_d, T* A_h, T memSetVal, size_t Nbytes,
+                                  hipStream_t stream) {
+  HIPCHECK(hipMemsetD8Async(A_d, memSetVal, N, stream));
+  HIPCHECK(hipMemcpyAsync(A_h, A_d, Nbytes, hipMemcpyDeviceToHost, stream));
+}
+
+/* Queue hipMemsetAsync jobs on multiple threads and verify they all
+ * finished on all threads successfully
+ */
+
+// Starts NUM_THREADS threads per round that each queue hipMemsetAsync plus a
+// copy-back on the shared stream; odd-indexed threads copy into A_h and
+// even-indexed threads into B_h. Results are validated after every round.
+bool testhipMemsetAsyncWithMultiThread() {
+  MemSetTest <char> obj;
+  obj.memAllocate(memsetval);
+  std::thread t[NUM_THREADS];
+
+  for (int i = 0; i < ITER; ++i) {
+    for (int k = 0; k < NUM_THREADS; ++k) {
+      char* hostDst = (k % 2) ? obj.A_h : obj.B_h;
+      t[k] = std::thread(queueJobsForhipMemsetAsync<char>, obj.A_d, hostDst,
+                         obj.memSetVal, obj.Nbytes, obj.stream);
+    }
+
+    for (auto& worker : t) {
+      worker.join();
+    }
+
+    HIPCHECK(hipStreamSynchronize(obj.stream));
+    obj.threadCompleteStatus();
+  }
+  return obj.resultAfterAllIterations();
+}
+
+// Multi-thread scenario for hipMemsetD32Async with int32_t buffers; odd
+// threads copy into A_h, even threads into B_h, validated after each round.
+bool testhipMemsetD32AsyncWithMultiThread() {
+  MemSetTest <int32_t> obj;
+  obj.memAllocate(memsetD32val);
+  std::thread t[NUM_THREADS];
+
+  for (int i = 0; i < ITER; ++i) {
+    for (int k = 0; k < NUM_THREADS; ++k) {
+      int32_t* hostDst = (k % 2) ? obj.A_h : obj.B_h;
+      t[k] = std::thread(queueJobsForhipMemsetD32Async<int32_t>, obj.A_d,
+                         hostDst, obj.memSetVal, obj.Nbytes, obj.stream);
+    }
+
+    for (auto& worker : t) {
+      worker.join();
+    }
+
+    HIPCHECK(hipStreamSynchronize(obj.stream));
+    obj.threadCompleteStatus();
+  }
+  return obj.resultAfterAllIterations();
+}
+
+// Multi-thread scenario for hipMemsetD16Async with int16_t buffers; odd
+// threads copy into A_h, even threads into B_h, validated after each round.
+bool testhipMemsetD16AsyncWithMultiThread() {
+  MemSetTest <int16_t> obj;
+  obj.memAllocate(memsetD16val);
+  std::thread t[NUM_THREADS];
+
+  for (int i = 0; i < ITER; ++i) {
+    for (int k = 0; k < NUM_THREADS; ++k) {
+      int16_t* hostDst = (k % 2) ? obj.A_h : obj.B_h;
+      t[k] = std::thread(queueJobsForhipMemsetD16Async<int16_t>, obj.A_d,
+                         hostDst, obj.memSetVal, obj.Nbytes, obj.stream);
+    }
+
+    for (auto& worker : t) {
+      worker.join();
+    }
+
+    HIPCHECK(hipStreamSynchronize(obj.stream));
+    obj.threadCompleteStatus();
+  }
+  return obj.resultAfterAllIterations();
+}
+
+// Multi-thread scenario for hipMemsetD8Async with char buffers; odd threads
+// copy into A_h, even threads into B_h, validated after each round.
+bool testhipMemsetD8AsyncWithMultiThread() {
+  MemSetTest <char> obj;
+  obj.memAllocate(memsetD8val);
+  std::thread t[NUM_THREADS];
+
+  for (int i = 0; i < ITER; ++i) {
+    for (int k = 0; k < NUM_THREADS; ++k) {
+      char* hostDst = (k % 2) ? obj.A_h : obj.B_h;
+      t[k] = std::thread(queueJobsForhipMemsetD8Async<char>, obj.A_d,
+                         hostDst, obj.memSetVal, obj.Nbytes, obj.stream);
+    }
+    for (auto& worker : t) {
+      worker.join();
+    }
+
+    HIPCHECK(hipStreamSynchronize(obj.stream));
+    obj.threadCompleteStatus();
+  }
+  return obj.resultAfterAllIterations();
+}
+
+// Runs the four multi-thread hipMemset*Async tests and reports one verdict.
+// Each test's own return value decides whether its message is printed, so a
+// single failure does not trigger the messages of the tests that follow it.
+int main() {
+  bool testResult = true;
+  // The first literal ends with a space so the two concatenated literals do
+  // not run together as "threadsand".
+  printf("Queueing up hipMemSetAsync jobs on multiple threads "
+         "and checking results\n");
+
+  if (!testhipMemsetAsyncWithMultiThread()) {
+    printf("Thread execution did not complete for hipMemsetAsync\n");
+    testResult = false;
+  }
+
+  if (!testhipMemsetD32AsyncWithMultiThread()) {
+    printf("Thread execution did not complete for hipMemsetD32Async\n");
+    testResult = false;
+  }
+
+  if (!testhipMemsetD16AsyncWithMultiThread()) {
+    printf("Thread execution did not complete for hipMemsetD16Async\n");
+    testResult = false;
+  }
+
+  if (!testhipMemsetD8AsyncWithMultiThread()) {
+    printf("Thread execution did not complete for hipMemsetD8Async\n");
+    testResult = false;
+  }
+
+  if (testResult) {
+    printf("All threads ran successfully for all hipMemsetAsync apis\n");
+    passed();
+  } else {
+    failed("One or more tests failed\n");
+  }
+}
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+*/
+
+//  * To test invalid pointer to hipMemset* apis
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+// Element count used to size the 1D memset calls (via Nbytes in main).
+#define N 50
+// Fill value passed to every memset call in this test.
+#define MEMSETVAL 0x42
+// Height and width used for the pitched allocation and the 2D memset calls.
+#define NUM_H 256
+#define NUM_W 256
+
+// Negative test: every hipMemset* variant is called with a NULL destination
+// and must return hipErrorInvalidValue. A valid pitched allocation is made
+// only to obtain a realistic pitch for the 2D calls.
+int main() {
+  size_t Nbytes = N*sizeof(char);
+  size_t pitch_A;
+  size_t width = NUM_W * sizeof(char);
+  char *A_d;
+
+  HIPCHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d), &pitch_A, width , NUM_H));
+
+  hipError_t e;
+
+  e = hipMemset(NULL , MEMSETVAL , Nbytes);
+  HIPASSERT(e == hipErrorInvalidValue);
+
+  e = hipMemsetD32(NULL , MEMSETVAL , Nbytes);
+  HIPASSERT(e == hipErrorInvalidValue);
+
+  e = hipMemsetD16(NULL , MEMSETVAL , Nbytes);
+  HIPASSERT(e == hipErrorInvalidValue);
+
+  e = hipMemsetD8(NULL , MEMSETVAL , Nbytes);
+  HIPASSERT(e == hipErrorInvalidValue);
+
+  e = hipMemsetAsync(NULL , MEMSETVAL , Nbytes , 0);
+  HIPASSERT(e == hipErrorInvalidValue);
+
+  e = hipMemsetD32Async(NULL , MEMSETVAL , Nbytes, 0);
+  HIPASSERT(e == hipErrorInvalidValue);
+
+  e = hipMemsetD16Async(NULL , MEMSETVAL , Nbytes, 0);
+  HIPASSERT(e == hipErrorInvalidValue);
+
+  e = hipMemsetD8Async(NULL , MEMSETVAL , Nbytes, 0);
+  HIPASSERT(e == hipErrorInvalidValue);
+
+  e = hipMemset2D(NULL, pitch_A, MEMSETVAL, NUM_W, NUM_H);
+  HIPASSERT(e == hipErrorInvalidValue);
+
+  e = hipMemset2DAsync(NULL, pitch_A, MEMSETVAL, NUM_W, NUM_H, 0);
+  HIPASSERT(e == hipErrorInvalidValue);
+
+  /* Passing host pointer to hipMemset. Ticket SWDEV-243206 is open for this.
+   * Disabling this test until the ticket is closed
+   *
+  char *A_h;
+  A_h = (char*)malloc(Nbytes);
+  e = hipMemset(A_h, MEMSETVAL , Nbytes);
+  HIPASSERT(e == hipErrorInvalidValue);
+  */
+
+  /* Passing invalid pitch to hipMemset2D. Ticket SWDEV-243104 is open for this.
+   * Disabling this test until the ticket is closed
+   *
+  e = hipMemset2D(A_d, 0, MEMSETVAL, NUM_W, NUM_H);
+  HIPASSERT(e == hipErrorInvalidValue);
+
+  e = hipMemset2DAsync(A_d, 0, MEMSETVAL, NUM_W, NUM_H,0);
+  HIPASSERT(e == hipErrorInvalidValue);
+  */
+
+  // Check the release as well, so a failing free is not silently ignored.
+  HIPCHECK(hipFree(A_d));
+  passed();
+}
@@ -0,0 +1,187 @@
+ /*
+Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+ */
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
+ * TEST: %t
+ * HIT_END
+ */
+
+
+
+// Testcase Description: This test case verifies that the callback functions
+// registered through the hipStreamAddCallback() api execute in the same order
+// in which they were queued in their respective streams
+
+
+
+#include <stdio.h>
+#include <vector>
+#include "hip/hip_runtime.h"
+#include "test_common.h"
+
+
+// On the HCC platform HIPRT_CB is defined as empty, so it adds nothing to
+// the callback declarations that use it below.
+#ifdef __HIP_PLATFORM_HCC__
+#define HIPRT_CB
+#endif
+
+
+// The two streams on which kernels, copies and callbacks are queued.
+hipStream_t mystream1, mystream2;
+// Number of float elements in every host and device buffer.
+size_t Num = 4096;
+// Values received by the callbacks of each stream, in arrival order.
+std::vector<int> Stream1_Order, Stream2_Order;
+
+
+// Squares A_d into C_d: each thread starts at its global index and advances
+// by the total number of launched threads. The thread with global index 1
+// then busy-waits on clock64() to keep the kernel running longer.
+__global__ void vector_square(float* C_d, float* A_d, size_t Num) {
+  const size_t firstIdx = (blockIdx.x * blockDim.x + threadIdx.x);
+  const size_t step = blockDim.x * gridDim.x;
+
+  size_t idx = firstIdx;
+  while (idx < Num) {
+    C_d[idx] = A_d[idx] * A_d[idx];
+    idx += step;
+  }
+
+  // Only global thread 1 performs the delay; all others are done.
+  if (firstIdx != 1) {
+    return;
+  }
+
+  const unsigned long long int wait_t = 3200000000;
+  const unsigned long long int start = clock64();
+  unsigned long long int elapsed;
+  do {
+    elapsed = clock64() - start;
+  } while (elapsed < wait_t);
+}
+
+// Host buffers: A_h/C_h are checked by Callback_Stream1, A_h1/C_h1 by
+// Callback_Stream2. All four are allocated and freed in main().
+float *A_h, *C_h, *A_h1, *C_h1;
+
+// Callback queued on mystream1. Reports every element of C_h that is not the
+// square of the matching A_h element, then appends the heap-allocated int
+// passed through userData to Stream1_Order and frees it.
+static void HIPRT_CB Callback_Stream1(hipStream_t stream, hipError_t status,
+                                      void* userData) {
+  size_t idx = 0;
+  while (idx < Num) {
+    if (C_h[idx] != A_h[idx] * A_h[idx]) {
+      std::cout << "Data mismatch in stream1 at: " << idx << std::endl;
+    }
+    ++idx;
+  }
+
+  // The recorded value lets main() verify the order in which the callbacks
+  // of this stream were invoked.
+  int* orderTag = reinterpret_cast<int*>(userData);
+  Stream1_Order.push_back(*orderTag);
+  delete orderTag;
+}
+
+// Callback queued on mystream2. Reports every element of C_h1 that is not
+// the square of the matching A_h1 element, then appends the heap-allocated
+// int passed through userData to Stream2_Order and frees it.
+static void HIPRT_CB Callback_Stream2(hipStream_t stream, hipError_t status,
+                                      void* userData) {
+  size_t idx = 0;
+  while (idx < Num) {
+    if (C_h1[idx] != A_h1[idx] * A_h1[idx]) {
+      std::cout << "Data mismatch in stream2 at: " << idx << std::endl;
+    }
+    ++idx;
+  }
+
+  // The recorded value lets main() verify the order in which the callbacks
+  // of this stream were invoked.
+  int* orderTag = reinterpret_cast<int*>(userData);
+  Stream2_Order.push_back(*orderTag);
+  delete orderTag;
+}
+
+// Queues kernel + copy + callback sequences on two non-blocking streams and
+// verifies that, per stream, every queued callback ran and that the
+// callbacks ran in the order in which they were queued.
+int main(int argc, char* argv[]) {
+  float *A_d, *C_d;
+  size_t Nbytes = Num * sizeof(float);
+  // Number of kernel/copy/callback sequences queued on each stream.
+  const int numCallbacks = 4;
+
+  // Each allocation is checked against its own pointer.
+  A_h = reinterpret_cast<float*>(malloc(Nbytes));
+  HIPCHECK(A_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+  C_h = reinterpret_cast<float*>(malloc(Nbytes));
+  HIPCHECK(C_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+  A_h1 = reinterpret_cast<float*>(malloc(Nbytes));
+  HIPCHECK(A_h1 == 0 ? hipErrorOutOfMemory : hipSuccess);
+  C_h1 = reinterpret_cast<float*>(malloc(Nbytes));
+  HIPCHECK(C_h1 == 0 ? hipErrorOutOfMemory : hipSuccess);
+
+  // Fill with Phi + i
+  for (size_t i = 0; i < Num; i++) {
+    A_h[i] = 1.618f + i;
+  }
+  for (size_t i = 0; i < Num; i++) {
+    A_h1[i] = 1.618f + i;
+  }
+
+  HIPCHECK(hipMalloc(&A_d, Nbytes));
+  HIPCHECK(hipMalloc(&C_d, Nbytes));
+
+  HIPCHECK(hipStreamCreateWithFlags(&mystream1, hipStreamNonBlocking));
+  HIPCHECK(hipStreamCreateWithFlags(&mystream2, hipStreamNonBlocking));
+
+  HIPCHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, mystream1));
+
+  const unsigned threadsPerBlock = 256;
+  const unsigned blocks = (Num + 255)/threadsPerBlock;
+  int *ptr = NULL;
+  int *ptr1 = NULL;
+  // Queueing jobs in both mystream1/2 followed by hipStreamAddCallback.
+  // Each callback receives its own heap-allocated sequence number, which the
+  // callback itself deletes.
+  for (int i = 1; i <= numCallbacks; ++i) {
+    hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock),
+                       0, mystream1, C_d, A_d, Num);
+    HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost,
+                            mystream1));
+    ptr = new int;
+    *ptr = i;
+    HIPCHECK(hipStreamAddCallback(mystream1, Callback_Stream1,
+                                  reinterpret_cast<void*>(ptr), 0));
+
+    hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock),
+                       0, mystream2, C_d, A_d, Num);
+    HIPCHECK(hipMemcpyAsync(C_h1, C_d, Nbytes,
+                            hipMemcpyDeviceToHost, mystream2));
+    ptr1 = new int;
+    *ptr1 = i;
+    HIPCHECK(hipStreamAddCallback(mystream2, Callback_Stream2,
+                                  reinterpret_cast<void*>(ptr1), 0));
+  }
+
+  HIPCHECK(hipStreamSynchronize(mystream1));
+  HIPCHECK(hipStreamSynchronize(mystream2));
+
+  HIPCHECK(hipStreamDestroy(mystream1));
+  HIPCHECK(hipStreamDestroy(mystream2));
+
+  HIPCHECK(hipFree(A_d));
+  HIPCHECK(hipFree(C_d));
+  free(A_h);
+  free(C_h);
+  free(A_h1);
+  free(C_h1);
+
+  // An empty or short order list would pass the sequence checks below
+  // vacuously, so first confirm that every queued callback actually ran.
+  if (Stream1_Order.size() != static_cast<size_t>(numCallbacks)) {
+    printf("hipStreamAddCallBack() did not run all callbacks");
+    printf(" in first stream\n");
+    failed("Unexpected behavior!");
+  }
+  if (Stream2_Order.size() != static_cast<size_t>(numCallbacks)) {
+    printf("hipStreamAddCallBack() did not run all callbacks");
+    printf(" in second stream\n");
+    failed("Unexpected behavior!");
+  }
+
+  // Checking if Stream1_Order has ints in sequential order or not
+  int i = 1;
+  for (auto itr=Stream1_Order.begin(); itr != Stream1_Order.end(); ++itr) {
+    if (*itr != i) {
+      printf("hipStreamAddCallBack() did not execute in sequence");
+      printf(" in first stream\n");
+      failed("Unexpected behavior!");
+    }
+    ++i;
+  }
+
+  // Checking if Stream2_Order has ints in sequential order or not
+  i = 1;
+  for (auto itr=Stream2_Order.begin(); itr != Stream2_Order.end(); ++itr) {
+    if (*itr != i) {
+      printf("hipStreamAddCallBack() did not execute in sequence");
+      printf(" in second stream\n");
+      failed("Unexpected behavior!");
+    }
+    ++i;
+  }
+  passed();
+}
@@ -0,0 +1,180 @@
+/*
+  Copyright (c) 2019-present Advanced Micro Devices, Inc. All rights reserved.
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+  */
+
+// Testcase Description: Streams are launched in individual GPUs with different
+// kernel. Verify that all the kernels queued are executed before the callback.
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <thread>
+#include <chrono>
+#include "hip/hip_runtime.h"
+#include "test_common.h"
+
+#ifdef __HIP_PLATFORM_HCC__
+#define HIPRT_CB
+#endif
+
+
+// Number of int elements in every host and device buffer used by this test.
+size_t N_ELMTS{4096};
+
+// Per-GPU record shared between launch_gpu() and Stream_Callback().
+struct gpu_data {
+  int *int_ptr = nullptr;  // host buffer that receives the device-to-host copy
+  int gpu;                 // ordinal of the GPU this record belongs to
+  int acknowledge;         // set by the callback: gpu on success, 100 on failure
+};
+
+// Outcome codes computed by launch_gpu() before it reports a failure.
+enum {
+  SUCCESS = 0,
+  KERNEL_EXECUTION_MISMATCH = 1,
+  KERNEL_COMPUTATION_MISMATCH = 2
+};
+
+// Kernel: adds 1 to each of the first N_ELMTS entries of A_d.
+__global__ void Add_Data(int* A_d, size_t N_ELMTS) {
+  const size_t first = blockIdx.x * blockDim.x + threadIdx.x;
+  const size_t step = blockDim.x * gridDim.x;
+  for (size_t idx = first; idx < N_ELMTS; idx += step) {
+    A_d[idx] += 1;
+  }
+}
+
+// Kernel queued between the Add_Data launches purely to give the GPU more
+// work: writes A_d[i] * A_d[i] + 1 into C_d[i] for i < N_ELMTS.
+__global__ void Square_plus_one(int* A_d, int* C_d, size_t N_ELMTS) {
+  const size_t first = blockIdx.x * blockDim.x + threadIdx.x;
+  const size_t step = blockDim.x * gridDim.x;
+  for (size_t idx = first; idx < N_ELMTS; idx += step) {
+    const int value = A_d[idx];
+    C_d[idx] = value * value + 1;
+  }
+}
+
+// Stream callback.  userData is the gpu_data record built by launch_gpu().
+// The first element of the host buffer it points at is compared with the
+// expected value (gpu ordinal + 1) and the verdict is stored in acknowledge.
+static void HIPRT_CB Stream_Callback(hipStream_t stream, hipError_t status,
+                                     void* userData) {
+  gpu_data *record = reinterpret_cast<gpu_data *>(userData);
+  const int expected = record->gpu + 1;
+
+  // 100 marks a failure; on success the gpu ordinal itself is echoed back.
+  record->acknowledge = (*record->int_ptr == expected) ? record->gpu : 100;
+}
+
+// Per-GPU worker (main() runs one per host thread).  It selects device
+// gpu_ordinal, zero-fills a device buffer, queues Add_Data (gpu_ordinal + 1)
+// times on a private non-blocking stream, copies the buffer back to the host
+// and adds a stream callback that validates the copied data.  After the
+// stream is synchronized, both the copied data and the acknowledgement
+// written by the callback are checked; failed() is called on a mismatch.
+void launch_gpu(int gpu_ordinal) {
+  HIPCHECK(hipSetDevice(gpu_ordinal));
+  int *A_d, *A_h, *C_h, *C_d;
+  size_t Nbytes = N_ELMTS * sizeof(int);
+  A_h = (int *)malloc(Nbytes);
+  HIPCHECK(A_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+  C_h = (int *)malloc(Nbytes);
+  HIPCHECK(C_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+
+  // Fill with 0
+  for (size_t i = 0; i < N_ELMTS; i++) {
+    A_h[i] = 0;
+  }
+
+  // Record handed to the callback: where the copied data lives, which GPU
+  // this is, and an acknowledgement preset to the failure value (100).
+  gpu_data *ptr = new gpu_data;
+  ptr->int_ptr = C_h;
+  ptr->gpu = gpu_ordinal;
+  ptr->acknowledge = 100;
+
+  HIPCHECK(hipMalloc(&A_d, Nbytes));
+  HIPCHECK(hipMalloc(&C_d, Nbytes));
+
+  hipStream_t mystream;
+  HIPCHECK(hipStreamCreateWithFlags(&mystream, hipStreamNonBlocking));
+
+  HIPCHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, mystream));
+
+  const unsigned threadsPerBlock = 256;
+  const unsigned blocks = (N_ELMTS + 255)/threadsPerBlock;
+
+  // A_d is initialized to 0.  Add_Data kernel does A_d = A_d + 1
+  // The Add_Data kernel is called 1 time for gpu0, 2 times for gpu1 etc.
+  // At the end of the loop, A_d should hold gpu_ordinal + 1
+  for (int i = 0; i < gpu_ordinal + 1; i++) {
+    hipLaunchKernelGGL(Add_Data, dim3(blocks), dim3(threadsPerBlock), 0,
+                       mystream, A_d, N_ELMTS);
+    hipLaunchKernelGGL(Square_plus_one, 1, 1, 0, mystream, A_d, C_d, N_ELMTS);
+  }
+  HIPCHECK(hipMemcpyAsync(C_h, A_d, Nbytes, hipMemcpyDeviceToHost, mystream));
+
+  // Pass the ptr as user data which contains the gpu_ordinal, default value
+  // for ack and the data that is copied to host
+  HIPCHECK(hipStreamAddCallback(mystream, Stream_Callback,
+                                reinterpret_cast<void *>(ptr), 0));
+  HIPCHECK(hipStreamSynchronize(mystream));
+
+  HIPCHECK(hipFree(A_d));
+  HIPCHECK(hipFree(C_d));
+  HIPCHECK(hipStreamDestroy(mystream));
+
+  // Evaluate the outcome before releasing the host memory it is read from.
+  int result = SUCCESS;
+  if (C_h[0] != gpu_ordinal + 1) {
+    result = KERNEL_EXECUTION_MISMATCH;
+  }
+
+  if (ptr->gpu != ptr->acknowledge) {
+    result = KERNEL_COMPUTATION_MISMATCH;
+  }
+
+  free(A_h);
+  free(C_h);
+  delete ptr;  // allocated with new, so it must not be released with free()
+
+  if (result == KERNEL_EXECUTION_MISMATCH) {
+    failed("Number of kernels expected to be executed does not match");
+  } else if (result == KERNEL_COMPUTATION_MISMATCH) {
+    failed("Mismatch found in the result of the computation!");
+  }
+}
+
+
+// Test driver: starts one host thread per visible GPU, each running
+// launch_gpu() for its ordinal, waits for all of them and reports success.
+// The test is skipped (reported as passed) when fewer than 2 GPUs exist.
+int main() {
+  int gpu_cnt = 0;
+
+  HIPCHECK(hipGetDeviceCount(&gpu_cnt));
+  if (gpu_cnt < 2) {
+    printf("Minimum of 2 gpus are needed for this test, skipping the test\n");
+    passed();
+    // Return explicitly so the skip holds even if passed() comes back.
+    return 0;
+  }
+
+  // The thread count is only known at run time; a heap array is used because
+  // a variable-length array of std::thread is not standard C++.
+  std::thread *T = new std::thread[gpu_cnt];
+
+  // Launching threads for each GPU
+  for (int i = 0; i < gpu_cnt; i++) {
+    T[i] = std::thread(launch_gpu, i);
+  }
+
+  for (int i = 0; i < gpu_cnt; i++) {
+    T[i].join();
+  }
+  delete[] T;
+  passed();
+}
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ * */
+
+// Testcase Description: This test case checks that the runtime behaves
+// correctly when hipStreamAddCallback() is called many times back to back.
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
+ * TEST: %t
+ * HIT_END
+ */
+
+
+#include <stdio.h>
+#include <unistd.h>
+#include <mutex>
+#include <atomic>
+#include "hip/hip_runtime.h"
+#include "test_common.h"
+
+#ifdef __HIP_PLATFORM_HCC__
+#define HIPRT_CB
+#endif
+
+// Number of callbacks queued back to back on the stream.
+#define NUM_CALLS 1000
+
+// Stream that carries the copies, the kernel and every callback.
+hipStream_t mystream;
+// Number of float elements in each host/device buffer.
+size_t Num = 4096;
+// Incremented once by every callback that runs.
+std::atomic<size_t> Cb_count{0};
+// Incremented by the callbacks for every element that fails validation.
+std::atomic<size_t> Data_mismatch{0};
+// Host input buffer.
+float *A_h;
+// Host buffer that receives the kernel output.
+float *C_h;
+
+// Kernel: writes A_d[i] * A_d[i] into C_d[i] for every i < Num.  The thread
+// whose global index is 1 then busy-waits until clock64() has advanced by
+// 3200000000 ticks, which delays completion of the kernel.
+__global__ void vector_square(float* C_d, float* A_d, size_t Num) {
+  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const size_t step = blockDim.x * gridDim.x;
+
+  for (size_t idx = tid; idx < Num; idx += step) {
+    const float a = A_d[idx];
+    C_d[idx] = a * a;
+  }
+
+  // Every thread except global thread 1 is done here.
+  if (tid != 1) {
+    return;
+  }
+
+  const unsigned long long int ticks_to_wait = 3200000000;
+  const unsigned long long int begin = clock64();
+  unsigned long long int elapsed;
+  do {
+    elapsed = clock64() - begin;
+  } while (elapsed < ticks_to_wait);
+}
+
+// Stream callback: re-validates the whole host result buffer C_h against
+// A_h, counting mismatches in Data_mismatch, then records in Cb_count that
+// one more callback ran.  The stream, status and userData arguments are
+// not used.
+static void HIPRT_CB Stream_Callback(hipStream_t stream, hipError_t status,
+                                     void* userData) {
+  size_t idx = 0;
+  while (idx < Num) {
+    if (C_h[idx] != A_h[idx] * A_h[idx]) {
+      ++Data_mismatch;
+    }
+    ++idx;
+  }
+
+  // Signal that this callback has been processed.
+  ++Cb_count;
+}
+
+// Test driver: queues an upload, the vector_square kernel and a download on
+// one non-blocking stream, then adds NUM_CALLS callbacks to that stream.
+// After synchronizing it checks that no callback saw a data mismatch and
+// that all of them ran.
+int main(int argc, char* argv[]) {
+  const size_t Nbytes = Num * sizeof(float);
+  float *A_d = nullptr;
+  float *C_d = nullptr;
+
+  // Host buffers are globals so the callbacks can reach them.
+  A_h = static_cast<float*>(malloc(Nbytes));
+  HIPCHECK(A_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+  C_h = static_cast<float*>(malloc(Nbytes));
+  HIPCHECK(C_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+
+  // Input pattern: 1.618 + element index
+  for (size_t i = 0; i != Num; ++i) {
+    A_h[i] = 1.618f + i;
+  }
+
+  HIPCHECK(hipMalloc(&A_d, Nbytes));
+  HIPCHECK(hipMalloc(&C_d, Nbytes));
+
+  HIPCHECK(hipStreamCreateWithFlags(&mystream, hipStreamNonBlocking));
+
+  // Upload, compute and download are all queued on the same stream.
+  HIPCHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, mystream));
+
+  const unsigned threadsPerBlock = 256;
+  const unsigned blocks = (Num + (threadsPerBlock - 1)) / threadsPerBlock;
+  hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0,
+                      mystream, C_d, A_d, Num);
+
+  HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
+
+  // Queue the callbacks one right after another
+  int remaining = NUM_CALLS;
+  while (remaining-- > 0) {
+    HIPCHECK(hipStreamAddCallback(mystream, Stream_Callback, NULL, 0));
+  }
+
+  HIPCHECK(hipStreamSynchronize(mystream));
+  HIPCHECK(hipStreamDestroy(mystream));
+
+  HIPCHECK(hipFree(A_d));
+  HIPCHECK(hipFree(C_d));
+
+  free(A_h);
+  free(C_h);
+
+  // Every callback validated the data; any mismatch shows up as a non-zero
+  // Data_mismatch.  Cb_count must equal the number of callbacks added.
+  const bool data_ok = (Data_mismatch.load() == 0);
+  const bool all_called = (Cb_count.load() == NUM_CALLS);
+  if (!data_ok) {
+    failed("Mismatch found in the result of the computation!");
+  } else if (!all_called) {
+    failed("All callbacks for stream did not get called!");
+  }
+
+  passed();
+}
@@ -0,0 +1,165 @@
+/*
+Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// Testcase Description: This test case is used to check the behaviour of HIP
+// when multiple hipStreamAddCallback() calls are issued from multiple threads.
+// This test case is currently disabled.
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM all
+ * TEST: %t
+ * HIT_END
+ */
+
+
+
+#include <stdio.h>
+#include <thread>
+#include <chrono>
+#include <atomic>
+#include "hip/hip_runtime.h"
+#include "test_common.h"
+
+#ifdef __HIP_PLATFORM_HCC__
+#define HIPRT_CB
+#endif
+
+// Number of host threads, each of which adds one callback to the stream.
+#define NUM_THREADS 2000
+
+// Number of float elements in each host/device buffer.
+size_t Num = 4096;
+// Incremented once by every callback that runs.
+std::atomic<size_t> Cb_count{0};
+// Incremented by the callbacks for every element that fails validation.
+std::atomic<size_t> Data_mismatch{0};
+// Stream shared by all the threads.
+hipStream_t mystream;
+// Host input buffer.
+float *A_h;
+// Host buffer that receives the kernel output.
+float *C_h;
+
+// Kernel: writes A_d[i] * A_d[i] into C_d[i] for every i < Num.  The thread
+// whose global index is 1 then busy-waits until clock64() has advanced by
+// 3200000000 ticks, which delays completion of the kernel.
+__global__ void vector_square(float* C_d, float* A_d, size_t Num) {
+  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const size_t step = blockDim.x * gridDim.x;
+
+  for (size_t idx = tid; idx < Num; idx += step) {
+    const float a = A_d[idx];
+    C_d[idx] = a * a;
+  }
+
+  // Every thread except global thread 1 is done here.
+  if (tid != 1) {
+    return;
+  }
+
+  const unsigned long long int ticks_to_wait = 3200000000;
+  const unsigned long long int begin = clock64();
+  unsigned long long int elapsed;
+  do {
+    elapsed = clock64() - begin;
+  } while (elapsed < ticks_to_wait);
+}
+
+
+// Callback added by Thread1_func(): validates C_h against A_h element by
+// element, counting mismatches in Data_mismatch, then increments Cb_count.
+// The stream, status and userData arguments are not used.
+static void HIPRT_CB Thread1_Callback(hipStream_t stream, hipError_t status,
+                                      void* userData) {
+  size_t idx = 0;
+  while (idx < Num) {
+    if (C_h[idx] != A_h[idx] * A_h[idx]) {
+      ++Data_mismatch;
+    }
+    ++idx;
+  }
+
+  // Signal that this callback has been processed.
+  ++Cb_count;
+}
+
+// Callback added by Thread2_func().  It performs the same validation as
+// Thread1_Callback but is a distinct function, so that two different
+// callbacks end up queued on the stream.
+static void HIPRT_CB Thread2_Callback(hipStream_t stream, hipError_t status,
+                                      void* userData) {
+  size_t idx = 0;
+  while (idx < Num) {
+    if (C_h[idx] != A_h[idx] * A_h[idx]) {
+      ++Data_mismatch;
+    }
+    ++idx;
+  }
+
+  // Signal that this callback has been processed.
+  ++Cb_count;
+}
+
+// Thread body used for even-numbered threads in main(): adds
+// Thread1_Callback to the shared stream, with no user data and flags 0.
+void Thread1_func() {
+  HIPCHECK(hipStreamAddCallback(mystream, Thread1_Callback, NULL, 0));
+}
+
+// Thread body used for odd-numbered threads in main(): adds
+// Thread2_Callback to the shared stream, with no user data and flags 0.
+void Thread2_func() {
+  HIPCHECK(hipStreamAddCallback(mystream, Thread2_Callback, NULL, 0));
+}
+
+
+// Test driver: queues an upload, the vector_square kernel and a download on
+// one non-blocking stream, then starts NUM_THREADS host threads that each
+// add one callback to that stream.  After joining the threads and
+// synchronizing the stream it checks the callback count and the data.
+int main(int argc, char* argv[]) {
+  const size_t Nbytes = Num * sizeof(float);
+  float *A_d = nullptr;
+  float *C_d = nullptr;
+
+  // Host buffers are globals so the callbacks can reach them.
+  A_h = static_cast<float*>(malloc(Nbytes));
+  HIPCHECK(A_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+  C_h = static_cast<float*>(malloc(Nbytes));
+  HIPCHECK(C_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+
+  // Input pattern: 1.618 + element index
+  for (size_t i = 0; i != Num; ++i) {
+    A_h[i] = 1.618f + i;
+  }
+
+  HIPCHECK(hipMalloc(&A_d, Nbytes));
+  HIPCHECK(hipMalloc(&C_d, Nbytes));
+
+  HIPCHECK(hipStreamCreateWithFlags(&mystream, hipStreamNonBlocking));
+
+  HIPCHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, mystream));
+
+  const unsigned threadsPerBlock = 256;
+  const unsigned blocks = (Num + (threadsPerBlock - 1)) / threadsPerBlock;
+
+  hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0,
+                      mystream, C_d, A_d, Num);
+
+  HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
+
+  // Even-numbered threads add Thread1_Callback, odd-numbered ones add
+  // Thread2_Callback; all of them target the same stream.
+  std::thread T[NUM_THREADS];
+  for (int i = 0; i < NUM_THREADS; i++) {
+    T[i] = std::thread((i % 2 == 0) ? Thread1_func : Thread2_func);
+  }
+
+  // Block until every thread has finished adding its callback
+  for (std::thread &worker : T) {
+    worker.join();
+  }
+
+  HIPCHECK(hipStreamSynchronize(mystream));
+  HIPCHECK(hipStreamDestroy(mystream));
+
+  HIPCHECK(hipFree(A_d));
+  HIPCHECK(hipFree(C_d));
+
+  free(A_h);
+  free(C_h);
+
+  // Cb_count must equal the number of callbacks added by the threads, and
+  // Data_mismatch stays 0 unless a callback found a wrong element.
+  const bool all_called = (Cb_count.load() == NUM_THREADS);
+  const bool data_ok = (Data_mismatch.load() == 0);
+  if (!all_called) {
+     failed("All callbacks for stream did not get called!");
+  } else if (!data_ok) {
+     failed("Mismatch found in the result of the computation!");
+  }
+
+  passed();
+}
@@ -0,0 +1,147 @@
+/*
+* Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*/
+
+// Testcase Description: This test case checks whether hipStreamSynchronize()
+// is taking less time than the time taken by Callback() function launched
+// by hipStreamAddCallback() api.
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <chrono>
+#include <atomic>
+#include "hip/hip_runtime.h"
+#include "test_common.h"
+
+#ifdef __HIP_PLATFORM_HCC__
+#define HIPRT_CB
+#endif
+
+// Seconds the callback sleeps before it returns.
+#define SECONDS_TO_WAIT 5
+// Microseconds per second; used to convert SECONDS_TO_WAIT for comparison.
+#define TO_MICROSECONDS 1000000
+
+// Stream that carries the copies, the kernel and the callback.
+hipStream_t mystream;
+// Number of float elements in each host/device buffer.
+size_t N_elmts = 4096;
+// Set by the callback on entry and polled by main().  It is atomic because
+// it is written and read from different threads without any other
+// synchronization; a plain bool would be a data race and would let the
+// compiler hoist the load out of main()'s polling loop.
+std::atomic<bool> Init_callback{false};
+// Incremented by the callback for every element that fails validation.
+std::atomic<int> Data_mismatch{0};
+
+// Kernel: writes A_d[i] * A_d[i] into C_d[i] for every i < N_elmts.  The
+// thread whose global index is 1 then busy-waits until clock64() has
+// advanced by 3200000000 ticks, which delays completion of the kernel.
+__global__ void vector_square(float* C_d, float* A_d, size_t N_elmts) {
+  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const size_t step = blockDim.x * gridDim.x;
+
+  for (size_t idx = tid; idx < N_elmts; idx += step) {
+    const float a = A_d[idx];
+    C_d[idx] = a * a;
+  }
+
+  // Every thread except global thread 1 is done here.
+  if (tid != 1) {
+    return;
+  }
+
+  const unsigned long long int ticks_to_wait = 3200000000;
+  const unsigned long long int begin = clock64();
+  unsigned long long int elapsed;
+  do {
+    elapsed = clock64() - begin;
+  } while (elapsed < ticks_to_wait);
+}
+
+// Host input buffer, read by the callback during validation.
+float *A_h;
+// Host buffer that receives the kernel output.
+float *C_h;
+
+// Stream callback: flags that it has started, validates C_h against A_h
+// (counting mismatches in Data_mismatch) and then sleeps SECONDS_TO_WAIT
+// seconds before returning.  The stream, status and userData arguments are
+// not used.
+static void HIPRT_CB Callback1(hipStream_t stream, hipError_t status,
+                               void* userData) {
+  // main() polls this flag to learn that the callback has been entered.
+  Init_callback = true;
+
+  size_t idx = 0;
+  while (idx < N_elmts) {
+    if (C_h[idx] != A_h[idx] * A_h[idx]) {
+      ++Data_mismatch;
+    }
+    ++idx;
+  }
+
+  // Hold the callback open so main() can time hipStreamSynchronize against it
+  sleep(SECONDS_TO_WAIT);
+}
+
+
+// Test driver: queues an upload, the vector_square kernel, a download and
+// one callback on a non-blocking stream.  Once the callback has started,
+// it times hipStreamSynchronize() and passes only if that call returns in
+// less time than the callback sleeps.
+int main(int argc, char* argv[]) {
+  float *A_d, *C_d;
+  size_t Nbytes = N_elmts * sizeof(float);
+
+  A_h = (float*)malloc(Nbytes);
+  HIPCHECK(A_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+  C_h = (float*)malloc(Nbytes);
+  HIPCHECK(C_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+
+  // Fill with Phi + i
+  for (size_t i = 0; i < N_elmts; i++) {
+    A_h[i] = 1.618f + i;
+  }
+
+  HIPCHECK(hipMalloc(&A_d, Nbytes));
+  HIPCHECK(hipMalloc(&C_d, Nbytes));
+
+  HIPCHECK(hipStreamCreateWithFlags(&mystream, hipStreamNonBlocking));
+
+  HIPCHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, mystream));
+
+  const unsigned threadsPerBlock = 256;
+  const unsigned blocks = (N_elmts + 255)/threadsPerBlock;
+
+  hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0,
+                      mystream, C_d, A_d, N_elmts);
+  HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
+  HIPCHECK(hipStreamAddCallback(mystream, Callback1, NULL, 0));
+
+  // Wait until Callback1() changes the Init_callback value to true
+  while (!Init_callback) {}
+
+  // The callback is expected to start only after the work queued before it
+  // has completed, so this hipStreamSynchronize call should not take long.
+  auto start = std::chrono::high_resolution_clock::now();
+  HIPCHECK(hipStreamSynchronize(mystream));
+  auto stop = std::chrono::high_resolution_clock::now();
+  auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
+
+  HIPCHECK(hipStreamDestroy(mystream));
+  HIPCHECK(hipFree(A_d));
+  HIPCHECK(hipFree(C_d));
+  free(A_h);
+  free(C_h);
+
+  if (Data_mismatch.load() != 0) {
+    failed("Output from kernel execution is not as expected");
+  }
+
+  // Callback1() sleeps for SECONDS_TO_WAIT seconds (5000000 microseconds).
+  // duration.count() is expected to be below that, because the test expects
+  // stream synchronization to be complete by the time the callback starts
+  // executing rather than when it finishes.  hipStreamSynchronize() in the
+  // main thread should therefore take hardly any time.
+
+  if (duration.count() < SECONDS_TO_WAIT * TO_MICROSECONDS) {
+    passed();
+  } else {
+    failed("hipStreamSynchronize is waiting untill Callback() completes.");
+  }
+}
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ * */
+
+// Testcase Description: This test case tests if Host thread continues with
+// next command after hipStreamAddCallback() api or wait for callback() call to
+// finish. Ideally Host thread should not wait for callback to finish.
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include "hip/hip_runtime.h"
+#include "test_common.h"
+
+#ifdef __HIP_PLATFORM_HCC__
+#define HIPRT_CB
+#endif
+
+// Set to true by Callback1() once its sleep is over; read by main().
+// NOTE(review): plain bool shared between the callback and main() with no
+// synchronization -- confirm whether this should be std::atomic<bool>.
+bool Callback_Completed{false};
+
+// Stream callback: sleeps for 5 seconds and only then marks itself as
+// completed.  The stream, status and userData arguments are not used.
+void HIPRT_CB Callback1(hipStream_t stream, hipError_t status, void* userData) {
+  const unsigned int delay_seconds = 5;
+  sleep(delay_seconds);
+  Callback_Completed = true;
+}
+
+// Test driver: adds a 5-second callback to a fresh non-blocking stream,
+// sleeps 1 second and then checks that the callback has not completed yet.
+int main(int argc, char* argv[]) {
+  hipStream_t mystream;
+  HIPCHECK(hipStreamCreateWithFlags(&mystream, hipStreamNonBlocking));
+  HIPCHECK(hipStreamAddCallback(mystream, Callback1, NULL, 0));
+  sleep(1);
+
+  // Callback_Completed starts out false and is only set at the very end of
+  // the callback, after its 5 second sleep.  Seeing it true here means the
+  // host thread was held up until the callback finished, which is a failure.
+  const bool host_was_blocked = Callback_Completed;
+
+  HIPCHECK(hipStreamDestroy(mystream));
+  if (host_was_blocked) {
+    failed("Unexpected: Host thread is waiting for callback to finish");
+  } else {
+    passed();
+  }
+}
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ * */
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
+ * TEST: %t
+ * HIT_END
+ */
+
+// Checks the callback execution in the same order it was added
+// Also, it checks if the number of callbacks executed are same as the number
+// of callbacks added
+
+#include <stdio.h>
+#include <atomic>
+#include "hip/hip_runtime.h"
+#include "test_common.h"
+#ifdef __HIP_PLATFORM_HCC__
+#define HIPRT_CB
+#endif
+
+// Number of callbacks added to the stream.
+#define NUM_CALLS 10
+// Stream that all the callbacks are added to.
+hipStream_t mystream;
+// Set by a callback whose position differs from the running counter.
+bool Callback_SequenceMismatch{false};
+// Position the next callback is expected to carry; advanced by each
+// callback that arrives in order.
+std::atomic<int> Cb_ordinal{0};
+
+// Stream callback.  userData points at a heap-allocated int holding the
+// position at which this callback was added.  That position must equal the
+// running counter Cb_ordinal; if so the counter advances, otherwise a
+// sequence mismatch is flagged.  The int is released before returning.
+void HIPRT_CB Stream_Callback(hipStream_t stream, hipError_t status,
+                              void* userData) {
+  int *position = reinterpret_cast<int*>(userData);
+
+  if (*position != Cb_ordinal) {
+    Callback_SequenceMismatch = true;
+  } else {
+    // In order: get ready for the next callback in the sequence
+    Cb_ordinal++;
+  }
+
+  delete position;
+}
+
+// Test driver: adds NUM_CALLS callbacks to one non-blocking stream, each
+// carrying its own position as user data, then verifies that all of them
+// ran and that none of them reported running out of order.
+int main(int argc, char* argv[]) {
+  HIPCHECK(hipStreamCreateWithFlags(&mystream, hipStreamNonBlocking));
+
+  int i = 0;
+  while (i < NUM_CALLS) {
+    // The callback takes ownership of this int and deletes it.
+    int *ptr = new int(i);
+    HIPCHECK(hipStreamAddCallback(mystream, Stream_Callback,
+                                  reinterpret_cast<void*>(ptr), 0));
+    ++i;
+  }
+
+  HIPCHECK(hipStreamSynchronize(mystream));
+  HIPCHECK(hipStreamDestroy(mystream));
+
+  if (Cb_ordinal != NUM_CALLS) {
+    failed("All callbacks for stream did not get called!");
+  }
+
+  if (Callback_SequenceMismatch) {
+    failed("hipStreamAddCallback() calls did not execute in sequence!");
+  } else {
+    passed();
+  }
+}
@@ -0,0 +1,92 @@
+/*
+Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
+ * TEST: %t
+ * HIT_END
+ */
+
+
+#include "test_common.h"
+
+// Test driver: for every visible device, queries the stream priority range
+// and checks the priority reported by hipStreamGetPriority() for a stream
+// created without a priority, with the highest and lowest priority, and
+// with priorities one step outside either end of the range.
+int main(int argc, char *argv[]) {
+  int numDevices = 0;
+  HIPCHECK(hipGetDeviceCount(&numDevices));
+  for (int i = 0; i < numDevices; i++) {
+    // Make device i current so that the range query and the streams created
+    // below belong to it; otherwise every iteration would exercise whichever
+    // device happens to be current.
+    HIPCHECK(hipSetDevice(i));
+
+    hipStream_t stream;
+    int priority;
+    int priority_normal;
+    int priority_low;
+    int priority_high;
+    int priority_check;
+
+    // Test is to get the Stream Priority Range
+    HIPCHECK(hipDeviceGetStreamPriorityRange(&priority_low, &priority_high));
+    printf("Priority range is %d for low and %d for high \n", priority_low, priority_high);
+    priority_normal = priority_low + priority_high;
+
+    // The test requires the range to be symmetric around 0.
+    // NOTE(review): a (0, 0) range would also pass this check -- confirm
+    // that this is the intended "priorities supported" condition.
+    if ((priority_low + priority_high) != 0) {
+      failed("Priorities are not supported");
+    }
+
+    // A stream created without an explicit priority must report the normal
+    // priority
+    HIPCHECK(hipStreamCreate(&stream));
+    HIPCHECK(hipStreamGetPriority(stream, &priority));
+    if (priority_normal != priority) {
+      failed("Unable to set Normal Priority for the stream");
+    }
+    HIPCHECK(hipStreamDestroy(stream));
+
+    // Creating Stream with Priorities
+    HIPCHECK(hipStreamCreateWithPriority(&stream, hipStreamDefault, priority_high));
+    HIPCHECK(hipStreamGetPriority(stream, &priority_check));
+    if (priority_check != priority_high) {
+      failed("Unable to set high priority for the stream");
+    }
+    HIPCHECK(hipStreamDestroy(stream));
+
+    HIPCHECK(hipStreamCreateWithPriority(&stream, hipStreamDefault, priority_low));
+    HIPCHECK(hipStreamGetPriority(stream, &priority_check));
+    if (priority_check != priority_low) {
+      failed("Unable to set low priority for the stream");
+    }
+    HIPCHECK(hipStreamDestroy(stream));
+
+    // Boundary cases: a priority one step outside the range is expected to
+    // be reported back as the nearest end of the range
+    HIPCHECK(hipStreamCreateWithPriority(&stream, hipStreamNonBlocking, priority_low+1));
+    HIPCHECK(hipStreamGetPriority(stream, &priority_check));
+    if (priority_check != priority_low) {
+      failed("setting priority failed ");
+    }
+    HIPCHECK(hipStreamDestroy(stream));
+
+    HIPCHECK(hipStreamCreateWithPriority(&stream, hipStreamNonBlocking, priority_high-1));
+    HIPCHECK(hipStreamGetPriority(stream, &priority_check));
+    if (priority_check != priority_high) {
+      failed("setting priority failed ");
+    }
+    HIPCHECK(hipStreamDestroy(stream));
+  }
+
+  passed();
+  return 0;
+}
@@ -57,7 +57,7 @@ void texture2Dtest()
    
    // Use the texture object
    hipResourceDesc texRes;
-    hipMemset(&texRes, 0, sizeof(texRes));
+    memset(&texRes, 0, sizeof(texRes));
    texRes.resType = hipResourceTypePitch2D;
    texRes.res.pitch2D.devPtr = devPtrA;
    texRes.res.pitch2D.height = SIZE_H;
@@ -66,7 +66,7 @@ void texture2Dtest()
    texRes.res.pitch2D.desc = hipCreateChannelDesc<TYPE_t>();

    hipTextureDesc texDescr;
-    hipMemset(&texDescr, 0, sizeof(texDescr));
+    memset(&texDescr, 0, sizeof(texDescr));
    texDescr.normalizedCoords = false;
    texDescr.filterMode = hipFilterModePoint;
    texDescr.mipmapFilterMode = hipFilterModePoint;