Revert "Revert "Merge branch 'amd-master-next' into amd-npi-next""

This reverts commit 28b17d3dbd. Reason for revert: <INSERT REASONING HERE> Change-Id: I92ceb171e31026ed1864704cef2fc1497b883ef9 [ROCm/hip commit: ad2d55c144]
2020-10-05 13:20:58 -04:00
@@ -8,10 +8,15 @@ set(BUILD_SHARED_LIBS ON  CACHE BOOL "Build shared library (.so) or static lib (

 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

-if(NOT ${BUILD_SHARED_LIBS} AND NOT DEFINED ENABLE_HIP_PCH)
-  set(ENABLE_HIP_PCH ON CACHE BOOL "enable/disable pre-compiled hip headers")
+if(NOT DEFINED __HIP_ENABLE_PCH)
+  set(__HIP_ENABLE_PCH ON CACHE BOOL "enable/disable pre-compiled hip headers")
 endif()

+if(__HIP_ENABLE_PCH)  # pass the name: an expanded value would be dereferenced a second time by if()
+  set(_pchStatus 1)  # later written into hip_version.h as __HIP_HAS_GET_PCH
+else()
+  set(_pchStatus 0)
+endif()
 #############################
 # Options
 #############################
@@ -80,8 +85,8 @@ if(GIT_FOUND)

  set(HIP_VERSION_PATCH ${HIP_VERSION_GITDATE}-${HIP_VERSION_GITHASH})

-  if(DEFINED ENV{ROCM_BUILD_ID})
-    set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}-$ENV{ROCM_BUILD_ID}-${HIP_VERSION_GITHASH})
+  if(DEFINED ENV{ROCM_LIBPATCH_VERSION})
+    set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}.$ENV{ROCM_LIBPATCH_VERSION})
  else()
    set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}-${HIP_VERSION_GITHASH})
  endif()
@@ -90,6 +95,36 @@ else()
  set(HIP_PACKAGING_VERSION_PATCH "0")
 endif()

+## Debian package specific variables
+if ( DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE} )
+  set ( CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE} )
+else()
+  set ( CPACK_DEBIAN_PACKAGE_RELEASE "local" )
+endif()
+message ( "Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}" )
+
+## RPM package specific variables
+if ( DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE} )
+  set ( CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE} )
+else()
+  set ( CPACK_RPM_PACKAGE_RELEASE "local" )
+endif()
+
+## 'dist' breaks manual builds on debian systems due to empty Provides
+execute_process( COMMAND rpm --eval %{?dist}
+                 RESULT_VARIABLE PROC_RESULT
+                 OUTPUT_VARIABLE EVAL_RESULT
+                 OUTPUT_STRIP_TRAILING_WHITESPACE )
+
+if ( PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "" )
+  string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" )
+endif()
+message("CPACK_RPM_PACKAGE_RELEASE: ${CPACK_RPM_PACKAGE_RELEASE}")
+
+add_to_config(_versionInfo HIP_PACKAGING_VERSION_PATCH)
+add_to_config(_versionInfo CPACK_DEBIAN_PACKAGE_RELEASE)
+add_to_config(_versionInfo CPACK_RPM_PACKAGE_RELEASE)
+
 add_to_config(_versionInfo HIP_VERSION_MAJOR)
 add_to_config(_versionInfo HIP_VERSION_MINOR)
 add_to_config(_versionInfo HIP_VERSION_PATCH)
@@ -102,7 +137,6 @@ else ()
   set (HIP_LIB_VERSION_PATCH ${HIP_VERSION_PATCH})
 endif ()
 set (HIP_LIB_VERSION_STRING "${HIP_LIB_VERSION_MAJOR}.${HIP_LIB_VERSION_MINOR}.${HIP_LIB_VERSION_PATCH}")
-
 if (DEFINED ENV{ROCM_RPATH})
    set (CMAKE_INSTALL_RPATH "$ENV{ROCM_RPATH}")
    set (CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
@@ -456,6 +490,7 @@ set(_versionInfoHeader
 #define HIP_VERSION_MINOR ${HIP_VERSION_MINOR}
 #define HIP_VERSION_PATCH ${HIP_VERSION_GITDATE}
 #define HIP_VERSION       (HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR)\n
+#define __HIP_HAS_GET_PCH ${_pchStatus}\n
 #endif\n
 ")
 file(WRITE "${PROJECT_BINARY_DIR}/include/hip/hip_version.h" ${_versionInfoHeader})
@@ -669,8 +704,11 @@ endif()
 # Testing steps
 #############################
 # Target: test
-set(HIP_ROOT_DIR ${CMAKE_INSTALL_PREFIX})
+set(HIP_ROOT_DIR ${CMAKE_CURRENT_BINARY_DIR})
 set(HIP_SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR})
+if(HIP_PLATFORM STREQUAL "nvcc")
+    execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/include" "${CMAKE_CURRENT_BINARY_DIR}/include" RESULT_VARIABLE RUN_HIT ERROR_QUIET)  # stage source headers in the build tree; RUN_HIT is reassigned by the next copy
+endif()
 execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/cmake" "${HIP_ROOT_DIR}/cmake" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
 if(${RUN_HIT} EQUAL 0)
    execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/bin" "${HIP_ROOT_DIR}/bin" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
@@ -713,7 +751,7 @@ endif()
 #############################
 # Target: clang
 if(HIP_HIPCC_EXECUTABLE)
-    add_custom_target(analyze 
+    add_custom_target(analyze
        COMMAND ${HIP_HIPCC_EXECUTABLE} -fvisibility=hidden -fvisibility-inlines-hidden --analyze --analyzer-outputtext  -isystem /opt/rocm/include ${HIP_HCC_BUILD_FLAGS} -Wno-unused-command-line-argument -I/opt/rocm/include -c  src/*.cpp -Iinclude/ -I./
    WORKING_DIRECTORY ${HIP_SRC_PATH})
    if(CPPCHECK_EXE)
@@ -1,15 +1,15 @@
-# Contributor Guidelines 
+# Contributor Guidelines

 ## Make Tips
-When building HIP, you will likely want to build and install to a local user-accessible directory (rather than /opt/rocm).  
-This can be easily be done by setting the -DCMAKE_INSTALL_PREFIX variable when running cmake.  Typical use case is to 
+When building HIP, you will likely want to build and install to a local user-accessible directory (rather than /opt/rocm).
+This can easily be done by setting the -DCMAKE_INSTALL_PREFIX variable when running cmake.  A typical use case is to
 set CMAKE_INSTALL_PREFIX to your HIP git root, and then ensure HIP_PATH points to this directory.   For example

 ```
 cmake .. -DCMAKE_INSTALL_PREFIX=..
 make install

-export HIP_PATH= 
+export HIP_PATH=
 ```

 After making HIP, don't forget the "make install" step !
@@ -21,118 +21,110 @@ After making HIP, don't forget the "make install" step !
    - Add a translation to the hipify-clang tool ; many examples abound.
       - For stat tracking purposes, place the API into an appropriate stat category ("dev", "mem", "stream", etc).
    - Add a inlined NVCC implementation for the function in include/hip/nvcc_detail/hip_runtime_api.h.
-       - These are typically headers 
-    - Add an HCC definition and Doxygen comments for the function in include/hcc_detail/hip_runtime_api.h
-       - Source implementation typically go in src/hcc_detail/hip_hcc.cpp. The implementation may involve 
-         calls to HCC runtime or HSA runtime, or interact with other pieces of the HIP runtime (ie for 
-         hipStream_t).
+       - These are typically headers
+    - Add an HIP_ROCclr definition and Doxygen comments for the function in include/hcc_detail/hip_runtime_api.h
+       - Source implementations typically go in hip/rocclr/hip_*.cpp. The implementation involves calls to the HIP runtime (e.g. for hipStream_t).

-#### Testing HCC version
-In some cases new HIP features are tied to specified releases of HCC, and it can be useful to determine at compile-time
-if the current HCC compiler is sufficiently new enough to support the desired feature.  The `__hcc_workweek__` compiler
-define is a monotonically increasing integer value that combines the year + workweek + day-of-week (0-6, Sunday is 0) 
-(ie 15403, 16014, etc).   
-The granularity is one day, so __hcc_workweek__  can only be used to distinguish compiler builds that are at least one day apart.
+## Check HIP-Clang version
+In some cases new HIP-Clang features are tied to specific releases, and it can be useful to check that the current version is new enough to support the desired feature.
+
+HIP runtime version

 ```
-#ifdef __hcc_workweek_ > 16014
-// use cool new HCC feature here
-#endif
+> cat /opt/rocm/hip/bin/.hipVersion
+# Auto-generated by cmake
+HIP_VERSION_MAJOR=3
+HIP_VERSION_MINOR=9
+HIP_VERSION_PATCH=20345-519ef3f2
 ```

-Additionally, hcc binary can print the work-week to stdout: ("16014" in the version info below.)4
+HIP-Clang compiler version
+
 ```
-> /opt/rocm/hcc/bin/hcc -v
-HCC clang version 3.5.0  (based on HCC 0.8.16014-81f8a3f-f155163-5a1009a LLVM 3.5.0svn)
+$ /opt/rocm/llvm/bin/clang -v
+clang version 11.0.0 (/src/external/llvm-project/clang 075fedd3fd2f4d9d8cca79d0cd51f64c5ef21432)
 Target: x86_64-unknown-linux-gnu
 Thread model: posix
-Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8
-Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8.4
-Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.9
-Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.9.1
-Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8
+InstalledDir: /opt/rocm/llvm/bin
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7.5.0
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/8
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/9
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/9
 Candidate multilib: .;@m64
 Candidate multilib: 32;@m32
 Candidate multilib: x32;@mx32
 Selected multilib: .;@m64
 ```

-The unix `date` command can print the HCC-format work-week for a specific date , ie:
-```
-> date --utc +%y%U%w -d 2015-11-09
-15451
-```
-
 ## Unit Testing Environment

-HIP includes unit tests in the tests/src directory.  
+HIP includes unit tests in the tests/src directory.
 When adding a new HIP feature, add a new unit test as well.
 See [tests/README.md](README.md) for more information.

 ## Development Flow
-It is recommended that developers set the flag HIP_BUILD_LOCAL=1 so that the unit testing environment automatically rebuilds libhip_hcc.a and the tests when a change it made to the HIP source. 
-Directed tests provide a great place to develop new features alongside the associated test.  
+
+Directed tests provide a great place to develop new features alongside the associated test.

 For applications and benchmarks outside the directed test environment, developments should use a two-step development flow:
- #1. Compile, link, and install HCC.  See [Installation](README.md#Installation) notes.
- #2. Relink the target application to include changes in the libhip_hcc.a file.
+- #1. Compile, link, and install HIP/ROCclr.  See [Installation](README.md#Installation) notes.
+- #2. Relink the target application to include the changes in the HIP runtime library.

 ## Environment Variables
- **HIP_PATH** : Location of HIP include, src, bin, lib directories.  
- **HCC_HOME** : Path to HCC compiler.  Default /opt/rocm/hcc.
+- **HIP_PATH** : Location of HIP include, src, bin, lib directories.
+- **HCC_ROCCLR_HOME** : Path to HIP/ROCclr directory, used on AMD platforms.  Default /opt/rocm/rocclr.
 - **HSA_PATH** : Path to HSA include, lib.  Default /opt/rocm/hsa.
 - **CUDA_PATH* : On nvcc system, this points to root of CUDA installation.

-### Contribution guidelines ###
+## Contribution guidelines ##

 Features (ie functions, classes, types) defined in hip*.h should resemble CUDA APIs.
 The HIP interface is designed to be very familiar for CUDA programmers.

-Differences or limitations of HIP APIs as compared to CUDA APIs should be clearly documented and described. 
+Differences or limitations of HIP APIs as compared to CUDA APIs should be clearly documented and described.

-## Coding Guidelines (in brief)
+### Coding Guidelines (in brief)
 - Code Indentation:
    - Tabs should be expanded to spaces.
    - Use 4 spaces indentation.
 - Capitalization and Naming
-    - Prefer camelCase for HIP interfaces and internal symbols.  Note HCC uses _ for separator.  
+    - Prefer camelCase for HIP interfaces and internal symbols.  Note HCC uses _ for separator.
      This guideline is not yet consistently followed in HIP code - eventual compliance is aspirational.
    - Member variables should begin with a leading "_".  This allows them to be easily distinguished from other variables or functions.
-    

 - {} placement
    - For functions, the opening { should be placed on a new line.
    - For if/else blocks, the opening { is placed on same line as the if/else. Use a space to separate {/" from if/else.  Example
 '''
    if (foo) {
-        doFoo() 
-    } else { 
+        doFoo()
+    } else {
        doFooElse();
    }
 '''
    - namespace should be on same line as { and separated by a space.
    - Single-line if statement should still use {/} pair (even though C++ does not require).
 - Miscellaneous
-    - All references in function parameter lists should be const.  
+    - All references in function parameter lists should be const.
    - "ihip" = internal hip structures.  These should not be exposed through the HIP API.
    - Keyword TODO refers to a note that should be addressed in long-term.  Could be style issue, software architecture, or known bugs.
    - FIXME refers to a short-term bug that needs to be addressed.

 - HIP_INIT_API() should be placed at the start of each top-level HIP API.  This function will make sure the HIP runtime is initialized,
  and also constructs an appropriate API string for tracing and CodeXL marker tracing.  The arguments to HIP_INIT_API should match
-  those of the parent function.  
- ihipLogStatus should only be called from top-level HIP APIs,and should be called to log and return the error code.  The error code 
+  those of the parent function.
+- ihipLogStatus should only be called from top-level HIP APIs, and should be called to log and return the error code.  The error code
  is used by the GetLastError and PeekLastError functions - if a HIP API simply returns, then the error will not be logged correctly.

 - All HIP environment variables should begin with the keyword HIP_
    Environment variables should be long enough to describe their purpose but short enough so they can be remembered - perhaps 10-20 characters, with 3-4 parts separated by underscores.
    To see the list of current environment variables, along with their values, set HIP_PRINT_ENV and run any hip applications on ROCm platform .
-    HIPCC or other tools may support additional environment variables which should follow the above convention.  
+    HIPCC or other tools may support additional environment variables which should follow the above convention.


-
-#### Presubmit Testing:
-Before checking in or submitting a pull request, run all directed tests (see tests/README.md) and all Rodinia tests.  
+### Presubmit Testing:
+Before checking in or submitting a pull request, run all directed tests (see tests/README.md) and all Rodinia tests.
 Ensure pass results match starting point:

 ```shell
@@ -141,13 +133,13 @@ Ensure pass results match starting point:
 ```


-#### Checkin messages
+### Checkin messages
 Follow existing best practice for writing a good Git commit message.    Some tips:
    http://chris.beams.io/posts/git-commit/
    https://robots.thoughtbot.com/5-useful-tips-for-a-better-commit-message

-In particular : 
-   - Use imperative voice, ie "Fix this bug", "Refactor the XYZ routine", "Update the doc".  
+In particular :
+   - Use imperative voice, ie "Fix this bug", "Refactor the XYZ routine", "Update the doc".
     Not : "Fixing the bug", "Fixed the bug", "Bug fix", etc.
   - Subject should summarize the commit.  Do not end subject with a period.  Use a blank line
     after the subject.
@@ -1,8 +1,7 @@
 #!/bin/bash

 #set -x
-
-ROCM_PATH=${ROCM_PATH:-/opt/rocm}
+LLVM_DIR="${1:?path argument required: LLVM is looked up at <arg>/../../..}/../../../"  # abort instead of silently resolving to / when no argument is given
 tmp=/tmp/hip_pch.$$
 mkdir -p $tmp

@@ -47,12 +46,12 @@ __hip_pch_size:
  .long __hip_pch_size - __hip_pch
 EOF

-$ROCM_PATH/llvm/bin/clang -O3 -c -std=c++17 -isystem /opt/rocm/llvm/lib/clang/11.0.0/include/.. -isystem /opt/rocm/include -nogpulib --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch.cui
+"$LLVM_DIR/bin/clang" -O3 -c -std=c++17 -isystem "$LLVM_DIR/lib/clang/11.0.0/include/.." -isystem /opt/rocm/include -nogpulib --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch.cui

 cat $tmp/hip_macros.h >> $tmp/pch.cui

-$ROCM_PATH/llvm/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip.pch -x hip-cpp-output - <$tmp/pch.cui
+"$LLVM_DIR/bin/clang" -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip.pch -x hip-cpp-output - <$tmp/pch.cui

-$ROCM_PATH/llvm/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj
+"$LLVM_DIR/bin/llvm-mc" -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj

 rm -rf $tmp
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-#set -x
-
-cat >/tmp/hip_macros.h <<EOF
-#define __device__ __attribute__((device))
-#define __host__ __attribute__((host))
-#define __global__ __attribute__((global))
-#define __constant__ __attribute__((constant))
-#define __shared__ __attribute__((shared))
-
-#define launch_bounds_impl0(requiredMaxThreadsPerBlock)                                            \
-    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
-#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)                \
-    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),                     \
-                   amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
-#define select_impl_(_1, _2, impl_, ...) impl_
-#define __launch_bounds__(...)                                                                     \
-    select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
-
-// Macro to replace extern __shared__ declarations
-// to local variable definitions
-#define HIP_DYNAMIC_SHARED(type, var) \
-    type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();
-EOF
-
-cat >/tmp/hip_pch.h <<EOF
-#include "hip/hip_runtime.h"
-#include "hip/hip_fp16.h"
-EOF
-
-/opt/rocm/llvm/bin/clang -O3 -c -std=c++17 -isystem /opt/rocm/llvm/lib/clang/11.0.0/include/.. -isystem /opt/rocm/include -nogpulib --cuda-device-only -x hip /tmp/hip_pch.h -E >/tmp/pch.cui
-
-cat /tmp/hip_macros.h >> /tmp/pch.cui
-
-/opt/rocm/llvm/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o /tmp/hip.pch -x hip-cpp-output - </tmp/pch.cui
@@ -803,7 +803,8 @@ if ($needHipHcc) {
    if ($linkType eq 0) {
        substr($HIPLDFLAGS,0,0) = "  $HIP_LIB_PATH/libamdhip64.a " ;
    } else {
-        substr($HIPLDFLAGS,0,0) = "  -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib $HIP_LIB_PATH/libamdhip64.so ";
+        # On CentOS some ROCm libraries are installed in lib64 and the rest in lib, so both directories go into the rpath.
+        substr($HIPLDFLAGS,0,0) = "  -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib:$ROCM_PATH/lib64 $HIP_LIB_PATH/libamdhip64.so ";
    }
 }

@@ -247,4 +247,4 @@ The workaround is to explicitly add the keyword of "static" before any functions
 Product of block.x, block.y, and block.z should be less than 1024.

 ### Are __shfl_*_sync functions supported on HIP platform?
-__shfl_*_sync is not supported on HIP but for nvcc path CUDA 9.0 and above all shuffle calls get redirected to it's sync version.
+__shfl_*_sync is not supported on HIP, but on the nvcc path with CUDA 9.0 and above all shuffle calls get redirected to their sync versions.
@@ -54,7 +54,18 @@ set_and_check( hip_BIN_INSTALL_DIR "@PACKAGE_BIN_INSTALL_DIR@" )
 set_and_check(hip_HIPCC_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipcc")
 set_and_check(hip_HIPCONFIG_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipconfig")

+# Default ROCM_PATH to /opt/rocm when it has not been defined already
+if(NOT DEFINED ROCM_PATH)
+  set(ROCM_PATH /opt/rocm)
+endif()
+
+# If HIP is not installed under ROCm, take ROCM_PATH from the environment so HSA can still be found (assuming HSA is under ROCm)
+if(DEFINED ENV{ROCM_PATH})
+  set(ROCM_PATH "$ENV{ROCM_PATH}")
+endif()
+
 if(HIP_COMPILER STREQUAL "clang")
+  set(HIP_CLANG_ROOT "${ROCM_PATH}/llvm")
  if(NOT HIP_CXX_COMPILER)
    set(HIP_CXX_COMPILER ${CMAKE_CXX_COMPILER})
  endif()
@@ -62,16 +73,12 @@ if(HIP_COMPILER STREQUAL "clang")
    execute_process(COMMAND ${HIP_CXX_COMPILER} --version
                    OUTPUT_STRIP_TRAILING_WHITESPACE
                    OUTPUT_VARIABLE HIP_CLANG_CXX_COMPILER_VERSION_OUTPUT)
-    if(HIP_CLANG_CXX_COMPILER_VERSION_OUTPUT MATCHES "InstalledDir:[\t\r\n][\t\r\n]*([^\t\r\n])")
-      set(HIP_CLANG_ROOT ${CMAKE_MATCH_1})
-    else()
-      set(HIP_CLANG_ROOT /opt/rocm/llvm)
+    if(HIP_CLANG_CXX_COMPILER_VERSION_OUTPUT MATCHES "InstalledDir:[ \t]*([^\n]*)")
+      get_filename_component(HIP_CLANG_ROOT "${CMAKE_MATCH_1}" DIRECTORY)
    endif()
  elseif (HIP_CXX_COMPILER MATCHES ".*clang\\+\\+")
-    get_filename_component(HIP_CLANG_ROOT "${HIP_CXX_COMPILER}" PATH)
-    get_filename_component(HIP_CLANG_ROOT "${HIP_CLANG_ROOT}" PATH)
-  else()
-    set(HIP_CLANG_ROOT /opt/rocm/llvm)
+    get_filename_component(HIP_CLANG_ROOT "${HIP_CXX_COMPILER}" DIRECTORY)
+    get_filename_component(HIP_CLANG_ROOT "${HIP_CLANG_ROOT}" DIRECTORY)
  endif()
  file(GLOB HIP_CLANG_INCLUDE_SEARCH_PATHS ${HIP_CLANG_ROOT}/lib/clang/*/include)
  find_path(HIP_CLANG_INCLUDE_PATH stddef.h
@@ -89,11 +96,6 @@ find_dependency(amd_comgr)

 include( "${CMAKE_CURRENT_LIST_DIR}/hip-targets.cmake" )

-#If HIP isnot installed under ROCm, need this to find HSA assuming HSA is under ROCm
-if( DEFINED ENV{ROCM_PATH} )
-     set(ROCM_PATH "$ENV{ROCM_PATH}")
-endif()
-
 #Using find_dependecy to locate the dependency for the packagaes
 #This makes the cmake generated file xxxx-targets to supply the linker libraries
 # without worrying other transitive dependencies
@@ -365,6 +365,25 @@ long __shfl(long var, int src_lane, int width = warpSize)
 }
 __device__
 inline
+unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
+    #ifndef _MSC_VER
+    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl(tmp[0], src_lane, width);
+    tmp[1] = __shfl(tmp[1], src_lane, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+    return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
+    #endif
+}
+__device__
+inline
 long long __shfl(long long var, int src_lane, int width = warpSize)
 {
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
@@ -378,8 +397,22 @@ long long __shfl(long long var, int src_lane, int width = warpSize)
    long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
 }
+__device__
+inline
+unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

- __device__
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl(tmp[0], src_lane, width);
+    tmp[1] = __shfl(tmp[1], src_lane, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+
+__device__
 inline
 int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
    int self = __lane_id();
@@ -435,6 +468,28 @@ long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
    return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
    #endif
 }
+
+__device__
+inline
+unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+    return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
+    #endif
+}
+
 __device__
 inline
 long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
@@ -449,6 +504,20 @@ long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize
    return tmp1;
 }

+__device__
+inline
+unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
+{
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
+
 __device__
 inline
 int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
@@ -507,6 +576,26 @@ long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
 }
 __device__
 inline
+unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+    return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
+    #endif
+}
+__device__
+inline
 long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
 {
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
@@ -518,6 +607,19 @@ long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSi
    long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
 }
+__device__
+inline
+unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
+{
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
+    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}

 __device__
 inline
@@ -577,6 +679,26 @@ long __shfl_xor(long var, int lane_mask, int width = warpSize)
 }
 __device__
 inline
+unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
+    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
+
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+    #else
+    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+    return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
+    #endif
+}
+__device__
+inline
 long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
 {
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
@@ -588,7 +710,19 @@ long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
    long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
 }
-
+__device__
+inline
+unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
+{
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
+    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
+    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    unsigned long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+    return tmp1;
+}
 #define MASK1 0x00ff00ff
 #define MASK2 0xff00ff00

@@ -487,6 +487,22 @@ struct __HIP_Coordinates {
 #endif

 };
+template <typename F>
+#if !defined(_MSC_VER)
+__attribute__((weak))
+#endif
+constexpr typename __HIP_Coordinates<F>::X __HIP_Coordinates<F>::x;
+template <typename F>
+#if !defined(_MSC_VER)
+__attribute__((weak))
+#endif
+constexpr typename __HIP_Coordinates<F>::Y __HIP_Coordinates<F>::y;
+template <typename F>
+#if !defined(_MSC_VER)
+__attribute__((weak))
+#endif
+constexpr typename __HIP_Coordinates<F>::Z __HIP_Coordinates<F>::z;
+
 extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_size(uint);
 inline
 __device__
@@ -345,13 +345,16 @@ typedef struct hipLaunchParams_t {
    hipStream_t stream;     ///< Stream identifier
 } hipLaunchParams;

-// Pre-Compiled header for online compilation
-#ifdef ENABLE_HIP_PCH
-extern const char* __hip_pch;
-extern unsigned __hip_pch_size;
-void __hipGetPCH(const char** pch, unsigned int*size);
+#if __HIP_HAS_GET_PCH
+/**
+ * Internal use only. This API may change in the future
+ * Pre-Compiled header for online compilation
+ *
+ */
+    void __hipGetPCH(const char** pch, unsigned int*size);
 #endif

+
 // Doxygen end group GlobalDefs
 /**  @} */

@@ -28,14 +28,17 @@ THE SOFTWARE.
 */

 #ifndef  HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H
-#define HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H
+#define  HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H
+
+#include <hip/hip_version.h>
+#include <hip/hip_common.h>

 #if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
-#if __cplusplus
+#if __cplusplus && defined(__clang__) && defined(__HIP__)
 #include <hip/hcc_detail/hip_cooperative_groups.h>
 #endif
 #elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
-#include <cooperative_groups.h>
+#include <hip/nvcc_detail/hip_cooperative_groups.h>
 #else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
 #endif
@@ -32,6 +32,7 @@ THE SOFTWARE.


 #include <string.h>  // for getDeviceProp
+#include <hip/hip_version.h>
 #include <hip/hip_common.h>

 enum {
@@ -0,0 +1,12 @@
+#ifndef HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
+#define HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
+
+// Include CUDA headers
+#include <cuda_runtime.h>
+#include <cooperative_groups.h>
+
+// Include HIP wrapper headers around CUDA
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+
+#endif // HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
@@ -104,13 +104,13 @@ typedef int hipLaunchParm;
 #define HIP_DYNAMIC_SHARED_ATTRIBUTE

 #ifdef __HIP_DEVICE_COMPILE__
-#define abort()                                                                                    \
+#define abort_()                                                                                    \
    { asm("trap;"); }
 #undef assert
 #define assert(COND)                                                                               \
    {                                                                                              \
        if (!COND) {                                                                               \
-            abort();                                                                               \
+            abort_();                                                                               \
        }                                                                                          \
    }
 #endif
@@ -26,6 +26,7 @@ THE SOFTWARE.
 #include <cuda_runtime_api.h>
 #include <cuda.h>
 #include <cuda_profiler_api.h>
+#include <cuda_fp16.h>

 #ifdef __cplusplus
 extern "C" {
@@ -20,6 +20,7 @@ target_include_directories(lpl

 target_compile_options(lpl PUBLIC -Wall)
 target_link_libraries(lpl PUBLIC pthread)
+add_custom_command(TARGET lpl POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:lpl> ${PROJECT_BINARY_DIR}/bin/lpl VERBATIM)  # stage the built lpl binary under bin/ in the build tree; $<TARGET_FILE:...> resolves its real output path

 install(TARGETS lpl RUNTIME DESTINATION bin)
 #-------------------------------------LPL--------------------------------------#
@@ -43,6 +44,7 @@ find_package(hsa-runtime64 REQUIRED CONFIG

 target_link_libraries(ca PUBLIC hsa-runtime64::hsa-runtime64 )
 target_compile_options(ca PUBLIC -DDISABLE_REDUCED_GPU_BLOB_COPY -Wall)
+add_custom_command(TARGET ca POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:ca> ${PROJECT_BINARY_DIR}/bin/ca VERBATIM)  # stage the built ca binary under bin/ in the build tree; $<TARGET_FILE:...> resolves its real output path

 install(TARGETS ca RUNTIME DESTINATION bin)
 #-------------------------------------CA---------------------------------------#
@@ -21,22 +21,23 @@ set(CPACK_PACKAGE_NAME "hip-base")
 set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [BASE]")
 set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
 set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
-set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
 set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
 set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
-set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
-set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
+set(CPACK_PACKAGE_VERSION_PATCH @HIP_PACKAGING_VERSION_PATCH@)
+set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
 set(CPACK_GENERATOR "TGZ;DEB;RPM")

 set(CPACK_BINARY_DEB "ON")
-set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
+set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
+set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
 set(CPACK_DEBIAN_PACKAGE_DEPENDS "perl (>= 5.0),libfile-which-perl")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-base")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_base")

 set(CPACK_BINARY_RPM "ON")
-set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
+set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
+set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
 set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
@@ -24,25 +24,26 @@ set(CPACK_PACKAGE_NAME "hip-doc")
 set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [DOCUMENTATION]")
 set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
 set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
-set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
 set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
 set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
-set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
-set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
+set(CPACK_PACKAGE_VERSION_PATCH @HIP_PACKAGING_VERSION_PATCH@)
+set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
 set(CPACK_GENERATOR "TGZ;DEB;RPM")

 set(CPACK_BINARY_DEB "ON")
-set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})")
+set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
+set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE})")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-doc")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_doc")

 set(CPACK_BINARY_RPM "ON")
-set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
+set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
+set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
 string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
-set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}")
+set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}")
 set(CPACK_RPM_PACKAGE_OBSOLETES "hip_doc")
 set(CPACK_RPM_PACKAGE_CONFLICTS "hip_doc")
 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
@@ -28,24 +28,29 @@ endif()
 set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [HCC]")
 set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
 set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
-set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
 set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
 set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
 set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
-set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
+set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
+
 set(CPACK_BINARY_DEB "ON")
+set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
+set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_hcc")
+
 set(CPACK_BINARY_RPM "ON")
+set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
+set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
 set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
 string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
-set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, comgr >= 1.1")
+set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, comgr >= 1.1")
 set(CPACK_RPM_PACKAGE_OBSOLETES "hip_hcc")
 set(CPACK_RPM_PACKAGE_CONFLICTS "hip_hcc")
 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
@@ -10,28 +10,29 @@ set(CPACK_PACKAGE_NAME "hip-nvcc")
 set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [NVCC]")
 set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
 set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
-set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
 set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
 set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
 set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
-set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
+set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
 set(CPACK_GENERATOR "TGZ;DEB;RPM")

 set(CPACK_BINARY_DEB "ON")
-set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
+set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
+set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), cuda (>= 7.5)")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), cuda (>= 7.5)")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-nvcc")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_nvcc")

 set(CPACK_BINARY_RPM "ON")
-set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
+set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
+set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
 set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
 string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
-set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, cuda >= 7.5")
+set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, cuda >= 7.5")
 set(CPACK_RPM_PACKAGE_OBSOLETES "hip_nvcc")
 set(CPACK_RPM_PACKAGE_CONFLICTS "hip_nvcc")
 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
@@ -33,27 +33,28 @@ set(HCC_PACKAGE_NAME "rocclr")
 set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [ROCClr]")
 set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
 set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
-set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
 set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
 set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
 set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
-set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
+set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
 set(CPACK_GENERATOR "TGZ;DEB;RPM")

 set(CPACK_BINARY_DEB "ON")
-set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
+set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
+set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo, hip-base (= ${CPACK_PACKAGE_VERSION}),  comgr (>= 1.1), llvm-amdgpu")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo, hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}),  comgr (>= 1.1), llvm-amdgpu")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc (= ${CPACK_PACKAGE_VERSION})")

 set(CPACK_BINARY_RPM "ON")
-set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
+set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
+set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
 set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
 string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
-set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocminfo, hip-base = ${HIP_BASE_VERSION},  comgr >= 1.1, llvm-amdgpu")
+set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocminfo, hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE},  comgr >= 1.1, llvm-amdgpu")
 set(CPACK_RPM_PACKAGE_PROVIDES "hip-hcc = ${HIP_BASE_VERSION}")
 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
 set(CPACK_SOURCE_GENERATOR "TGZ")
@@ -12,25 +12,26 @@ set(CPACK_PACKAGE_NAME "hip-samples")
 set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [SAMPLES]")
 set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
 set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
-set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
 set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
 set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
 set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
-set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
+set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
 set(CPACK_GENERATOR "TGZ;DEB;RPM")

 set(CPACK_BINARY_DEB "ON")
-set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})")
+set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
+set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE})")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-samples")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_samples")

 set(CPACK_BINARY_RPM "ON")
-set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
+set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
+set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
 string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
-set(CPACK_RPM_PACKAGE_REQUIRES "hip-rocclr = ${HIP_BASE_VERSION}")
+set(CPACK_RPM_PACKAGE_REQUIRES "hip-rocclr = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}")
 set(CPACK_RPM_PACKAGE_OBSOLETES "hip_samples")
 set(CPACK_RPM_PACKAGE_CONFLICTS "hip_samples")
 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
@@ -96,6 +96,14 @@ find_package(amd_comgr REQUIRED CONFIG

 message(STATUS "Code Object Manager found at ${amd_comgr_DIR}.")

+# Locate the LLVM CMake package (config mode, mandatory). /opt/rocm/llvm is supplied
+# as an extra search location; the resolved LLVM_DIR is reported below.
+find_package(LLVM REQUIRED CONFIG
+   PATHS /opt/rocm/llvm
+   PATH_SUFFIXES lib/cmake/llvm)
+
+message(STATUS "llvm found at ${LLVM_DIR}.")
+
 add_library(hip64 OBJECT
 hip_context.cpp
 hip_code_object.cpp
@@ -148,10 +156,9 @@ endif()

 # Short-Term solution for pre-compiled headers for online compilation
 # Enable pre compiled header
-if(${ENABLE_HIP_PCH})
-    execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_gen_pch.sh")
-    execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_embed_pch.sh")
-    add_definitions(-DENABLE_HIP_PCH)
+if(${__HIP_ENABLE_PCH})  # only when the pre-compiled-header switch is on
+    execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_embed_pch.sh ${LLVM_DIR}")  # runs the script at configure time with LLVM_DIR as its argument; NOTE(review): the exit status is not checked
+    add_definitions(-D__HIP_ENABLE_PCH)  # directory-scope define; enables the sources' #ifdef __HIP_ENABLE_PCH sections
 endif()

 # Enable profiling API
@@ -216,7 +223,7 @@ add_library(device INTERFACE)
 target_link_libraries(device INTERFACE host)

 # Short-Term solution for pre-compiled headers for online compilation
-if(${ENABLE_HIP_PCH})
+if(${__HIP_ENABLE_PCH})
  target_link_libraries(amdhip64 PRIVATE ${CMAKE_BINARY_DIR}/hip_pch.o)
 endif()

@@ -227,6 +234,18 @@ endif()
 # filename.
 if(${BUILD_SHARED_LIBS})
    target_link_libraries(amdhip64 PRIVATE amdrocclr_static Threads::Threads dl hsa-runtime64::hsa-runtime64)
+
+    # After linking amdhip64: create the libhip_hcc.so symlink chain, then stage .hipInfo and the headers.
+    add_custom_command(TARGET amdhip64 POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E create_symlink
+            ${PROJECT_BINARY_DIR}/lib/libamdhip64.so.${HIP_LIB_VERSION_STRING} ${PROJECT_BINARY_DIR}/lib/libhip_hcc.so.${HIP_LIB_VERSION_MAJOR}
+        COMMAND ${CMAKE_COMMAND} -E create_symlink
+            ${PROJECT_BINARY_DIR}/lib/libhip_hcc.so.${HIP_LIB_VERSION_MAJOR} ${PROJECT_BINARY_DIR}/lib/libhip_hcc.so
+        COMMAND ${CMAKE_COMMAND} -E copy
+            ${PROJECT_BINARY_DIR}/.hipInfo ${PROJECT_BINARY_DIR}/lib/.hipInfo
+        COMMAND ${CMAKE_COMMAND} -E copy_directory
+            ${PROJECT_SOURCE_DIR}/include ${PROJECT_BINARY_DIR}/include)
+
    INSTALL(PROGRAMS $<TARGET_FILE:amdhip64> DESTINATION lib COMPONENT MAIN)
 else()
    target_link_libraries(amdhip64 PRIVATE Threads::Threads dl hsa-runtime64::hsa-runtime64 amd_comgr)
@@ -244,6 +263,7 @@ else()
    INSTALL(PROGRAMS $<TARGET_FILE:amdhip64> DESTINATION lib COMPONENT MAIN)
 endif()

+
 INSTALL(TARGETS amdhip64 host device EXPORT hip-targets DESTINATION ${LIB_INSTALL_DIR})
 INSTALL(EXPORT hip-targets DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} NAMESPACE hip::)

@@ -202,19 +202,10 @@ hipError_t DynCO::populateDynGlobalVars() {
    return hipErrorSharedObjectSymbolNotFound;
  }

-  if (!dev_program->getUndefinedVarFromCodeObj(&undef_var_names)) {
-    DevLogPrintfError("Could not get undefined Variables for Module: 0x%x \n", module());
-    return hipErrorSharedObjectSymbolNotFound;
-  }
-
  for (auto& elem : var_names) {
    vars_.insert(std::make_pair(elem, new Var(elem, Var::DeviceVarKind::DVK_Variable, 0, 0, 0, nullptr)));
  }

-  for (auto& elem : undef_var_names) {
-    vars_.insert(std::make_pair(elem, new Var(elem, Var::DeviceVarKind::DVK_Texture, 0, 0, 0, nullptr)));
-  }
-
  return hipSuccess;
 }

@@ -377,20 +368,4 @@ hipError_t StatCO::getStatGlobalVar(const void* hostVar, int deviceId, hipDevice
  *size_ptr = dvar->size();
  return hipSuccess;
 }
-
-hipError_t StatCO::getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod,
-                                          hipDeviceptr_t* dev_ptr, size_t* size_ptr) {
-  amd::ScopedLock lock(sclock_);
-
-  for (auto& elem : vars_) {
-    if ((elem.second->name() == hostVar)
-        && (elem.second->module(deviceId) == hmod)) {
-      *dev_ptr = elem.second->device_ptr(deviceId);
-      *size_ptr = elem.second->device_size(deviceId);
-      return hipSuccess;
-    }
-  }
-
-  return hipErrorNotFound;
-}
 }; //namespace: hip
@@ -118,8 +118,6 @@ public:
  hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId);
  hipError_t getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr,
                              size_t* size_ptr);
-  hipError_t getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod,
-                                    hipDeviceptr_t* dev_ptr, size_t* size_ptr);

 private:
  friend class ::PlatformState;
@@ -155,7 +155,7 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device )
  ::strncpy(deviceProps.name, info.boardName_, 128);
  deviceProps.totalGlobalMem = info.globalMemSize_;
  deviceProps.sharedMemPerBlock = info.localMemSizePerCU_;
-  deviceProps.regsPerBlock = info.availableSGPRs_;
+  deviceProps.regsPerBlock = info.availableRegistersPerCU_;
  deviceProps.warpSize = info.wavefrontWidth_;
  deviceProps.maxThreadsPerBlock = info.maxWorkGroupSize_;
  deviceProps.maxThreadsDim[0] = info.maxWorkItemSizes_[0];
@@ -12,7 +12,7 @@ FatBinaryDeviceInfo::~FatBinaryDeviceInfo() {
 }

 FatBinaryInfo::FatBinaryInfo(const char* fname, const void* image)
-               : fdesc_(-1), fsize_(0), image_(image), uri_(std::string()) {
+               : fdesc_(amd::Os::FDescInit()), fsize_(0), image_(image), uri_(std::string()) {
  guarantee(fname || image);

  if (fname != nullptr) {
@@ -41,7 +41,7 @@ FatBinaryInfo::~FatBinaryInfo() {
  }

  fname_ = std::string();
-  fdesc_ = -1;
+  fdesc_ = amd::Os::FDescInit();
  fsize_ = 0;
  image_ = nullptr;
  uri_ = std::string();
@@ -64,6 +64,9 @@ hipError_t FatBinaryInfo::ExtractFatBinary(const std::vector<hip::Device*>& devi
    if (!amd::Os::GetFileHandle(fname_.c_str(), &fdesc_, &fsize_)) {
      return hipErrorFileNotFound;
    }
+    if (fsize_ == 0) {
+      return hipErrorInvalidKernelFile;
+    }

    // Extract the code object from file
    hip_error = CodeObject::ExtractCodeObjectFromFile(fdesc_, fsize_,
@@ -5,7 +5,9 @@
 #include "hip_code_object.hpp"
 #include "platform/program.hpp"

-#ifdef ENABLE_HIP_PCH
+#ifdef __HIP_ENABLE_PCH
+extern const char __hip_pch[];
+extern unsigned __hip_pch_size;
 void __hipGetPCH(const char** pch, unsigned int *size) {
  *pch = __hip_pch;
  *size = __hip_pch_size;
@@ -95,11 +95,6 @@ public:
  hipError_t getStatDeviceVar(DeviceVar** dvar, int deviceId);
  void resize_dVar(size_t size) { dVar_.resize(size); }

-  //Accessor for device_ptrs.
-  std::string name() const { return name_; }
-  hipModule_t module(int deviceId) const { return nullptr; }
-  hipDeviceptr_t device_ptr(int deviceId) const { return dVar_[deviceId]->device_ptr(); }
-  size_t device_size(int deviceId) const { return dVar_[deviceId]->size(); }
  FatBinaryInfo** moduleInfo() { return modules_; };

 private:
@@ -252,8 +252,6 @@ extern int ihipGetDevice();
 extern hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags);
 extern amd::Memory* getMemoryObject(const void* ptr, size_t& offset);
 extern amd::Memory* getMemoryObjectWithOffset(const void* ptr, const size_t size);
-extern bool CL_CALLBACK getSvarInfo(cl_program program, std::string var_name, void** var_addr,
-                                    size_t* var_size);

 constexpr bool kOptionChangeable = true;
 constexpr bool kNewDevProg = false;
@@ -124,7 +124,7 @@ hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
  if (*ptr == nullptr) {
    size_t free = 0, total =0;
    hipMemGetInfo(&free, &total);
-    LogPrintfError("Allocation failed : Device memory : required :%u | free :%u | total :%u \n", sizeBytes, free, total);
+    LogPrintfError("Allocation failed : Device memory : required :%zu | free :%zu | total :%zu \n", sizeBytes, free, total);
    return hipErrorOutOfMemory;
  }

@@ -202,14 +202,14 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
      }
    } else {
      amd::HostQueue* pQueue = &queue;
-      if (queueDevice != srcMemory->getContext().devices()[0]) {
+      if ((srcMemory->getContext().devices()[0] == dstMemory->getContext().devices()[0]) &&
+          (queueDevice != srcMemory->getContext().devices()[0])) {
        pQueue = hip::getNullStream(srcMemory->getContext());
        amd::Command* cmd = queue.getLastQueuedCommand(true);
        if (cmd != nullptr) {
          waitList.push_back(cmd);
        }
      }
-
      command = new amd::CopyMemoryCommand(*pQueue, CL_COMMAND_COPY_BUFFER, waitList,
          *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes);
    }
@@ -1850,18 +1850,27 @@ hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr,
                        hipExtent extent,
                        hipStream_t stream,
                        bool isAsync = false) {
-  if (pitchedDevPtr.pitch == extent.width) {
-    return ihipMemset(pitchedDevPtr.ptr, value, sizeof(int8_t), extent.width * extent.height * extent.depth, stream, isAsync);
-  }
-
-  // Workaround for cases when pitch > row untill fill kernel will be updated to support pitch.
-  // Fallback to filling one row at a time.
-
-  amd::HostQueue* queue = hip::getQueue(stream);
-
  size_t offset = 0;
  amd::Memory* memory = getMemoryObject(pitchedDevPtr.ptr, offset);

+  // Bytes covered by the requested extent (width * height * depth; pitch is not part of this product).
+  auto sizeBytes = extent.width * extent.height * extent.depth;
+
+  // An unresolved pointer and an extent exceeding memory->getSize() are rejected the same way.
+  const bool extentFits = (memory != nullptr) && (sizeBytes <= memory->getSize());
+  if (!extentFits) {
+    return hipErrorInvalidValue;
+  }
+
+  if (pitchedDevPtr.pitch == extent.width) {
+    return ihipMemset(pitchedDevPtr.ptr, value, sizeof(int8_t), static_cast<size_t>(sizeBytes), stream, isAsync);
+  }
+
+  // Workaround for cases when pitch > row until fill kernel will be updated to support pitch.
+  // Fall back to filling one row at a time.
+
+  amd::HostQueue* queue = hip::getQueue(stream);
+
  amd::Coord3D origin(offset);
  amd::Coord3D region(pitchedDevPtr.xsize, pitchedDevPtr.ysize, extent.depth);
  amd::BufferRect rect;
@@ -1870,34 +1879,26 @@ hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr,
    return hipErrorInvalidValue;
  }

-  if (memory != nullptr) {
-    std::vector<amd::FillMemoryCommand*> commands;
+  std::vector<amd::FillMemoryCommand*> commands;

-    for (size_t slice = 0; slice < extent.depth; slice++) {
-      for (size_t row = 0; row < extent.height; row++) {
-        const size_t rowOffset = rect.offset(0, row, slice);
-        amd::FillMemoryCommand* command = new amd::FillMemoryCommand(*queue,
-                                                                     CL_COMMAND_FILL_BUFFER,
-                                                                     amd::Command::EventWaitList{},
-                                                                     *memory->asBuffer(),
-                                                                     &value,
-                                                                     sizeof(int8_t),
-                                                                     amd::Coord3D{rowOffset, 0, 0},
-                                                                     amd::Coord3D{extent.width, 1, 1});
+  for (size_t slice = 0; slice < extent.depth; slice++) {
+    for (size_t row = 0; row < extent.height; row++) {
+      const size_t rowOffset = rect.offset(0, row, slice);
+      amd::FillMemoryCommand *command = new amd::FillMemoryCommand(*queue,
+          CL_COMMAND_FILL_BUFFER, amd::Command::EventWaitList { },
+          *memory->asBuffer(), &value, sizeof(int8_t), amd::Coord3D { rowOffset,
+              0, 0 }, amd::Coord3D { extent.width, 1, 1 });

-        command->enqueue();
-        commands.push_back(command);
-      }
+      command->enqueue();
+      commands.push_back(command);
    }
+  }

-    for (auto &command: commands) {
-      if (!isAsync) {
-        command->awaitCompletion();
-      }
-      command->release();
+  for (auto &command : commands) {
+    if (!isAsync) {
+      command->awaitCompletion();
    }
-  } else {
-	return hipErrorInvalidValue;
+    command->release();
  }

  return hipSuccess;
@@ -2038,7 +2039,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void
  memset(attributes, 0, sizeof(hipPointerAttribute_t));

  if (memObj != nullptr) {
-    attributes->memoryType = (CL_MEM_SVM_FINE_GRAIN_BUFFER & memObj->getMemFlags())? hipMemoryTypeHost : hipMemoryTypeDevice;
+    attributes->memoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & memObj->getMemFlags())? hipMemoryTypeHost : hipMemoryTypeDevice;
    if (attributes->memoryType == hipMemoryTypeHost) {
      attributes->hostPointer = static_cast<char*>(memObj->getSvmPtr()) + offset;
    }
@@ -537,7 +537,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
    if (result != hipSuccess) {
      break;
    }
-    prevGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z;
+    prevGridSize += globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ;
  }

  // Sync the execution streams on all devices
@@ -97,6 +97,10 @@ hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2,
                                        uint32_t* linktype, uint32_t* hopcount) {
  HIP_INIT_API(hipExtGetLinkTypeAndHopCount, device1, device2, linktype, hopcount);

+  if (linktype == nullptr || hopcount == nullptr ||
+      device1 == device2  || device1 < 0 || device2 < 0) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
  // Fill out the list of LinkAttributes
  std::vector<amd::Device::LinkAttrType> link_attrs;
  link_attrs.push_back(std::make_pair(amd::Device::LinkAttribute::kLinkLinkType, 0));
@@ -80,27 +80,6 @@ extern "C" hip::FatBinaryInfo** __hipRegisterFatBinary(const void* data)
  return PlatformState::instance().addFatBinary(fbwrapper->binary);
 }

-bool PlatformState::getShadowVarInfo(std::string var_name, hipModule_t hmod,
-                                     void** var_addr, size_t* var_size) {
-
-  amd::ScopedLock lock(lock_);
-  if (hipSuccess == getDynGlobalVar(var_name.c_str(), ihipGetDevice(), hmod, var_addr, var_size)) {
-    return true;
-  }
-
-  if (hipSuccess == getStatGlobalVarByName(var_name, ihipGetDevice(), hmod, var_addr, var_size)) {
-    return true;
-  }
-
-  return false;
-}
-
-bool CL_CALLBACK getSvarInfo(cl_program program, std::string var_name, void** var_addr,
-                             size_t* var_size) {
-  return PlatformState::instance().getShadowVarInfo(var_name, reinterpret_cast<hipModule_t>(program),
-                                                    var_addr, var_size);
-}
-
 extern "C" void __hipRegisterFunction(
  hip::FatBinaryInfo** modules,
  const void*  hostFunction,
@@ -686,11 +665,19 @@ static inline std::uint32_t __convert_float_to_half(float a) noexcept {
  return s | v;
 }

-extern "C" __attribute__((weak)) float  __gnu_h2f_ieee(unsigned short h){
+extern "C"
+#if !defined(_MSC_VER)
+__attribute__((weak))
+#endif
+float  __gnu_h2f_ieee(unsigned short h){
  return __convert_half_to_float((std::uint32_t) h);
 }

-extern "C" __attribute__((weak)) unsigned short  __gnu_f2h_ieee(float f){
+extern "C"
+#if !defined(_MSC_VER)
+__attribute__((weak))
+#endif
+unsigned short  __gnu_f2h_ieee(float f){
  return (unsigned short)__convert_float_to_half(f);
 }

@@ -765,6 +752,9 @@ hipError_t PlatformState::getDynFunc(hipFunction_t* hfunc, hipModule_t hmod,
    DevLogPrintfError("Cannot find the module: 0x%x", hmod);
    return hipErrorNotFound;
  }
+  if (func_name == nullptr || func_name[0] == '\0') {  // a null or empty name cannot be looked up; checking null first avoids dereferencing it
+    return hipErrorNotFound;
+  }

  return it->second->getDynFunc(hfunc, func_name);
 }
@@ -868,11 +858,6 @@ hipError_t PlatformState::getStatGlobalVar(const void* hostVar, int deviceId, hi
  return statCO_.getStatGlobalVar(hostVar, deviceId, dev_ptr, size_ptr);
 }

-hipError_t PlatformState::getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod,
-                                                 hipDeviceptr_t* dev_ptr, size_t* size_ptr) {
-  return statCO_.getStatGlobalVarByName(hostVar, deviceId, hmod, dev_ptr, size_ptr);
-}
-
 void PlatformState::setupArgument(const void *arg, size_t size, size_t offset) {
  auto& arguments = execStack_.top().arguments_;

@@ -77,11 +77,6 @@ public:
  hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId);
  hipError_t getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr,
                              size_t* size_ptr);
-  hipError_t getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod,
-                                    hipDeviceptr_t* dev_ptr, size_t* size_ptr);
-
-  bool getShadowVarInfo(std::string var_name, hipModule_t hmod,
-                            void** var_addr, size_t* var_size);

  //Exec Functions
  void setupArgument(const void *arg, size_t size, size_t offset);
@@ -0,0 +1,20 @@
+# The minimum version is declared before project() so policies are set up first
+cmake_minimum_required(VERSION 3.10)
+
+project(bit_extract)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops configuration here instead of failing later on hip::host
+find_package(hip REQUIRED)
+
+# Use HIP_HIPCC_EXECUTABLE (expected to be provided by the hip package) as compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Create the executable
+add_executable(bit_extract bit_extract.cpp)
+
+# Link with HIP
+target_link_libraries(bit_extract PRIVATE hip::host)
@@ -9,19 +9,15 @@ HIPCC=$(HIP_PATH)/bin/hipcc

 # Show how to use PLATFORM to specify different options for each compiler:
 ifeq (${HIP_PLATFORM}, nvcc)
-	HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 
+	HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20
 endif

 EXE=bit_extract
-EXE_STATIC=bit_extract_static

 $(EXE): bit_extract.cpp
 	$(HIPCC) $(HIPCC_FLAGS) $< -o $@

-$(EXE_STATIC): bit_extract.cpp
-	$(HIPCC) -use-staticlib  $(HIPCC_FLAGS) $< -o $@
-
-all: $(EXE) $(EXE_STATIC)
+all: $(EXE)

 clean:
-	rm -f *.o $(EXE) $(EXE_STATIC)
+	rm -f *.o $(EXE)
@@ -0,0 +1,42 @@
+# Builds three executables and the vcpy_kernel.code code object produced by
+# running ${HIP_HIPCC_EXECUTABLE} --genco on vcpy_kernel.cpp.
+cmake_minimum_required(VERSION 3.10)
+
+project(module_api)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops configuration here if the package is missing,
+# since HIP_HIPCC_EXECUTABLE and hip::host are used below.
+find_package(hip REQUIRED)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Create the executables
+add_executable(runKernel.hip.out runKernel.cpp)
+add_executable(launchKernelHcc.hip.out launchKernelHcc.cpp)
+add_executable(defaultDriver.hip.out defaultDriver.cpp)
+
+# Generate code object. The kernel source is addressed through
+# CMAKE_CURRENT_SOURCE_DIR instead of a path relative to the build directory,
+# so the build directory does not have to be a direct child of the source dir.
+add_custom_target(
+  codeobj
+  ALL
+  COMMAND ${HIP_HIPCC_EXECUTABLE} --genco "${CMAKE_CURRENT_SOURCE_DIR}/vcpy_kernel.cpp" -o vcpy_kernel.code
+  COMMENT "codeobj generated"
+  VERBATIM
+)
+
+add_dependencies(runKernel.hip.out codeobj)
+add_dependencies(launchKernelHcc.hip.out codeobj)
+add_dependencies(defaultDriver.hip.out codeobj)
+
+# Link with HIP
+target_link_libraries(runKernel.hip.out PRIVATE hip::host)
+target_link_libraries(launchKernelHcc.hip.out PRIVATE hip::host)
+target_link_libraries(defaultDriver.hip.out PRIVATE hip::host)
@@ -0,0 +1,36 @@
+# Builds runKernel.hip.out and the vcpy_kernel.code code object produced by
+# running ${HIP_HIPCC_EXECUTABLE} --genco on vcpy_kernel.cpp.
+cmake_minimum_required(VERSION 3.10)
+
+project(module_api_global)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops configuration here if the package is missing,
+# since HIP_HIPCC_EXECUTABLE and hip::host are used below.
+find_package(hip REQUIRED)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Create the executable
+add_executable(runKernel.hip.out runKernel.cpp)
+
+# Generate code object. The kernel source is addressed through
+# CMAKE_CURRENT_SOURCE_DIR instead of a path relative to the build directory,
+# so the build directory does not have to be a direct child of the source dir.
+add_custom_target(
+  codeobj
+  ALL
+  COMMAND ${HIP_HIPCC_EXECUTABLE} --genco "${CMAKE_CURRENT_SOURCE_DIR}/vcpy_kernel.cpp" -o vcpy_kernel.code
+  COMMENT "codeobj generated"
+  VERBATIM
+)
+
+add_dependencies(runKernel.hip.out codeobj)
+
+# Link with HIP
+target_link_libraries(runKernel.hip.out PRIVATE hip::host)
@@ -0,0 +1,22 @@
+# Follow "README.md" to generate square.cpp if it's missing
+
+cmake_minimum_required(VERSION 3.10)
+
+project(square)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops configuration here if the package is missing,
+# since HIP_HIPCC_EXECUTABLE and hip::host are used below.
+find_package(hip REQUIRED)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Create the executable
+add_executable(square square.cpp)
+
+# Link with HIP
+target_link_libraries(square PRIVATE hip::host)
@@ -11,7 +11,7 @@ else
 	SOURCES=square.cpp
 endif

-all: square.out square.out.static
+all: square.out

 # Step
 square.cpp: square.cu
@@ -20,8 +20,5 @@ square.cpp: square.cu
 square.out: $(SOURCES)
 	$(HIPCC) $(CXXFLAGS) $(SOURCES) -o $@

-square.out.static: $(SOURCES)
-	$(HIPCC) -use-staticlib $(CXXFLAGS) $(SOURCES) -o $@
-
 clean:
-	rm -f *.o *.out *.out.static square.cpp
+	rm -f *.o *.out square.cpp
@@ -1,13 +1,39 @@
 # Square.md

-Simple test which shows how to use hipify-perl to port CUDA code to HIP.  
-See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example. 
+Simple test which shows how to use hipify-perl to port CUDA code to HIP.
+See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example.
 Now it is even simpler and requires no manual modification to the hipified source code - just hipify and compile:

-1. Add hip/bin path to the PATH  :
-    <code>export PATH=$PATH:[MYHIP]/bin</code>
+- Add hip/bin path to the PATH

-2. <code>$ make </code>
-   Make runs these steps.  This can be performed on either CUDA or AMD platform:
-   <code>hipify-perl square.cu > square.cpp </code>    # convert cuda code to hip code
-   <code>hipcc square.cpp</code>                       # compile into executable
+```
+$ export PATH=$PATH:[MYHIP]/bin
+```
+
+- Define environment variable
+
+```
+$ export HIP_PATH=[MYHIP]
+```
+
+- Build executable file
+
+```
+$ cd ~/hip/samples/0_Intro/square
+$ make
+/home/user/hip/bin/hipify-perl square.cu > square.cpp
+/home/user/hip/bin/hipcc  square.cpp -o square.out
+/home/user/hip/bin/hipcc -use-staticlib  square.cpp -o square.out.static
+```
+- Execute file
+```
+$ ./square.out
+info: running on device Navi 14 [Radeon Pro W5500]
+info: allocate host mem (  7.63 MB)
+info: allocate device mem (  7.63 MB)
+info: copy Host2Device
+info: launch 'vector_square' kernel
+info: copy Device2Host
+info: check result
+PASSED!
+```
@@ -0,0 +1,26 @@
+# Builds hipBusBandwidth from hipBusBandwidth.cpp and ResultDatabase.cpp.
+cmake_minimum_required(VERSION 3.10)
+
+project(hipBusBandwidth)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops configuration here if the package is missing,
+# since HIP_HIPCC_EXECUTABLE and hip::host are used below.
+find_package(hip REQUIRED)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Default to a Release build, but keep any build type the user selected.
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# Create the executable
+add_executable(hipBusBandwidth hipBusBandwidth.cpp ResultDatabase.cpp)
+
+# Link with HIP
+target_link_libraries(hipBusBandwidth PRIVATE hip::host)
@@ -12,7 +12,7 @@ enum MallocMode { MallocPinned, MallocUnpinned, MallocRegistered };
 bool p_verbose = false;
 MallocMode p_malloc_mode = MallocPinned;
 int p_numa_ctl = -1;
-int p_iterations = 10;
+int p_iterations = 0;
 int p_beatsperiteration = 1;
 int p_device = 0;
 int p_detailed = 0;
@@ -89,7 +89,9 @@ hipError_t memcopy(void* dst, const void* src, size_t sizeBytes, enum hipMemcpyK
 int sizes[] = {-64, -256, -512, 1,    2,    4,     8,     16,    32,     64,     128,   256,
               512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288};
 int nSizes = sizeof(sizes) / sizeof(int);
-
+// iterations to be run for the corresponding sizes, less number as the size increases
+int iterations[] = {1000, 1000, 1000, 1000, 500, 500, 500, 500, 500, 200, 200, 200,
+               200, 200, 100, 100, 100, 100, 50, 50, 50, 20, 20};

 // ****************************************************************************
 // Function: RunBenchmark_H2D
@@ -174,53 +176,48 @@ void RunBenchmark_H2D(ResultDatabase& resultDB) {
    hipEventCreate(&stop);
    CHECK_HIP_ERROR();

-    // Three passes, forward and backward both
-    for (int pass = 0; pass < p_iterations; pass++) {
-        // store the times temporarily to estimate latency
-        // float times[nSizes];
-        // Step through sizes forward on even passes and backward on odd
-        for (int i = 0; i < nSizes; i++) {
-            int sizeIndex;
-            if ((pass % 2) == 0)
-                sizeIndex = i;
-            else
-                sizeIndex = (nSizes - 1) - i;
+    // store the times temporarily to estimate latency
+    // float times[nSizes];
+    for (int i = 0; i < nSizes; i++) {
+      int sizeIndex, iterIndex;
+      sizeIndex = i;
+      iterIndex = i;

-            const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-            const int nbytes = sizeToBytes(thisSize);
+      const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+      const int nbytes = sizeToBytes(thisSize);
+      const int niter = p_iterations ? p_iterations : iterations[iterIndex];
+      for (int pass = 0; pass < niter; pass++) {

-            hipEventRecord(start, 0);
-            for (int j = 0; j < p_beatsperiteration; j++) {
-                memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice);
-            }
-            hipEventRecord(stop, 0);
-            hipEventSynchronize(stop);
-            float t = 0;
-            hipEventElapsedTime(&t, start, stop);
-            // times[sizeIndex] = t;
-
-            // Convert to GB/sec
-            if (p_verbose) {
-                std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
-            }
-
-            double speed =
-                (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) /  1000) / t;
-            char sizeStr[256];
-            if (p_beatsperiteration > 1) {
-                sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
-            } else {
-                sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-            }
-            resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
-                               sizeStr, "GB/sec", speed);
-            resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr,
-                               "ms", t);
-
-            if (p_onesize) {
-                break;
-            }
+        hipEventRecord(start, 0);
+        for (int j = 0; j < p_beatsperiteration; j++) {
+          memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice);
        }
+        hipEventRecord(stop, 0);
+        hipEventSynchronize(stop);
+        float t = 0;
+        hipEventElapsedTime(&t, start, stop);
+        // times[sizeIndex] = t;
+        // Convert to GB/sec
+        if (p_verbose) {
+          std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+        }
+
+        double speed =
+            (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) /  1000) / t;
+        char sizeStr[256];
+        if (p_beatsperiteration > 1) {
+          sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
+        } else {
+          sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+        }
+        resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
+                           sizeStr, "GB/sec", speed);
+        resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr,                                            "ms", t);
+
+      }
+      if (p_onesize) {
+        break;
+      }
    }

    if (p_onesize) {
@@ -347,53 +344,50 @@ void RunBenchmark_D2H(ResultDatabase& resultDB) {
    hipEventCreate(&stop);
    CHECK_HIP_ERROR();

-    // Three passes, forward and backward both
-    for (int pass = 0; pass < p_iterations; pass++) {
-        // store the times temporarily to estimate latency
-        // float times[nSizes];
-        // Step through sizes forward on even passes and backward on odd
-        for (int i = 0; i < nSizes; i++) {
-            int sizeIndex;
-            if ((pass % 2) == 0)
-                sizeIndex = i;
-            else
-                sizeIndex = (nSizes - 1) - i;
+    // store the times temporarily to estimate latency
+    // float times[nSizes];
+    for (int i = 0; i < nSizes; i++) {
+      int sizeIndex, iterIndex;
+      sizeIndex = i;
+      iterIndex = i;

-            const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-            const int nbytes = sizeToBytes(thisSize);
+      const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+      const int nbytes = sizeToBytes(thisSize);
+      const int niter = p_iterations ? p_iterations : iterations[iterIndex];
+      for (int pass = 0; pass < niter; pass++) {

-            hipEventRecord(start, 0);
-            for (int j = 0; j < p_beatsperiteration; j++) {
-                memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost);
-            }
-            hipEventRecord(stop, 0);
-            hipEventSynchronize(stop);
-            float t = 0;
-            hipEventElapsedTime(&t, start, stop);
-            // times[sizeIndex] = t;
-
-            // Convert to GB/sec
-            if (p_verbose) {
-                std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
-            }
-
-            double speed =
-                (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
-            char sizeStr[256];
-            sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-            if (p_beatsperiteration > 1) {
-                sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
-            } else {
-                sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-            }
-            resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
-                               sizeStr, "GB/sec", speed);
-            resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode),
-                               sizeStr, "ms", t);
-            if (p_onesize) {
-                break;
-            }
+        hipEventRecord(start, 0);
+        for (int j = 0; j < p_beatsperiteration; j++) {
+          memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost);
        }
+        hipEventRecord(stop, 0);
+        hipEventSynchronize(stop);
+        float t = 0;
+        hipEventElapsedTime(&t, start, stop);
+        // times[sizeIndex] = t;
+        // Convert to GB/sec
+        if (p_verbose) {
+          std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+        }
+
+        double speed =
+            (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
+        char sizeStr[256];
+        sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+        if (p_beatsperiteration > 1) {
+          sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
+        } else {
+          sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+        }
+        resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
+                           sizeStr, "GB/sec", speed);
+        resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode),
+                           sizeStr, "ms", t);
+
+      }
+      if (p_onesize) {
+        break;
+      }
    }

    if (p_onesize) {
@@ -522,43 +516,43 @@ void RunBenchmark_Bidir(ResultDatabase& resultDB) {
    hipStreamCreate(&stream[0]);
    hipStreamCreate(&stream[1]);

-    // Three passes, forward and backward both
-    for (int pass = 0; pass < p_iterations; pass++) {
-        // store the times temporarily to estimate latency
-        // float times[nSizes];
-        // Step through sizes forward on even passes and backward on odd
-        for (int i = 0; i < nSizes; i++) {
-            int sizeIndex;
-            if ((pass % 2) == 0)
-                sizeIndex = i;
-            else
-                sizeIndex = (nSizes - 1) - i;
+    // store the times temporarily to estimate latency
+    // float times[nSizes];
+    for (int i = 0; i < nSizes; i++) {
+      int sizeIndex, iterIndex;
+      sizeIndex = i;
+      iterIndex = i;

-            const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-            const int nbytes = sizeToBytes(thisSize);
+      const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+      const int nbytes = sizeToBytes(thisSize);
+      const int niter = p_iterations ? p_iterations : iterations[iterIndex];
+      for (int pass = 0; pass < niter; pass++) {

-            hipEventRecord(start, 0);
-            hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]);
-            hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]);
-            hipEventRecord(stop, 0);
-            hipEventSynchronize(stop);
-            float t = 0;
-            hipEventElapsedTime(&t, start, stop);
+        hipEventRecord(start, 0);
+        hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]);
+        hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]);
+        hipEventRecord(stop, 0);
+        hipEventSynchronize(stop);
+        float t = 0;
+        hipEventElapsedTime(&t, start, stop);

-            // Convert to GB/sec
-            if (p_verbose) {
-                std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
-            }
-
-            double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t;
-            char sizeStr[256];
-            sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-            resultDB.AddResult(
-                std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr,
-                "GB/sec", speed);
-            resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode),
-                               sizeStr, "ms", t);
+        // Convert to GB/sec
+        if (p_verbose) {
+          std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
        }
+
+        double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t;
+        char sizeStr[256];
+        sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+        resultDB.AddResult(
+            std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr,
+            "GB/sec", speed);
+        resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode),
+                           sizeStr, "ms", t);
+      }
+      if (p_onesize) {
+        break;
+      }
    }

    // Cleanup
@@ -708,66 +702,63 @@ void RunBenchmark_P2P_Unidir(ResultDatabase& resultDB) {
            hipEventCreate(&stop);
            CHECK_HIP_ERROR();

-            // Three passes, forward and backward both
-            for (int pass = 0; pass < p_iterations; pass++) {
-                // store the times temporarily to estimate latency
-                // float times[nSizes];
-                // Step through sizes forward on even passes and backward on odd
-                for (int i = 0; i < nSizes; i++) {
-                    int sizeIndex;
-                    if ((pass % 2) == 0)
-                        sizeIndex = i;
-                    else
-                        sizeIndex = (nSizes - 1) - i;
+            // store the times temporarily to estimate latency
+            // float times[nSizes];
+            for (int i = 0; i < nSizes; i++) {
+              int sizeIndex, iterIndex;
+              sizeIndex = i;
+              iterIndex = i;

-                    const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-                    const int nbytes = sizeToBytes(thisSize);
+              const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+              const int nbytes = sizeToBytes(thisSize);
+              const int niter = p_iterations ? p_iterations : iterations[iterIndex];
+              for (int pass = 0; pass < niter; pass++) {

-                    hipDeviceSynchronize();
+                hipDeviceSynchronize();

-                    hipEventRecord(start, 0);
+                hipEventRecord(start, 0);

-                    for (int j = 0; j < p_beatsperiteration; j++) {
-                        hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
-                    }
+                for (int j = 0; j < p_beatsperiteration; j++) {
+                  hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
+                }

-                    hipEventRecord(stop, 0);
+                hipEventRecord(stop, 0);

-                    hipEventSynchronize(stop);
+                hipEventSynchronize(stop);

-                    float t = 0;
-                    hipEventElapsedTime(&t, start, stop);
-                    // times[sizeIndex] = t;
+                float t = 0;
+                hipEventElapsedTime(&t, start, stop);
+                // times[sizeIndex] = t;

-                    // Convert to GB/sec
-                    if (p_verbose) {
-                        std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
-                    }
+                // Convert to GB/sec
+                if (p_verbose) {
+                  std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+                }

-                    double speed =
-                        (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
-                    char sizeStr[256];
-                    if (p_beatsperiteration > 1) {
-                        sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
-                                p_beatsperiteration);
-                    } else {
-                        sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-                    }
+                double speed =
+                    (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
+                char sizeStr[256];
+                if (p_beatsperiteration > 1) {
+                  sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
+                          p_beatsperiteration);
+                } else {
+                  sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+                }

-                    string cGpu, pGpu;
-                    cGpu = gpuIDToString(currentGpu);
-                    pGpu = gpuIDToString(peerGpu);
+                string cGpu, pGpu;
+                cGpu = gpuIDToString(currentGpu);
+                pGpu = gpuIDToString(peerGpu);

-                    resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) +
-                                           "_gpu" + std::string(pGpu),
+                resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) +
+                                       "_gpu" + std::string(pGpu),
                                       sizeStr, "GB/sec", speed);
-                    resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) +
-                                           "_gpu" + std::string(pGpu),
+                resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) +
+                                       "_gpu" + std::string(pGpu),
                                       sizeStr, "ms", t);

-                    if (p_onesize) {
-                        break;
-                    }
+              }
+                if (p_onesize) {
+                  break;
                }
            }

@@ -829,71 +820,68 @@ void RunBenchmark_P2P_Bidir(ResultDatabase& resultDB) {
            hipStreamCreate(&stream[0]);
            hipStreamCreate(&stream[1]);

-            // Three passes, forward and backward both
-            for (int pass = 0; pass < p_iterations; pass++) {
-                // store the times temporarily to estimate latency
-                // float times[nSizes];
-                // Step through sizes forward on even passes and backward on odd
-                for (int i = 0; i < nSizes; i++) {
-                    int sizeIndex;
-                    if ((pass % 2) == 0)
-                        sizeIndex = i;
-                    else
-                        sizeIndex = (nSizes - 1) - i;
+            // store the times temporarily to estimate latency
+            // float times[nSizes];
+            for (int i = 0; i < nSizes; i++) {
+              int sizeIndex, iterIndex;
+              sizeIndex = i;
+              iterIndex = i;

-                    const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-                    const int nbytes = sizeToBytes(thisSize);
+              const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+              const int nbytes = sizeToBytes(thisSize);
+              const int niter = p_iterations ? p_iterations : iterations[iterIndex];
+              for (int pass = 0; pass < niter; pass++) {

-                    hipDeviceSynchronize();
+                hipDeviceSynchronize();

-                    hipEventRecord(start, 0);
+                hipEventRecord(start, 0);

-                    for (int j = 0; j < p_beatsperiteration; j++) {
-                        hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes,
-                                       hipMemcpyDeviceToDevice, stream[0]);
-                        hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes,
-                                       hipMemcpyDeviceToDevice, stream[1]);
-                    }
-
-                    hipEventRecord(stop, 0);
-
-                    hipEventSynchronize(stop);
-
-                    float t = 0;
-                    hipEventElapsedTime(&t, start, stop);
-                    // times[sizeIndex] = t;
-
-                    // Convert to GB/sec
-                    if (p_verbose) {
-                        std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
-                    }
-
-                    double speed =
-                        (double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) /
-                        t;
-                    char sizeStr[256];
-                    if (p_beatsperiteration > 1) {
-                        sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
-                                p_beatsperiteration);
-                    } else {
-                        sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-                    }
-
-                    string cGpu, pGpu;
-                    cGpu = gpuIDToString(currentGpu);
-                    pGpu = gpuIDToString(peerGpu);
-
-                    resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
-                                           std::string(pGpu),
-                                       sizeStr, "GB/sec", speed);
-                    resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
-                                           std::string(pGpu),
-                                       sizeStr, "ms", t);
-
-                    if (p_onesize) {
-                        break;
-                    }
+                for (int j = 0; j < p_beatsperiteration; j++) {
+                  hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes,
+                                 hipMemcpyDeviceToDevice, stream[0]);
+                  hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes,
+                                 hipMemcpyDeviceToDevice, stream[1]);
                }
+
+                hipEventRecord(stop, 0);
+
+                hipEventSynchronize(stop);
+
+                float t = 0;
+                hipEventElapsedTime(&t, start, stop);
+                // times[sizeIndex] = t;
+
+                // Convert to GB/sec
+                if (p_verbose) {
+                  std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+                }
+
+                double speed =
+                    (double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) /
+                    t;
+                char sizeStr[256];
+                if (p_beatsperiteration > 1) {
+                  sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
+                          p_beatsperiteration);
+                } else {
+                  sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+                }
+
+                string cGpu, pGpu;
+                cGpu = gpuIDToString(currentGpu);
+                pGpu = gpuIDToString(peerGpu);
+
+                resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
+                                       std::string(pGpu),
+                                   sizeStr, "GB/sec", speed);
+                resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
+                                       std::string(pGpu),
+                                   sizeStr, "ms", t);
+
+              }
+              if (p_onesize) {
+                break;
+              }
            }

            if (p_onesize) {
@@ -0,0 +1,41 @@
+# Builds hipCommander and the nullkernel.hsaco code object produced by
+# running ${HIP_HIPCC_EXECUTABLE} --genco on nullkernel.hip.cpp.
+cmake_minimum_required(VERSION 3.10)
+
+project(hipCommander)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops configuration here if the package is missing,
+# since HIP_HIPCC_EXECUTABLE and hip::host are used below.
+find_package(hip REQUIRED)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Default to a Release build, but keep any build type the user selected.
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# Create the executable
+add_executable(hipCommander hipCommander.cpp)
+
+# Generate code object. The kernel source is addressed through
+# CMAKE_CURRENT_SOURCE_DIR instead of a path relative to the build directory,
+# so the build directory does not have to be a direct child of the source dir.
+add_custom_target(
+  codeobj
+  ALL
+  COMMAND ${HIP_HIPCC_EXECUTABLE} --genco "${CMAKE_CURRENT_SOURCE_DIR}/nullkernel.hip.cpp" -o nullkernel.hsaco
+  COMMENT "codeobj generated"
+  VERBATIM
+)
+
+add_dependencies(hipCommander codeobj)
+
+# Link with HIP
+target_link_libraries(hipCommander PRIVATE hip::host)
+set_property(TARGET hipCommander PROPERTY CXX_STANDARD 11)
@@ -0,0 +1,45 @@
+# Builds hipDispatchLatency, hipDispatchEnqueueRateMT and the test_kernel.code
+# code object produced by running ${HIP_HIPCC_EXECUTABLE} --genco.
+cmake_minimum_required(VERSION 3.10)
+
+project(hipDispatchLatency)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops configuration here if the package is missing,
+# since HIP_HIPCC_EXECUTABLE and hip::host are used below.
+find_package(hip REQUIRED)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Default to a Release build, but keep any build type the user selected.
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# Create the executables
+add_executable(hipDispatchLatency hipDispatchLatency.cpp)
+add_executable(hipDispatchEnqueueRateMT hipDispatchEnqueueRateMT.cpp)
+
+# Generate code object. The kernel source is addressed through
+# CMAKE_CURRENT_SOURCE_DIR instead of a path relative to the build directory,
+# so the build directory does not have to be a direct child of the source dir.
+add_custom_target(
+  codeobj
+  ALL
+  COMMAND ${HIP_HIPCC_EXECUTABLE} --genco "${CMAKE_CURRENT_SOURCE_DIR}/test_kernel.cpp" -o test_kernel.code
+  COMMENT "codeobj generated"
+  VERBATIM
+)
+
+add_dependencies(hipDispatchLatency codeobj)
+add_dependencies(hipDispatchEnqueueRateMT codeobj)
+
+# Link with HIP
+target_link_libraries(hipDispatchLatency PRIVATE hip::host)
+target_link_libraries(hipDispatchEnqueueRateMT PRIVATE hip::host)
+set_property(TARGET hipDispatchLatency PROPERTY CXX_STANDARD 11)
+set_property(TARGET hipDispatchEnqueueRateMT PROPERTY CXX_STANDARD 11)
@@ -0,0 +1,26 @@
+# Builds hipInfo from hipInfo.cpp and links it with hip::host.
+cmake_minimum_required(VERSION 3.10)
+
+project(hipInfo)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops configuration here if the package is missing,
+# since HIP_HIPCC_EXECUTABLE and hip::host are used below.
+find_package(hip REQUIRED)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Default to a Release build, but keep any build type the user selected.
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# Create the executable
+add_executable(hipInfo hipInfo.cpp)
+
+# Link with HIP
+target_link_libraries(hipInfo PRIVATE hip::host)
@@ -0,0 +1,26 @@
+# Builds MatrixTranspose from MatrixTranspose.cpp and links it with hip::host.
+cmake_minimum_required(VERSION 3.10)
+
+project(MatrixTranspose)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops configuration here if the package is missing,
+# since HIP_HIPCC_EXECUTABLE and hip::host are used below.
+find_package(hip REQUIRED)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Default to a Release build, but keep any build type the user selected.
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# Create the executable
+add_executable(MatrixTranspose MatrixTranspose.cpp)
+
+# Link with HIP
+target_link_libraries(MatrixTranspose PRIVATE hip::host)
@@ -0,0 +1,26 @@
+# Builds inline_asm from inline_asm.cpp and links it with hip::host.
+cmake_minimum_required(VERSION 3.10)
+
+project(inline_asm)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops configuration here if the package is missing,
+# since HIP_HIPCC_EXECUTABLE and hip::host are used below.
+find_package(hip REQUIRED)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Default to a Release build, but keep any build type the user selected.
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# Create the executable
+add_executable(inline_asm inline_asm.cpp)
+
+# Link with HIP
+target_link_libraries(inline_asm PRIVATE hip::host)
@@ -0,0 +1,40 @@
+# Builds texture2dDrv and the tex2dKernel.code code object produced by
+# running ${HIP_HIPCC_EXECUTABLE} --genco on tex2dKernel.cpp.
+cmake_minimum_required(VERSION 3.10)
+
+project(texture2dDrv)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops configuration here if the package is missing,
+# since HIP_HIPCC_EXECUTABLE and hip::host are used below.
+find_package(hip REQUIRED)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Default to a Release build, but keep any build type the user selected.
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# Create the executable
+add_executable(texture2dDrv texture2dDrv.cpp)
+
+# Generate code object. The kernel source is addressed through
+# CMAKE_CURRENT_SOURCE_DIR instead of a path relative to the build directory,
+# so the build directory does not have to be a direct child of the source dir.
+add_custom_target(
+  codeobj
+  ALL
+  COMMAND ${HIP_HIPCC_EXECUTABLE} --genco "${CMAKE_CURRENT_SOURCE_DIR}/tex2dKernel.cpp" -o tex2dKernel.code
+  COMMENT "codeobj generated"
+  VERBATIM
+)
+
+add_dependencies(texture2dDrv codeobj)
+
+# Link with HIP
+target_link_libraries(texture2dDrv PRIVATE hip::host)
@@ -0,0 +1,26 @@
+# Builds occupancy from occupancy.cpp and links it with hip::host.
+cmake_minimum_required(VERSION 3.10)
+
+project(occupancy)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops configuration here if the package is missing,
+# since HIP_HIPCC_EXECUTABLE and hip::host are used below.
+find_package(hip REQUIRED)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Default to a Release build, but keep any build type the user selected.
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# Create the executable
+add_executable(occupancy occupancy.cpp)
+
+# Link with HIP
+target_link_libraries(occupancy PRIVATE hip::host)
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(hipEvent)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops the configure step with a clear error if it is missing
+find_package(hip REQUIRED)
+
+# Set compiler and linker; default to Release only when no build type was requested
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# Create the executable and link it with HIP
+add_executable(hipEvent hipEvent.cpp)
+target_link_libraries(hipEvent PRIVATE hip::host)
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(sharedMemory)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops the configure step with a clear error if it is missing
+find_package(hip REQUIRED)
+
+# Set compiler and linker; default to Release only when no build type was requested
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# Create the executable and link it with HIP
+add_executable(sharedMemory sharedMemory.cpp)
+target_link_libraries(sharedMemory PRIVATE hip::host)
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(shfl)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops the configure step with a clear error if it is missing
+find_package(hip REQUIRED)
+
+# Set compiler and linker; default to Release only when no build type was requested
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# Create the executable and link it with HIP
+add_executable(shfl shfl.cpp)
+target_link_libraries(shfl PRIVATE hip::host)
@@ -0,0 +1,19 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(2dshfl)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops the configure step with a clear error if it is missing
+find_package(hip REQUIRED)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Create the executable
+add_executable(2dshfl 2dshfl.cpp)
+
+# Link with HIP
+target_link_libraries(2dshfl PRIVATE hip::host)
@@ -0,0 +1,19 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(dynamic_shared)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops the configure step with a clear error if it is missing
+find_package(hip REQUIRED)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Create the executable
+add_executable(dynamic_shared dynamic_shared.cpp)
+
+# Link with HIP
+target_link_libraries(dynamic_shared PRIVATE hip::host)
@@ -0,0 +1,19 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(stream)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops the configure step with a clear error if it is missing
+find_package(hip REQUIRED)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Create the executable
+add_executable(stream stream.cpp)
+
+# Link with HIP
+target_link_libraries(stream PRIVATE hip::host)
@@ -0,0 +1,19 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(peer2peer)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops the configure step with a clear error if it is missing
+find_package(hip REQUIRED)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Create the executable
+add_executable(peer2peer peer2peer.cpp)
+
+# Link with HIP
+target_link_libraries(peer2peer PRIVATE hip::host)
@@ -0,0 +1,19 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(unroll)
+
+# Search for rocm in common locations
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
+
+# Find hip; REQUIRED stops the configure step with a clear error if it is missing
+find_package(hip REQUIRED)
+
+# Set compiler and linker
+set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
+set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
+
+# Create the executable
+add_executable(unroll unroll.cpp)
+
+# Link with HIP
+target_link_libraries(unroll PRIVATE hip::host)
@@ -0,0 +1,27 @@
+Build procedure
+
+We provide a Makefile and a CMakeLists.txt for each sample so that the samples can be built separately.
+
+1.Makefile supports shared lib of hip-rocclr runtime and nvcc.
+
+To build a sample, run the following command in the sample folder:
+
+make
+
+
+
+2.CMakeLists.txt can support shared and static libs of hip-rocclr runtime.
+
+To build a sample, run the following commands in the sample folder:
+
+mkdir build (if build folder is missing)
+
+cd build
+
+cmake ..
+
+make
+
+To build a debug version, replace the cmake step above with:
+
+cmake -DCMAKE_BUILD_TYPE=Debug ..
@@ -303,6 +303,7 @@ macro(MAKE_TEST _config exe)
        add_test(NAME ${testname} CONFIGURATIONS ${_config} COMMAND ${PROJECT_BINARY_DIR}/${exe} ${ARGN})
    endif()
    set_tests_properties(${testname} PROPERTIES PASS_REGULAR_EXPRESSION "PASSED" ENVIRONMENT HIP_PATH=${HIP_ROOT_DIR})
+    set_tests_properties(${testname} PROPERTIES SKIP_RETURN_CODE 127 ENVIRONMENT HIP_PATH=${HIP_ROOT_DIR})
 endmacro()

 macro(MAKE_NAMED_TEST _config exe testname)
@@ -0,0 +1,747 @@
+/*
+ Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ */
+
+/* HIT_START
+ * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+#include "test_common.h"
+#include <hip/hip_vector_types.h>
+#include <hip/math_functions.h>
+
+// Describes one square region of the complex plane to render: (x, y) is the
+// centre of the region and width is the length of its side.
+typedef struct {
+  double x;
+  double y;
+  double width;
+} coordRec;
+
+// Regions exercised by the benchmark; a test case selects one by index.
+coordRec coords[] = {
+    {0.0, 0.0, 4.0},                                     // Whole set
+    {0.0, 0.0, 0.00001},                                 // All black
+    {-0.0180789661868, 0.6424294066162, 0.00003824140},  // Hit detail
+};
+
+// Number of entries in coords.
+static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
+
+// Single-precision Mandelbrot kernel, one pixel per thread.
+//
+// The flat thread index is split into column i = tid % width and row
+// j = tid / width; the starting point is (xPos + xStep*i, yPos + yStep*j),
+// narrowed to float. The iteration runs until x*x + y*y exceeds 4 or maxIter
+// steps were taken, and the step count is written to out[tid].
+// There is no bounds check on tid, so the launch must not create more threads
+// than out has elements.
+template <typename T>
+__global__ void float_mad_kernel(uint *out, uint width, T xPos,  T yPos, T xStep, T yStep,
+                                  uint maxIter) {
+
+#pragma FP_CONTRACT ON
+  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+  int i = tid % width;
+  int j = tid / width;
+  float x0 = (float)(xPos + xStep*i);
+  float y0 = (float)(yPos + yStep*j);
+
+  float x = x0;
+  float y = y0;
+
+  uint iter = 0;
+  float tmp;
+  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
+    tmp = x;
+    // x = x*x - y*y + x0, y = 2*x_old*y + y0, expressed with fused multiply-adds
+    x = fma(-y,y,fma(x,x,x0));
+    y = fma(2.0f*tmp,y,y0);
+  }
+
+  out[tid] = iter;
+};
+
+// Single-precision Mandelbrot kernel with the inner loop unrolled 16 times.
+//
+// Each thread handles the pixel at column tid % width, row tid / width. The
+// main loop advances 16 iterations at a time without testing for escape; the
+// escape test runs once per batch. While the point is still inside, the state
+// is saved and ccount grows by 16. Once a batch escapes, the loop stops and the
+// "remainder" section replays from the last saved state one iteration at a
+// time (at most 16) to count the exact number of iterations before escape.
+// The working values x and y are float regardless of T. The result written to
+// out[tid] is ccount. There is no bounds check on tid.
+template <typename T>
+__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
+    T yPos, T xStep, T yStep, uint maxIter) {
+
+#pragma FP_CONTRACT ON
+  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+  int i = tid % width;
+  int j = tid / width;
+  float x0 = (float)(xPos + xStep*(float)i);
+  float y0 = (float)(yPos + yStep*(float)j);
+
+  float x = x0;
+  float y = y0;
+
+// FAST selects the loop variant that breaks out as soon as a batch escapes.
+#define FAST
+  uint iter = 0;
+  float tmp;
+  int stay;
+  int ccount = 0;
+  stay = (x*x+y*y) <= 4.0;
+  float savx = x;
+  float savy = y;
+#ifdef FAST
+  for (iter = 0; (iter < maxIter); iter+=16) {
+#else
+  for (iter = 0; stay && (iter < maxIter); iter+=16) {
+#endif
+    // Restart the batch from the last state known to be inside the set.
+    x = savx;
+    y = savy;
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =  fma(2.0f*x,y,y0);
+    x =  fma(-y,y, fma(tmp,tmp,x0));
+    y =  fma(2.0f*tmp,y,y0);
+
+    // Commit the batch only if the point is still inside after all 16 steps.
+    stay = (x*x+y*y) <= 4.0;
+    savx = (stay ? x : savx);
+    savy = (stay ? y : savy);
+    ccount += stay*16;
+#ifdef FAST
+    if (!stay)
+      break;
+#endif
+  }
+  // Handle remainder
+  if (!stay) {
+    // iter is reused here as a countdown of at most 16 single steps.
+    iter = 16;
+    do {
+      x = savx;
+      y = savy;
+      stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
+      tmp = x;
+      x =  fma(-y,y, fma(x,x,x0));
+      y =  fma(2.0f*tmp,y,y0);
+      ccount += stay;
+      iter--;
+      savx = (stay ? x : savx);
+      savy = (stay ? y : savy);
+    } while (stay && iter);
+  }
+
+
+  out[tid] = (uint)ccount;
+
+};
+
+
+// Double-precision Mandelbrot kernel, one pixel per thread.
+//
+// Same scheme as the single-precision variant, but the starting point and the
+// iterated values are held in double. The number of iterations taken before
+// x*x + y*y exceeds 4 (capped at maxIter) is written to out[tid].
+// There is no bounds check on tid.
+template <typename T>
+__global__ void double_mad_kernel(uint *out, uint width, T xPos,  T yPos, T xStep, T yStep,
+                                   uint maxIter) {
+
+#pragma FP_CONTRACT ON
+  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+  int i = tid % width;
+  int j = tid / width;
+  double x0 = (double)(xPos + xStep*i);
+  double y0 = (double)(yPos + yStep*j);
+
+  double x = x0;
+  double y = y0;
+
+  uint iter = 0;
+  double tmp;
+  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
+    tmp = x;
+    // x = x*x - y*y + x0, y = 2*x_old*y + y0, expressed with fused multiply-adds
+    x = fma(-y,y,fma(x,x,x0));
+    y = fma(2.0f*tmp,y,y0);
+  }
+  out[tid] = iter;
+};
+
+
+// Double-precision Mandelbrot kernel with the inner loop unrolled 16 times.
+//
+// Each thread handles the pixel at column tid % width, row tid / width. The
+// main loop advances 16 iterations per pass and tests for escape once per
+// pass; while the point stays inside, the state is saved and ccount grows by
+// 16. After a pass that escapes, the "remainder" section replays from the last
+// saved state one iteration at a time (at most 16) to obtain the exact count.
+// The result written to out[tid] is ccount. There is no bounds check on tid.
+template <typename T>
+__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
+                  T yPos, T xStep, T yStep, uint maxIter) {
+
+#pragma FP_CONTRACT ON
+  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+
+  int i = tid % width;
+  int j = tid / width;
+  double x0 = (double)(xPos + xStep*(double)i);
+  double y0 = (double)(yPos + yStep*(double)j);
+
+  double x = x0;
+  double y = y0;
+
+// FAST selects the loop variant that breaks out as soon as a batch escapes.
+#define FAST
+  uint iter = 0;
+  double tmp;
+  int stay;
+  int ccount = 0;
+  stay = (x*x+y*y) <= 4.0;
+  double savx = x;
+  double savy = y;
+#ifdef FAST
+  for (iter = 0; (iter < maxIter); iter+=16)
+#else
+  for (iter = 0; stay && (iter < maxIter); iter+=16)
+#endif
+  {
+    // Restart the batch from the last state known to be inside the set.
+    x = savx;
+    y = savy;
+
+    // Two iterations
+    tmp = fma(-y,y, fma(x,x,x0));
+    y =   fma(2.0f*x,y,y0);
+    x =   fma(-y,y, fma(tmp,tmp,x0));
+    y =   fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp = fma(-y,y, fma(x,x,x0));
+    y =   fma(2.0f*x,y,y0);
+    x =   fma(-y,y, fma(tmp,tmp,x0));
+    y =   fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp = fma(-y,y, fma(x,x,x0));
+    y =   fma(2.0f*x,y,y0);
+    x =   fma(-y,y, fma(tmp,tmp,x0));
+    y =   fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =    fma(2.0f*x,y,y0);
+    x =    fma(-y,y, fma(tmp,tmp,x0));
+    y =    fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =    fma(2.0f*x,y,y0);
+    x =    fma(-y,y, fma(tmp,tmp,x0));
+    y =    fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =    fma(2.0f*x,y,y0);
+    x =    fma(-y,y, fma(tmp,tmp,x0));
+    y =    fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =    fma(2.0f*x,y,y0);
+    x =    fma(-y,y, fma(tmp,tmp,x0));
+    y =    fma(2.0f*tmp,y,y0);
+
+    // Two iterations
+    tmp =  fma(-y,y, fma(x,x,x0));
+    y =    fma(2.0f*x,y,y0);
+    x =    fma(-y,y, fma(tmp,tmp,x0));
+    y =    fma(2.0f*tmp,y,y0);
+
+    // Commit the batch only if the point is still inside after all 16 steps.
+    stay = (x*x+y*y) <= 4.0;
+    savx = (stay ? x : savx);
+    savy = (stay ? y : savy);
+    ccount += stay*16;
+#ifdef FAST
+    if (!stay)
+      break;
+#endif
+    }
+  // Handle remainder
+    if (!stay) {
+      // iter is reused here as a countdown of at most 16 single steps.
+      iter = 16;
+      do {
+        x = savx;
+        y = savy;
+        stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
+        tmp = x;
+        x =  fma(-y,y, fma(x,x,x0));
+        y =  fma(2.0f*tmp,y,y0);
+        ccount += stay;
+        iter--;
+        savx = (stay ? x : savx);
+        savy = (stay ? y : savy);
+      }
+      while (stay && iter);
+
+    }
+    out[tid] = (uint)ccount;
+};
+
+// NOTE(review): not referenced anywhere in the visible part of this file;
+// presumably the offset of the FMA results inside expectedIters - confirm.
+static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15;
+
+// Expected results for each kernel run at each coord
+// (30 reference totals; compared against the summed per-pixel iteration counts).
+unsigned long long expectedIters[] = {
+    203277748ull,  2147483648ull, 120254651ull,  203277748ull,  2147483648ull,
+    120254651ull,  203277748ull,  2147483648ull, 120254651ull,  203315114ull,
+    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull,
+    203280620ull,  2147483648ull, 120485704ull,  203280620ull,  2147483648ull,
+    120485704ull,  203280620ull,  2147483648ull, 120485704ull,  203315114ull,
+    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull};
+
+// Benchmark driver: launches the Mandelbrot kernels on one or more streams,
+// times them and collects a throughput figure per kernel name.
+//
+// All scalar members carry default initialisers so that no member is read
+// while indeterminate, whatever order the methods are called in.
+class hipPerfMandelBrot {
+  public:
+  hipPerfMandelBrot();
+  ~hipPerfMandelBrot();
+
+  // Number of kernel launches issued per timed pass.
+  void setNumKernels(unsigned int num) {
+    numKernels = num;
+  }
+
+  unsigned int getNumKernels() {
+    return numKernels;
+  }
+
+  // Number of streams the launches are spread over.
+  // Must not exceed the capacity of the streams array below.
+  void setNumStreams(unsigned int num) {
+    numStreams = num;
+  }
+  unsigned int getNumStreams() {
+    return numStreams;
+  }
+
+  void open(int deviceID);
+  void run(unsigned int testCase, unsigned int deviceId);
+  void printResults(void);
+
+  // Pointer-to-member type shared by the four launch wrappers, so that run()
+  // can pick one from an array of function pointers.
+  typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos,  float yPos,
+                 float xStep, float yStep, uint maxIter,  hipStream_t* streams, int blocks,
+                 int threads_per_block, int kernelCnt);
+
+  // Wrappers: each launches one kernel on streams[kernelCnt % number of streams].
+  void float_mad(uint *out, uint width, float xPos,  float yPos,
+                  float xStep, float yStep, uint maxIter, hipStream_t* streams,
+                  int blocks, int threads_per_block, int kernelCnt);
+
+  void float_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
+                            float xStep, float yStep, uint maxIter, hipStream_t* streams,
+                            int blocks, int threads_per_block, int kernelCnt);
+
+  void double_mad(uint *out, uint width, float xPos,  float yPos, float xStep,
+                   float yStep, uint maxIter, hipStream_t* streams, int blocks,
+                   int threads_per_block, int kernelCnt);
+
+  void double_mandel_unroll(uint *out, uint width, float xPos,  float yPos, float xStep,
+                             float yStep, uint maxIter, hipStream_t* streams, int blocks,
+                             int threads_per_block, int kernelCnt);
+
+  // Streams created and destroyed by run(); capacity is two.
+  hipStream_t streams[2];
+
+  private:
+  void setData(void *ptr, unsigned int value);
+  void checkData(uint *ptr);
+
+  // Default to a single kernel on a single stream.
+  unsigned int numKernels = 1;
+  unsigned int numStreams = 1;
+
+  // Kernel name -> one throughput value per recorded run.
+  std::map<std::string, std::vector<double>> results;
+  unsigned int width_ = 0;     // image side length in pixels
+  unsigned int bufSize = 0;    // size of one image buffer in bytes
+  unsigned int maxIter = 0;    // iteration cap handed to the kernels
+  unsigned int coordIdx = 0;   // index into coords
+  volatile unsigned long long totalIters = 0;
+  int numCUs = 0;              // multiprocessor count reported by the device
+  static const unsigned int numLoops = 10;
+};
+
+
+// Construction and destruction need no work of their own.
+hipPerfMandelBrot::hipPerfMandelBrot() = default;
+
+hipPerfMandelBrot::~hipPerfMandelBrot() = default;
+
+// Selects the device to benchmark, prints a one-line description of it and
+// records its multiprocessor count. When no device is present the test is
+// reported through passed() and nothing else is done.
+void hipPerfMandelBrot::open(int deviceId) {
+  int deviceCount = 0;
+  HIPCHECK(hipGetDeviceCount(&deviceCount));
+
+  const bool noDevice = deviceCount < 1;
+  if (noDevice) {
+    std::cout << "info: didn't find any GPU! skipping the test!\n";
+    passed();
+    return;
+  }
+
+  HIPCHECK(hipSetDevice(deviceId));
+
+  hipDeviceProp_t deviceProps = {0};
+  HIPCHECK(hipGetDeviceProperties(&deviceProps, deviceId));
+
+  std::cout << "info: running on bus " << "0x" << deviceProps.pciBusID << " ";
+  std::cout << deviceProps.name << " with " << deviceProps.multiProcessorCount << " CUs";
+  std::cout << " and device id: " << deviceId << std::endl;
+
+  numCUs = deviceProps.multiProcessorCount;
+}
+
+
+// Prints one row per kernel name with every throughput value recorded for it,
+// then clears the collected results so the next batch starts empty.
+void hipPerfMandelBrot::printResults() {
+
+  std::cout << "\n" <<"Measured perf for kernels in GFLOPS on "
+            << getNumStreams() << " streams (s)" <<  std::endl;
+
+  // Iterate the map entries directly instead of looking each key up again.
+  for (const auto& entry : results) {
+    std::cout << "\n" << std::setw(20) << entry.first << " ";
+    for (double gflops : entry.second) {
+      std::cout << std::setw(10) << gflops << " ";
+    }
+  }
+  results.clear();
+
+  std::cout << std::endl;
+}
+
+
+// Wrappers for the kernel launches
+// Launch wrapper for float_mad_kernel<float>.
+// The stream is chosen as streams[kernelCnt % number of streams]; the image
+// width handed to the kernel is the member width_.
+void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos,  float yPos, float xStep,
+                                   float yStep, uint maxIter, hipStream_t* streams,
+                                   int blocks, int threads_per_block, int kernelCnt) {
+  const int streamTotal = getNumStreams();
+  const dim3 gridDims(blocks);
+  const dim3 blockDims(threads_per_block);
+  hipStream_t target = streams[kernelCnt % streamTotal];
+
+  hipLaunchKernelGGL(float_mad_kernel<float>, gridDims, blockDims, 0, target,
+                     out, width_, xPos, yPos, xStep, yStep, maxIter);
+}
+
+
+// Launch wrapper for float_mandel_unroll_kernel<float>.
+// The stream is chosen as streams[kernelCnt % number of streams]; the image
+// width handed to the kernel is the member width_.
+void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
+                             float xStep, float yStep, uint maxIter, hipStream_t * streams,
+                             int blocks, int threads_per_block, int kernelCnt) {
+  const int streamTotal = getNumStreams();
+  const dim3 gridDims(blocks);
+  const dim3 blockDims(threads_per_block);
+  hipStream_t target = streams[kernelCnt % streamTotal];
+
+  hipLaunchKernelGGL(float_mandel_unroll_kernel<float>, gridDims, blockDims, 0, target,
+                     out, width_, xPos, yPos, xStep, yStep, maxIter);
+}
+
+
+// Launch wrapper for double_mad_kernel<double>.
+// The stream is chosen as streams[kernelCnt % number of streams]; the image
+// width handed to the kernel is the member width_. The coordinates arrive as
+// float because of the shared wrapper signature.
+void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos,  float yPos,
+                               float xStep, float yStep, uint maxIter, hipStream_t * streams,
+                               int blocks, int threads_per_block, int kernelCnt) {
+  const int streamTotal = getNumStreams();
+  const dim3 gridDims(blocks);
+  const dim3 blockDims(threads_per_block);
+  hipStream_t target = streams[kernelCnt % streamTotal];
+
+  hipLaunchKernelGGL(double_mad_kernel<double>, gridDims, blockDims, 0, target,
+                     out, width_, xPos, yPos, xStep, yStep, maxIter);
+}
+
+
+// Launch wrapper for the unrolled double-precision kernel.
+// The stream is chosen as streams[kernelCnt % number of streams]; the image
+// width handed to the kernel is the member width_.
+void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
+                              float xStep, float yStep, uint maxIter, hipStream_t * streams,
+                              int blocks, int threads_per_block, int kernelCnt) {
+
+  int streamCnt = getNumStreams();
+  // Launch double_mandel_unroll_kernel: the float_mandel_unroll_kernel template
+  // keeps its working values in float even when instantiated with double, so
+  // using it here would not exercise double-precision arithmetic at all.
+  hipLaunchKernelGGL(double_mandel_unroll_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
+                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
+
+}
+
+
+// Runs one benchmark configuration.
+//
+// testCase selects the region (testCase % numCoords) and the kernel wrapper;
+// the value 100 is used by main() for the warm-up run, whose result is not
+// recorded. numKernels launches are issued per pass, spread over the streams,
+// and the pass is repeated numLoops times. The resulting throughput is stored
+// in results under the kernel name chosen by testCase % 4.
+// deviceId is accepted for interface compatibility and is not used here.
+void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
+
+  const unsigned int numStreams = getNumStreams();
+
+  funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
+               &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
+  const unsigned int numWrappers = sizeof(p) / sizeof(p[0]);
+
+  // Maximum iteration count
+  maxIter = 32768;
+
+  // The image is width_ x width_ pixels, one thread per pixel.
+  width_ = 256;
+  bufSize = width_  * width_ * sizeof(uint);
+
+  // Pick the region up front so coordIdx is never read before it is set.
+  coordIdx = testCase % numCoords;
+
+  std::vector<uint*> hPtr(numKernels, nullptr);
+  std::vector<uint*> dPtr(numKernels, nullptr);
+
+  // Create streams for concurrency
+  for (uint i = 0; i < numStreams; i++) {
+    HIPCHECK(hipStreamCreate(&streams[i]));
+  }
+
+  // Allocate memory on the host and device
+  for (uint i = 0; i < numKernels; i++) {
+    HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
+    setData(hPtr[i], 0xdeadbeef);
+    HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize));
+  }
+
+  // Prepare kernel launch parameters; the block count is rounded up so every
+  // pixel gets a thread (for the fixed 256x256 image the division is exact).
+  const int threads = (bufSize/sizeof(uint));
+  const int threads_per_block  = 64;
+  const int blocks = (threads + threads_per_block - 1) / threads_per_block;
+
+  // Region geometry. The wrapper signature takes float, so the values are
+  // computed in double and narrowed once here for every kernel variant.
+  const float xStep = (float)(coords[coordIdx].width / (double)width_);
+  const float yStep = (float)(-coords[coordIdx].width / (double)width_);
+  const float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
+  const float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
+
+  // Copy the initial buffer contents from host to device
+  for (uint i = 0; i < numKernels; i++) {
+    HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
+  }
+
+  // Synchronize to make sure all the copies are completed
+  HIPCHECK(hipStreamSynchronize(0));
+
+  // Map the test case to a wrapper. Test cases outside the explicit table
+  // (the warm-up id 100, and 4 and 9) fall back to testCase % 4, the same
+  // index used for the result label below, instead of an indeterminate value.
+  int kernelIdx = (int)(testCase % numWrappers);
+  switch (testCase) {
+    case 0: case 5: case 10: kernelIdx = 0; break;
+    case 1: case 6: case 11: kernelIdx = 1; break;
+    case 2: case 7: case 12: kernelIdx = 2; break;
+    case 3: case 8: case 13: kernelIdx = 3; break;
+    default: break;
+  }
+
+  double totalTime = 0.0;
+
+  for (unsigned int k = 0; k < numLoops; k++) {
+    // Time the kernel execution
+    auto all_start = std::chrono::steady_clock::now();
+
+    for (uint i = 0; i < numKernels; i++) {
+      (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
+                             threads_per_block, i);
+    }
+
+    // Wait for the launches to finish.
+    // NOTE(review): this waits on the null stream only; it relies on the
+    // created streams synchronising with the null stream - confirm.
+    HIPCHECK(hipStreamSynchronize(0));
+
+    auto all_end = std::chrono::steady_clock::now();
+    std::chrono::duration<double> all_kernel_time = all_end - all_start;
+    totalTime += all_kernel_time.count();
+  }
+
+  // Copy data back from device to the host
+  for(uint i = 0; i < numKernels; i++) {
+    HIPCHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost));
+  }
+
+  const unsigned int numExpected = sizeof(expectedIters) / sizeof(expectedIters[0]);
+  for(uint i = 0; i < numKernels; i++) {
+    checkData(hPtr[i]);
+
+    // Walk the reference totals until one is not exceeded; the bound is tested
+    // first so the array is never read past its end.
+    unsigned int j = 0;
+    while (j < numExpected && totalIters > expectedIters[j]) {
+      j++;
+    }
+
+    if (j == numExpected) {
+      std::cout << "Incorrect iteration count detected. ";
+    }
+  }
+
+  // Compute GFLOPS.  There are 7 FLOPs per iteration
+  double perf = ((double)(totalIters*numKernels) * 7 * (double)(1e-09)) /
+                (totalTime / (double)numLoops);
+
+  std::vector<std::string> kernelName = {"float", "float_unroll",
+                      "double", "double_unroll"};
+
+  // Record results except for the warm-up kernel
+  if(testCase!=100) {
+    results[kernelName[testCase % 4]].push_back(perf);
+  }
+
+  for(uint i = 0 ; i < numStreams; i++) {
+    HIPCHECK(hipStreamDestroy(streams[i]));
+  }
+
+  // Free host and device memory; buffers from hipHostMalloc are released with
+  // hipHostFree, device buffers with hipFree.
+  for (uint i = 0; i < numKernels; i++) {
+    HIPCHECK(hipHostFree(hPtr[i]));
+    HIPCHECK(hipFree(dPtr[i]));
+  }
+}
+
+
+// Fills the first width_ * width_ unsigned ints behind ptr with value.
+void hipPerfMandelBrot::setData(void *ptr, unsigned int value) {
+  unsigned int *cursor = (unsigned int *)ptr;
+  unsigned int *const last = cursor + width_ * width_;
+  while (cursor != last) {
+    *cursor = value;
+    ++cursor;
+  }
+}
+
+
+// Sums the first width_ * width_ entries behind ptr into totalIters,
+// starting from zero.
+void hipPerfMandelBrot::checkData(uint *ptr) {
+  const unsigned int pixelCount = width_ * width_;
+  totalIters = 0;
+  for (const uint *cur = ptr, *stop = ptr + pixelCount; cur != stop; ++cur) {
+    totalIters += *cur;
+  }
+}
+
+
+// Entry point: one warm-up run, then twelve test cases on a single stream and
+// the same twelve again with two kernels on two streams, printing the
+// collected results after each batch.
+int main(int argc, char* argv[]) {
+  hipPerfMandelBrot mandelbrotCompute;
+  const int deviceId = 0;
+  const unsigned int testCaseCount = 12;
+
+  mandelbrotCompute.open(deviceId);
+
+  // Warmup-kernel - default stream executes serially
+  mandelbrotCompute.setNumStreams(1);
+  mandelbrotCompute.setNumKernels(1);
+  mandelbrotCompute.run(100/*Random number*/, deviceId);
+
+  // run all - sync
+  for (unsigned int testCase = 0; testCase < testCaseCount; ++testCase) {
+    mandelbrotCompute.setNumStreams(1);
+    mandelbrotCompute.setNumKernels(1);
+    mandelbrotCompute.run(testCase, deviceId);
+  }
+  mandelbrotCompute.printResults();
+
+  // run all - async
+  for (unsigned int testCase = 0; testCase < testCaseCount; ++testCase) {
+    mandelbrotCompute.setNumStreams(2);
+    mandelbrotCompute.setNumKernels(2);
+    mandelbrotCompute.run(testCase, deviceId);
+  }
+  mandelbrotCompute.printResults();
+
+  passed();
+}
@@ -0,0 +1,289 @@
+/*
+ Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ */
+
+/* HIT_START
+ * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <iostream>
+#include <chrono>
+#include "test_common.h"
+
+// Describes one square region of the complex plane to render: (x, y) is the
+// centre of the region and width is the length of its side.
+typedef struct {
+  double x;
+  double y;
+  double width;
+} coordRec;
+
+// Regions used by the test; only one is defined here.
+static coordRec coords[] = {
+    {0.0, 0.0, 0.00001},         // All black
+};
+
+// Number of entries in coords.
+static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
+
+// Single-precision Mandelbrot kernel, one pixel per thread.
+//
+// The flat thread index is split into column i = tid % width and row
+// j = tid / width; the starting point is (xPos + xStep*i, yPos + yStep*j).
+// The iteration runs until x*x + y*y exceeds 4 or maxIter steps were taken,
+// and the step count is written to out[tid]. There is no bounds check on tid.
+__global__ void mandelbrot(uint *out, uint width, float xPos,  float yPos, float xStep,
+                            float yStep, uint maxIter) {
+
+  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+  int i = tid % width;
+  int j = tid / width;
+  float x0 = (float)(xPos + xStep*i);
+  float y0 = (float)(yPos + yStep*j);
+
+  float x = x0;
+  float y = y0;
+
+  uint iter = 0;
+  float tmp;
+  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
+    tmp = x;
+    // x = x*x - y*y + x0, y = 2*x_old*y + y0, expressed with fused multiply-adds
+    x = fma(-y,y,fma(x,x,x0));
+    y = fma(2.0f*tmp,y,y0);
+  }
+
+  out[tid] = iter;
+};
+
+// Test driver that runs the mandelbrot kernel on one or several devices and
+// reports the elapsed time.
+class hipPerfDeviceConcurrency {
+  public:
+  hipPerfDeviceConcurrency();
+  ~hipPerfDeviceConcurrency();
+
+  // Number of devices found by open().
+  void setNumGpus(unsigned int num) { numDevices = num; }
+  unsigned int getNumGpus() { return numDevices; }
+
+  void open();
+  void close();
+  void run(unsigned int testCase, int numGpus);
+
+  private:
+  // Buffer helpers: fill a host buffer / sum its entries into totalIters.
+  void setData(void *ptr, unsigned int value);
+  void checkData(uint *ptr);
+
+  unsigned int numDevices;
+  unsigned int width_;    // image side length in pixels
+  unsigned int bufSize;   // size of one image buffer in bytes
+  unsigned int coordIdx;  // index into coords
+  unsigned long long totalIters = 0;
+};
+
+
+// Construction and destruction need no work of their own.
+hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() = default;
+
+hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() = default;
+
+// Queries the device count and stores it; when no device is present the test
+// is reported through passed().
+void hipPerfDeviceConcurrency::open(void) {
+  int deviceCount = 0;
+  HIPCHECK(hipGetDeviceCount(&deviceCount));
+  setNumGpus(deviceCount);
+
+  const bool noDevice = deviceCount < 1;
+  if (noDevice) {
+    std::cout << "info: didn't find any GPU! skipping the test!\n";
+    passed();
+  }
+}
+
+
+// Intentionally empty: run() releases everything it allocates.
+void hipPerfDeviceConcurrency::close() {}
+
+// Runs the mandelbrot kernel on numGpus devices and measures the wall-clock
+// time from the first launch until every device has finished.
+//
+// testCase == 0 is the warm-up mode: the kernel runs on the device held in
+// the function-static deviceId, which is advanced by one at the end of the
+// call, so successive warm-up calls visit successive devices. For any other
+// testCase, slot i runs on device i, results are verified and the timing is
+// printed.
+void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
+
+  static int deviceId;
+  uint * hPtr[numGpus];
+  uint * dPtr[numGpus];
+  hipStream_t streams[numGpus];
+  int numCUs[numGpus];
+  unsigned int maxIter[numGpus];
+  unsigned long long expectedIters[numGpus];
+
+  int threads, threads_per_block, blocks;
+  float xStep, yStep, xPos, yPos;
+
+  // Set-up: per-device stream, buffers and iteration budget.
+  for(int i = 0; i < numGpus; i++) {
+
+    if(testCase != 0) {
+      deviceId = i;
+    }
+
+    HIPCHECK(hipSetDevice(deviceId));
+
+    // Query the device that was just selected (during warm-up this differs
+    // from the slot index i).
+    hipDeviceProp_t props = {0};
+    HIPCHECK(hipGetDeviceProperties(&props, deviceId));
+
+    if (testCase != 0) {
+      std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
+                << " with " << props.multiProcessorCount << " CUs" << " and device ID: "
+                << i << std::endl;
+    }
+
+    numCUs[i] = props.multiProcessorCount;
+    int clkFrequency = 0;
+    HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, deviceId));
+
+    clkFrequency =(unsigned int)clkFrequency/1000;
+
+    // Maximum iteration count, scaled by clock and multiprocessor count and
+    // rounded up to a multiple of 16.
+    maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
+    maxIter[i] = (maxIter[i] + 15) & ~15;
+
+    // The image is width_ x width_ pixels, one thread per pixel.
+    width_ = 256;
+
+    bufSize = width_ * width_ * sizeof(uint);
+
+    // Create streams for concurrency
+    HIPCHECK(hipStreamCreate(&streams[i]));
+
+    // Allocate memory on the host and device
+    HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
+    setData(hPtr[i], 0xdeadbeef);
+    HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize));
+
+    // Prepare kernel launch parameters; the block count is rounded up so every
+    // pixel gets a thread (for the fixed 256x256 image the division is exact).
+    threads = (bufSize/sizeof(uint));
+    threads_per_block  = 64;
+    blocks = (threads + threads_per_block - 1) / threads_per_block;
+
+    coordIdx = testCase % numCoords;
+    xStep = (float)(coords[coordIdx].width / (double)width_);
+    yStep = (float)(-coords[coordIdx].width / (double)width_);
+    xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
+    yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
+
+    // Copy memory from host to device
+    HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
+  }
+
+  // Time the kernel execution
+  auto all_start = std::chrono::steady_clock::now();
+
+  for(int i = 0; i < numGpus; i++) {
+
+    if(testCase != 0) {
+      deviceId = i;
+    }
+
+    HIPCHECK(hipSetDevice(deviceId));
+
+    hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i],
+                        dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]);
+  }
+
+  // Wait for every device: select the device and wait on the stream the
+  // kernel was launched into, rather than on the null stream of whichever
+  // device happened to be current.
+  for(int i = 0; i < numGpus; i++) {
+
+    if(testCase != 0) {
+      deviceId = i;
+    }
+
+    HIPCHECK(hipSetDevice(deviceId));
+    HIPCHECK(hipStreamSynchronize(streams[i]));
+  }
+
+  auto all_end = std::chrono::steady_clock::now();
+  std::chrono::duration<double> all_kernel_time = all_end - all_start;
+
+  // Read back, verify and release per-device resources.
+  for(int i = 0; i < numGpus; i++) {
+
+    if(testCase != 0) {
+      deviceId = i;
+    }
+    HIPCHECK(hipSetDevice(deviceId));
+
+    // Copy data back from device to the host
+    HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
+
+    checkData(hPtr[i]);
+    expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
+
+    if (testCase != 0) {
+      if(totalIters != expectedIters[i]) {
+        std::cout << "Incorrect iteration count detected" << std::endl;
+      }
+    }
+
+    HIPCHECK(hipStreamDestroy(streams[i]));
+
+    // Free host and device memory; buffers from hipHostMalloc are released
+    // with hipHostFree, device buffers with hipFree.
+    HIPCHECK(hipHostFree(hPtr[i]));
+    HIPCHECK(hipFree(dPtr[i]));
+  }
+
+  if (testCase != 0) {
+    std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): "
+              << all_kernel_time.count() << " (s) " << '\n' << std::endl;
+  }
+
+  // Warm-up mode: move on to the next device for the following call.
+  if(testCase == 0) {
+    deviceId++;
+  }
+}
+
+
+// Fills the first width_ * width_ unsigned ints behind ptr with value.
+void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
+  unsigned int *cursor = (unsigned int *)ptr;
+  unsigned int *const last = cursor + width_ * width_;
+  while (cursor != last) {
+    *cursor = value;
+    ++cursor;
+  }
+}
+
+
+// Sums the first width_ * width_ entries behind ptr into totalIters,
+// starting from zero.
+void hipPerfDeviceConcurrency::checkData(uint *ptr) {
+  const unsigned int pixelCount = width_ * width_;
+  totalIters = 0;
+  for (const uint *cur = ptr, *stop = ptr + pixelCount; cur != stop; ++cur) {
+    totalIters += *cur;
+  }
+}
+
+
+// Entry point: warm up every device once, then time the kernel on a single
+// device and on all available devices.
+int main(int argc, char* argv[]) {
+  hipPerfDeviceConcurrency deviceConcurrency;
+  deviceConcurrency.open();
+
+  const int gpuCount = deviceConcurrency.getNumGpus();
+
+  // Test case 0 is the warm-up mode; one call per device.
+  const unsigned int warmupCase = 0;
+  int warmed = 0;
+  while (warmed < gpuCount) {
+    deviceConcurrency.run(warmupCase, 1);
+    ++warmed;
+  }
+
+  // Time for kernel on 1 device
+  const unsigned int singleDeviceCase = 1;
+  deviceConcurrency.run(singleDeviceCase, 1);
+
+  // Time for kernel on all available devices
+  const unsigned int allDevicesCase = 2;
+  deviceConcurrency.run(allDevicesCase, gpuCount);
+
+  passed();
+}
@@ -57,6 +57,15 @@ void matrixTransposeCPUReference(T* output, T* input, const unsigned int width)
    }
 }

+void getFactor(int& fact) { fact = 101; }  // per-type offset added to every test element
+void getFactor(unsigned int& fact) { fact = static_cast<unsigned int>(INT32_MAX) + 1u; }  // 2^31
+void getFactor(float& fact) { fact = 2.5f; }
+void getFactor(double& fact) { fact = 2.5; }
+void getFactor(long& fact) { fact = 202L; }
+void getFactor(unsigned long& fact) { fact = static_cast<unsigned long>(__LONG_MAX__) + 1ul; }
+void getFactor(long long& fact) { fact = 303LL; }
+void getFactor(unsigned long long& fact) { fact = static_cast<unsigned long long>(__LONG_LONG_MAX__) + 1ull; }
+
 template<typename T>
 void runTest() {
    T* Matrix;
@@ -77,8 +86,10 @@ void runTest() {
    cpuTransposeMatrix = (T*)malloc(NUM * sizeof(T));

    // initialize the input data
+    T factor;
+    getFactor(factor);
    for (i = 0; i < NUM; i++) {
-        Matrix[i] = (T)i * 10l;
+        Matrix[i] = (T)i + factor;
    }

    // allocate the memory on the device side
@@ -124,7 +135,11 @@ void runTest() {
 int main() {
    runTest<int>();
    runTest<float>();
+    runTest<double>();
    runTest<long>();
    runTest<long long>();
+    runTest<unsigned int>();  // unsigned types: getFactor seeds them just past the signed maximum
+    runTest<unsigned long>();
+    runTest<unsigned long long>();
    passed();
 }
@@ -47,13 +47,31 @@ __global__ void shflUpSum(T* a, int size) {
    a[threadIdx.x] = val;
 }

+template <typename T>
+__global__ void shflXorSum(T* a, int size) {
+  T partial = a[threadIdx.x];  // each thread starts from its own element
+  for (int laneMask = size / 2; laneMask > 0; laneMask >>= 1)
+    partial += __shfl_xor(partial, laneMask, size);  // add the value exchanged via mask laneMask
+  a[threadIdx.x] = partial;
+}
+
+void getFactor(int& fact) { fact = 101; }  // per-type offset added to every shuffled element
+void getFactor(unsigned int& fact) { fact = static_cast<unsigned int>(INT32_MAX) + 1u; }  // 2^31
+void getFactor(float& fact) { fact = 2.5f; }
+void getFactor(double& fact) { fact = 2.5; }
+void getFactor(long& fact) { fact = 202L; }
+void getFactor(unsigned long& fact) { fact = static_cast<unsigned long>(__LONG_MAX__) + 1ul; }
+void getFactor(long long& fact) { fact = 303LL; }
+void getFactor(unsigned long long& fact) { fact = static_cast<unsigned long long>(__LONG_LONG_MAX__) + 1ull; }
+
 template <typename T>
 void runTestShflUp() {
    const int size = 32;
    T a[size];
    T cpuSum = 0;
+    T factor; getFactor(factor);
    for (int i = 0; i < size; i++) {
-        a[i] = i;
+        a[i] = i + factor;
        cpuSum += a[i];
    }
    T* d_a;
@@ -73,8 +91,9 @@ void runTestShflDown() {
    const int size = 32;
    T a[size];
    T cpuSum = 0;
+    T factor; getFactor(factor);
    for (int i = 0; i < size; i++) {
-        a[i] = i;
+        a[i] = i + factor;
        cpuSum += a[i];
    }
    T* d_a;
@@ -84,19 +103,58 @@ void runTestShflDown() {
    hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault);
    if (a[0] != cpuSum) {
        hipFree(d_a);
-        failed("Shfl Up Sum did not match.");
+        failed("Shfl Down Sum did not match.");
+    }
+    hipFree(d_a);
+}
+
+template <typename T>
+void runTestShflXor() {  // sums 32 values with shflXorSum and checks a[0] against a host-side sum
+    const int size = 32;  // also the thread count of the single launched block
+    T a[size];
+    T cpuSum = 0;  // host reference, accumulated in index order
+    T factor; getFactor(factor);
+    for (int i = 0; i < size; i++) {
+        a[i] = i + factor;
+        cpuSum += a[i];
+    }
+    T* d_a;
+    HIPCHECK(hipMalloc(&d_a, sizeof(T) * size));  // fail on the API error itself,
+    HIPCHECK(hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault));  // not on a bogus sum later
+    hipLaunchKernelGGL(shflXorSum<T>, 1, size, 0, 0, d_a, size);
+    HIPCHECK(hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault));
+    if (a[0] != cpuSum) {
+        hipFree(d_a);
+        failed("Shfl Xor Sum did not match.");
    }
    hipFree(d_a);
 }
 int main() {
    runTestShflUp<int>();
    runTestShflUp<float>();
+    runTestShflUp<double>();
    runTestShflUp<long>();
    runTestShflUp<long long>();
+    runTestShflUp<unsigned int>();  // unsigned types: getFactor seeds them just past the signed maximum
+    runTestShflUp<unsigned long>();
+    runTestShflUp<unsigned long long>();

    runTestShflDown<int>();
    runTestShflDown<float>();
+    runTestShflDown<double>();
    runTestShflDown<long>();
    runTestShflDown<long long>();
+    runTestShflDown<unsigned int>();
+    runTestShflDown<unsigned long>();
+    runTestShflDown<unsigned long long>();
+
+    runTestShflXor<int>();  // same element types again, now through the __shfl_xor kernel
+    runTestShflXor<float>();
+    runTestShflXor<double>();
+    runTestShflXor<long>();
+    runTestShflXor<long long>();
+    runTestShflXor<unsigned int>();
+    runTestShflXor<unsigned long>();
+    runTestShflXor<unsigned long long>();
    passed();
 }
@@ -395,6 +395,9 @@ int main(int argc, char* argv[]) {

    if (gpuCount < 2) {
        printf("P2P application requires atleast 2 gpu devices\n");
+        if (hip_skip_tests_enabled()) {
+          return hip_skip_retcode();
+        }
    } else {
        if (p_tests & 0x100) {
            testPeerHostToDevice(false /*useAsyncCopy*/);
@@ -0,0 +1,280 @@
+/*
+  Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+// Test Description:
+/*The general idea of the application is to test how Cooperative Groups kernel
+launches work when launching too many warps to the target device. This test
+first queries the nominal warp size of the target device. It then walks through
+block sizes from 1 thread, 1 warp, 2 warps, ... `maximum_warps_in_a_block`. For
+each of these, it queries the maximum number of blocks that can fit in each SM.
+It then queries the number of SMs on the target device. This will yield a
+calculation for the maximum number of blocks that can be co-scheduled on this
+device.
+
+The Cooperative Groups API says that users should not launch more than this
+many warps (or blocks, etc.) to the target device. This test first tries to
+launch 2x as many blocks, to confirm that the runtime prevents such a launch
+by returning a proper error value (`hipErrorCooperativeLaunchTooLarge`).
+
+It then ensures that trying to launch too large of a kernel invocation does
+not break the GPU by launching a kernel with exactly the maximum number of
+blocks.
+
+Finally, we run the same test for a block size that is larger than the maximum
+allowed by the device, to ensure that this case is properly detected by the
+runtime and that nothing breaks.*/
+
+
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_cooperative_groups.h>
+#include "test_common.h"
+
+
+static inline void hipCheckAndFail(hipError_t errval,
+        const char *file, int line) {
+  // Fail the test if errval is an error or differs from hipGetLastError().
+  const hipError_t queried_err = hipGetLastError();
+  if (errval != hipSuccess) {
+    std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl;
+    std::cerr << "    Location: " << file << ":" << line << std::endl;
+    failed("");
+  }
+  if (queried_err != errval) {
+    std::cerr << "Error: the return value of a function was not the same "
+              << "as the value returned by hipGetLastError()" << std::endl;
+    std::cerr << "    Location: " << file << ":" << line << std::endl;
+    std::cerr << "    Function returned: " << hipGetErrorString(errval)
+              << " (" << errval << ")" << std::endl;
+    std::cerr << "hipGetLastError() returned: " << hipGetErrorString(queried_err)
+              << " (" << queried_err << ")" << std::endl;
+    failed("");
+  }
+}
+#define hipCheckErr(errval) \
+        do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)  // forwards the caller's file/line
+
+static inline bool hipCheckExpected(hipError_t errval,
+        hipError_t expected_err, const char *file, int line) {
+  // True only when errval equals expected_err and also equals the error read
+  // from hipGetLastError() on entry; otherwise logs the mismatch, returns false.
+  const hipError_t queried_err = hipGetLastError();
+  if (errval != expected_err) {
+    std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl;
+    std::cerr << "    Location: " << file << ":" << line << std::endl;
+    return false;
+  }
+  if (queried_err == errval) return true;
+  // The return value and hipGetLastError() disagree: report both.
+  std::cerr << "Error: the return value of a function was not the same "
+            << "as the value returned by hipGetLastError()" << std::endl;
+  std::cerr << "    Location: " << file << ":" << line << std::endl;
+  std::cerr << "    Function returned: " << hipGetErrorString(errval)
+            << " (" << errval << ")" << std::endl;
+  std::cerr << "hipGetLastError() returned: " << hipGetErrorString(queried_err)
+            << " (" << queried_err << ")" << std::endl;
+  return false;
+}
+
+static bool cooperative_groups_support(int device_id) {
+  // True only if both the device attribute and the device properties report
+  // cooperative-launch support for device_id; logs the failing check otherwise.
+  int cooperative_attribute = 0;
+  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
+           hipDeviceAttributeCooperativeLaunch, device_id));
+  if (!cooperative_attribute) {
+    std::cerr << "Cooperative launch support not available in ";
+    std::cerr << "the device attribute for device " << device_id << std::endl;
+    return false;
+  }
+  hipDeviceProp_t device_properties;
+  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
+  if (device_properties.cooperativeLaunch == 0) {
+    std::cerr << "Cooperative group support not available in ";
+    std::cerr << "device properties." << std::endl;
+    return false;
+  }
+  return true;
+}
+
+__global__ void test_kernel(long long *array) {
+  // Each thread adds the current clock64() value to its own array slot.
+  array[blockIdx.x * blockDim.x + threadIdx.x] += clock64();
+}
+
+// Per device and block size (1 thread, each whole number of warps, one warp
+// past the maximum): launching 2x the co-schedulable blocks must be rejected,
+// and launching exactly that many must succeed unless no block fits at all.
+int main(int argc, char** argv) {
+  hipError_t err;
+  int device_num, FailFlag = 0;
+  // Device buffers, one per launch attempt. nullptr means "not allocated", so
+  // the failure path can free them no matter where the block-size loop stopped.
+  long long *dev_array[2] = {nullptr, nullptr};
+  HIPCHECK(hipGetDeviceCount(&device_num));
+  for (int dev = 0; dev < device_num; ++dev) {
+    /*************************************************************************/
+    /* Test whether target device supports cooperative groups ****************/
+    HIPCHECK(hipSetDevice(dev));
+    if (!cooperative_groups_support(dev)) {
+      std::cout << "Skipping the test with Pass result.\n";
+      passed();
+    }
+
+    /*************************************************************************/
+    /* Create the streams we will use in this test. **************************/
+    hipStream_t streams[2];
+    for (int i = 0; i < 2; i++) {
+      HIPCHECK(hipStreamCreate(&streams[i]));
+    }
+
+    /*************************************************************************/
+    /* We will try to launch more waves than the GPU can fit. ***************/
+    hipDeviceProp_t device_properties;
+    HIPCHECK(hipGetDeviceProperties(&device_properties, dev));
+    int warp_size = device_properties.warpSize;
+    int num_sms = device_properties.multiProcessorCount;
+    int max_num_threads = device_properties.maxThreadsPerBlock;
+
+    // Check single-thread block, all numbers of warps, then too-large block
+    for (int block_size = 0; block_size <= (max_num_threads + warp_size);
+         block_size += warp_size) {
+      if (block_size == 0) {
+        block_size = 1;
+      }
+      int max_blocks_per_sm;
+      // Calculate the device occupancy to know how many blocks can be run.
+      HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+              &max_blocks_per_sm, test_kernel, block_size, 0,
+              hipOccupancyDefault));
+
+      if ((block_size > max_num_threads) && (max_blocks_per_sm != 0)) {
+        std::cerr << "ERROR! Occupancy API indicated that we can have >0 ";
+        std::cerr << "blocks in a kernel when the block size is too large ";
+        std::cerr << "to work on the device." << std::endl;
+        std::cerr << "This is incorrect, and could possibly lead users ";
+        std::cerr << "to try to launch kernels that will fail." << std::endl;
+        // Nothing is allocated yet in this iteration; dev_array is all nullptr.
+        FailFlag = 1;
+        break;
+      }
+
+      int desired_blocks = max_blocks_per_sm * num_sms;
+      bool expect_fail = false;
+      if (desired_blocks == 0) {
+        desired_blocks = 1;
+        expect_fail = true;
+      }
+
+      /**********************************************************************/
+      /* Set up data to pass into the kernel ********************************/
+
+      for (int i = 0; i < 2; i++) {
+        int test_size;
+        // Case where we expect to fail at launch.
+        if (i == 0) {
+          test_size = 2 * desired_blocks;
+        } else {
+          test_size = desired_blocks;
+        }
+        HIPCHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]),
+                           test_size * block_size * sizeof(long long)));
+        HIPCHECK(hipMemsetAsync(dev_array[i], 0,
+                                test_size * block_size * sizeof(long long),
+                                streams[i]));
+      }
+
+      HIPCHECK(hipDeviceSynchronize());
+
+      /***********************************************************************/
+      /* Launch the kernels **************************************************/
+      void *coop_params[2][1];
+      for (int i = 0; i < 2; i++) {
+        coop_params[i][0] = reinterpret_cast<void*>(&dev_array[i]);
+      }
+
+      err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
+                                       2 * desired_blocks, block_size,
+                                       coop_params[0], 0, streams[0]);
+
+      hipError_t expect_to_see;
+      if (expect_fail) {
+        expect_to_see = hipErrorInvalidConfiguration;
+      } else {
+        expect_to_see = hipErrorCooperativeLaunchTooLarge;
+      }
+      if (!hipCheckExpected(err, expect_to_see, __FILE__, __LINE__)) {
+        std::cerr << "ERROR! Tried to launch a cooperative kernel with ";
+        std::cerr << "too many warps." << std::endl;
+        std::cerr << "This SHOULD have failed with the error ";
+        std::cerr << hipGetErrorString(expect_to_see);
+        std::cerr << " (" << expect_to_see << ")." << std::endl;
+        std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
+        std::cerr << " (" << err << ")" << std::endl;
+        FailFlag = 1;
+        break;
+      }
+
+      HIPCHECK(hipDeviceSynchronize());
+      err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
+                                       desired_blocks, block_size,
+                                       coop_params[1], 0, streams[1]);
+
+      expect_to_see = expect_fail ? hipErrorInvalidConfiguration : hipSuccess;
+      if (!hipCheckExpected(err, expect_to_see, __FILE__, __LINE__)) {
+        std::cerr << "ERROR! Tried to launch a cooperative kernel ";
+        std::cerr << "with a normal size, but a block size of ";
+        std::cerr << block_size << std::endl;
+        std::cerr << "This SHOULD have returned ";
+        std::cerr << hipGetErrorString(expect_to_see);
+        std::cerr << " (" << expect_to_see << ")." << std::endl;
+        std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
+        std::cerr << " (" << err << ")" << std::endl;
+        FailFlag = 1;
+        break;
+      }
+
+      HIPCHECK(hipDeviceSynchronize());
+
+      if (block_size == 1) {
+        block_size = 0;
+      }
+      for (int m = 0; m < 2; ++m) {
+        HIPCHECK(hipFree(dev_array[m]));
+        dev_array[m] = nullptr;  // keeps the failure-path free below a no-op
+      }
+    }
+    for (int m = 0; m < 2; ++m) {
+      HIPCHECK(hipStreamDestroy(streams[m]));
+    }
+    if (FailFlag == 1) {
+      for (int m = 0; m < 2; ++m) {
+        HIPCHECK(hipFree(dev_array[m]));
+      }
+      failed("");
+    }
+  }
+  passed();
+}
@@ -0,0 +1,283 @@
+/*
+Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+// Test Description:
+/*
+The general idea of the application is to test how Cooperative Groups kernel
+launches to a stream interact with other kernels being launched to different
+streams.
+
+For example: the HIP runtime will force cooperative kernel launches to run
+serially, even if they are launched to different streams. However,
+cooperative kernel launches can run in parallel with regular kernels that
+are launched to other streams. This limitation is so that the cooperative
+kernels do not conflict with one another for resources and potentially
+deadlock the system.
+
+As such, this benchmark tests three situations:
+
+  1. Launching a cooperative kernel by itself to stream[0]
+  2. Launching two cooperative kernels in parallel to stream[0] and stream[1]
+  3. Launching two cooperative kernels in parallel to stream[0] and stream[1]
+     and launching a third non-cooperative kernel to stream[2]
+
+We time how long it takes to run each of these benchmarks and print it as
+the output of the benchmark. The kernels themselves are just useless time-
+wasting code so that the kernel takes a meaningful amount of time on the
+GPU before it exits. We only launch a single wavefront for each kernel, so
+any serialization should not be because of GPU occupancy concerns.
+
+If test #2 takes roughly twice as long as #1, that implies that cooperative
+kernels are properly serialized with each other by the runtime.
+
+If test #3 takes the same amount of time as test #2, that implies that
+regular kernels can properly run in parallel with cooperative kernels.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <chrono>
+#include <hip/hip_runtime.h>
+#include <hip/hip_cooperative_groups.h>
+#include "test_common.h"
+
+static inline void hipCheckAndFail(hipError_t errval,
+        const char *file, int line) {
+  // Fail the test if errval is an error or differs from hipGetLastError().
+  const hipError_t queried_err = hipGetLastError();
+  if (errval != hipSuccess) {
+    std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl;
+    std::cerr << "Location: " << file << ":" << line << std::endl;
+    failed("");
+  }
+  if (queried_err != errval) {
+    std::cerr << "Error: the return value of a function was not the same "
+              << "as the value returned by hipGetLastError()" << std::endl;
+    std::cerr << "Location: " << file << ":" << line << std::endl;
+    std::cerr << "Function returned: " << hipGetErrorString(errval)
+              << " (" << errval << ")" << std::endl;
+    std::cerr << "hipGetLastError() returned: " << hipGetErrorString(queried_err)
+              << " (" << queried_err << ")" << std::endl;
+    failed("");
+  }
+}
+#define hipCheckErr(errval) \
+  do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)  // forwards the caller's file/line
+
+static int cooperative_groups_support(int device_id) {
+  // Returns 1 only if both the device attribute and the device properties
+  // report cooperative-launch support for device_id; otherwise logs and returns 0.
+  int cooperative_attribute = 0;
+  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
+           hipDeviceAttributeCooperativeLaunch, device_id));
+  if (!cooperative_attribute) {
+    std::cerr << "Cooperative launch support not available in ";
+    std::cerr << "the device attribute for device " << device_id << std::endl;
+    return 0;
+  }
+
+  hipDeviceProp_t device_properties;
+  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
+  if (device_properties.cooperativeLaunch == 0) {
+    std::cerr << "Cooperative group support not available in ";
+    std::cerr << "device properties." << std::endl;
+    return 0;
+  }
+  return 1;
+}
+
+__global__ void test_kernel(uint32_t loops, unsigned long long *array) {
+  // Each of the `loops` passes spins 1000000 clock64() ticks, then adds clock64().
+  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+  for (uint32_t i = 0; i < loops; i++) {  // counter matches loops' type: no signed/unsigned mix
+    long long start_clock = clock64();
+    while (clock64() < (start_clock+1000000)) {}
+    array[rank] += clock64();
+  }
+}
+
+int main(int argc, char** argv) {
+  hipError_t err;
+  // Per device: time 1 coop kernel, 2 coop kernels, then 2 coop + 1 regular.
+  int device_num = 0, loops = 1000, FailFlag = 0;
+  /* Create the streams we will use in this test. **************************/
+  hipStream_t streams[3];
+  // Three device buffers, one per stream/kernel; allocated per device below
+  // and released at the end of each device iteration.
+  unsigned long long *dev_array[3];
+  HIPCHECK(hipGetDeviceCount(&device_num));
+  for (int dev = 0; dev < device_num; ++dev) {
+    /*************************************************************************/
+    /* Test whether target device supports cooperative groups ****************/
+    HIPCHECK(hipSetDevice(dev));
+    if (!cooperative_groups_support(dev)) {
+      std::cout << "Skipping the test with Pass result.\n";
+      passed();
+    }
+
+    /*************************************************************************/
+    /* We will launch enough waves to fill up all of the GPU *****************/
+    hipDeviceProp_t device_properties;
+    HIPCHECK(hipGetDeviceProperties(&device_properties, dev));
+    int warp_size = device_properties.warpSize;
+    int num_sms = device_properties.multiProcessorCount;
+    int desired_blocks = 1;
+    std::cout << "Device: " << dev << std::endl;
+    std::cout << "Device name: " << device_properties.name << std::endl;
+
+    int max_blocks_per_sm;
+    // Calculate the device occupancy to know how many blocks can be run.
+    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
+                                                          test_kernel,
+                                                          warp_size, 0));
+
+    if (desired_blocks > max_blocks_per_sm * num_sms) {
+      std::cerr << "The requested number of blocks will not fit on the GPU";
+      std::cerr << std::endl;
+      std::cerr << "You requested " << desired_blocks << " but we can only ";
+      std::cerr << "fit " << (max_blocks_per_sm * num_sms) << std::endl;
+      failed("");
+    }
+
+    /*************************************************************************/
+    for (int i = 0; i < 3; i++) {
+      HIPCHECK(hipStreamCreate(&streams[i]));
+    }
+
+    /*************************************************************************/
+    /* Set up data to pass into the kernel ***********************************/
+
+    for (int i = 0; i < 3; i++) {
+      HIPCHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]),
+                         warp_size * sizeof(long long)));
+      HIPCHECK(hipMemsetAsync(dev_array[i], 0, warp_size * sizeof(long long),
+                              streams[i]));
+    }
+
+    HIPCHECK(hipDeviceSynchronize());
+
+    /*************************************************************************/
+    /* Launch the kernels ****************************************************/
+    void *coop_params[3][2];
+    for (int i = 0; i < 3; i++) {
+      coop_params[i][0] = reinterpret_cast<void*>(&loops);
+      coop_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
+    }
+
+    std::cout << "Launching a single cooperative kernel..." << std::endl;
+    auto single_start = std::chrono::steady_clock::now();  // monotonic: safe for intervals
+    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
+                                        desired_blocks, warp_size,
+                                        coop_params[0], 0, streams[0]));
+
+    HIPCHECK(hipDeviceSynchronize());
+    auto single_end = std::chrono::steady_clock::now();
+    std::cout << "Launching 2 cooperative kernels to different streams...";
+    std::cout << std::endl;
+
+    auto double_start = std::chrono::steady_clock::now();
+    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
+                                        desired_blocks, warp_size,
+                                        coop_params[0], 0, streams[0]));
+    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
+                                        desired_blocks, warp_size,
+                                        coop_params[1], 0, streams[1]));
+
+    HIPCHECK(hipDeviceSynchronize());
+    auto double_end = std::chrono::steady_clock::now();
+    std::cout << "Launching 2 cooperative kernels and 1 normal kernel...";
+    std::cout << std::endl;
+
+    auto triple_start = std::chrono::steady_clock::now();
+    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
+                                        desired_blocks, warp_size,
+                                        coop_params[0], 0, streams[0]));
+    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
+                                        desired_blocks, warp_size,
+                                        coop_params[1], 0, streams[1]));
+    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size),
+                       0, streams[2], loops, dev_array[2]);
+    err = hipGetLastError();
+    hipCheckErr(err);
+
+    HIPCHECK(hipDeviceSynchronize());
+    auto triple_end = std::chrono::steady_clock::now();
+    std::chrono::duration<double> single_kernel_time =
+                                  (single_end - single_start);
+    std::chrono::duration<double> double_kernel_time =
+                                  (double_end - double_start);
+    std::chrono::duration<double> triple_kernel_time =
+                                  (triple_end - triple_start);
+
+    std::cout << "A single kernel took:" << std::endl;
+    std::cout << "    " << single_kernel_time.count();
+    std::cout << " seconds" << std::endl;
+    std::cout << std::endl;
+    std::cout << "Two cooperative kernels that could run together took:";
+    std::cout << std::endl;
+    std::cout << "    " << double_kernel_time.count();
+    std::cout << " seconds" << std::endl;
+    std::cout << std::endl;
+    std::cout << "Two coop kernels and a third regular kernel took:";
+    std::cout << std::endl << "    ";
+    std::cout << triple_kernel_time.count();
+    std::cout << " seconds" << std::endl;
+
+    std::cout << "Testing whether these times make sense.." << std::endl;
+    // Test that two cooperative kernels is roughly twice as long as one
+    if (double_kernel_time < 1.8 * single_kernel_time) {
+      std::cerr << "ERROR!" << std::endl;
+      std::cerr << "Two cooperative kernels launched at the same ";
+      std::cerr << "time did not take roughly twice as long as a single ";
+      std::cerr << "cooperative kernel." << std::endl;
+      std::cerr << "Were they truly serialized?" << std::endl;
+      FailFlag = 1;
+      break;
+    }
+
+    // Test that the three kernels together took roughly as long as two
+    // cooperative kernels.
+    if (triple_kernel_time > 1.1 * double_kernel_time) {
+      std::cerr << "ERROR!" << std::endl;
+      std::cerr << "Launching a normal kernel in parallel with two ";
+      std::cerr << "back-to-back cooperative kernels still ended up taking ";
+      std::cerr << "more than 10% longer than the two cooperative kernels ";
+      std::cerr << "alone." << std::endl;
+      std::cerr << "Is the normal kernel being serialized with the ";
+      std::cerr << "cooperative kernels on different streams?" << std::endl;
+      FailFlag = 1;
+      break;
+    }
+    for (int k = 0; k < 3; ++k) {
+      HIPCHECK(hipFree(dev_array[k]));
+      HIPCHECK(hipStreamDestroy(streams[k]));
+    }
+  }
+  if (FailFlag == 1) {
+    for (int k = 0; k < 3; ++k) {
+      HIPCHECK(hipFree(dev_array[k]));
+      HIPCHECK(hipStreamDestroy(streams[k]));
+    }
+    failed("");
+  }
+  passed();
+}
@@ -0,0 +1,303 @@
+/*
+Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+// Test Description:
+/*The general idea of the application is to create a buffer of width N. N is a
+command line parameter, and the user will need to make sure that we can fit
+two buffers of N unsigned integers onto the target GPU at the same time.
+
+We then launch a fixed number of warps to the GPU. This number is calculated
+to fill the GPU with as many warps as can simultaneously run on the GPU.
+The threads in these warps then walk over two arrays. First, values from
+A[offset] are added into B[offset]. After all of A is added into all of B
+in this element-wise manner, all of the waves barrier with one another.
+
+After the barrier, the waves start adding values from B[mirror_offset] into
+A[offset]. Mirror offset means that the wave that is writing into A[7] is
+reading from B[7 before the last value]. This was probably written by a
+different thread before the barrier.
+
+After going through this loop a certain number of times, the kernel ends and
+we read the arrays back out and recalculate this algorithm serially on the
+CPU. We compare the serial version to the version that has inter-thread data
+sharing and barriers and ensure they result in the same answer.
+
+If they do have the same answer, then we can pretty confidently say that
+writing from thread X and then hitting a barrier allows thread Y to see the
+values.*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+#include <hip/hip_runtime.h>
+#include <hip/hip_cooperative_groups.h>
+#include "test_common.h"
+
+// Checks the result of a HIP call. hipGetLastError() is sampled first, then:
+//  - if errval is not hipSuccess, the error string and the call site are
+//    printed to stderr and the process exits with errval as its exit status;
+//  - otherwise, if hipGetLastError() returned something other than errval,
+//    both values are printed to stderr and failed("") is called.
+// file and line identify the call site shown in the messages.
+static inline void hipCheckAndFail(hipError_t errval,
+                                   const char *file, int line) {
+  // Sampled before the checks below so it is taken on every call.
+  hipError_t last_err = hipGetLastError();
+  if (errval != hipSuccess) {
+    std::cerr << "hip error: " << hipGetErrorString(errval);
+    std::cerr << std::endl;
+    std::cerr << "    Location: " << file << ":" << line << std::endl;
+    exit(errval);
+  }
+  // Only reached with errval == hipSuccess, because the branch above exits.
+  if (last_err != errval) {
+    std::cerr << "Error: the return value of a function was not the same ";
+    std::cerr << "as the value returned by hipGetLastError()" << std::endl;
+    std::cerr << "    Location: " << file << ":" << line << std::endl;
+    std::cerr << "    Function returned: " << hipGetErrorString(errval);
+    std::cerr << " (" << errval << ")" << std::endl;
+    std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err);
+    std::cerr << " (" << last_err << ")" << std::endl;
+    failed("");
+  }
+}
+// Wraps hipCheckAndFail() and passes the caller's __FILE__ and __LINE__.
+#define hipCheckErr(errval)\
+        do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)
+
+// Reports whether device_id can run cooperative launches. Both the
+// hipDeviceAttributeCooperativeLaunch attribute and the cooperativeLaunch
+// field of the device properties must be non-zero.
+// Returns 1 when both are non-zero; otherwise prints the reason to stderr and
+// returns 0.
+static int cooperative_groups_support(int device_id) {
+  // Initialised so the check below never reads an indeterminate value.
+  int cooperative_attribute = 0;
+  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
+          hipDeviceAttributeCooperativeLaunch, device_id));
+  if (!cooperative_attribute) {
+    std::cerr << "Cooperative launch support not available in ";
+    std::cerr << "the device attribute for device " << device_id;
+    std::cerr << std::endl;
+    return 0;
+  }
+
+  hipDeviceProp_t device_properties;
+  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
+  if (device_properties.cooperativeLaunch == 0) {
+    std::cerr << "Cooperative group support not available in ";
+    std::cerr << "device properties." << std::endl;
+    return 0;
+  }
+  return 1;
+}
+
+// Recomputes on the host the add / mirrored-add sequence and compares the
+// result with the two arrays that were read back from the GPU.
+//   loops        - number of add / mirrored-add iterations to replay
+//   host_input   - initial contents of the first array; updated in place
+//   first_array  - first array as read back from the device
+//   second_array - second array as read back from the device
+//   array_len    - number of elements in each of the arrays
+// Returns 0 when both arrays match the host result, and -1 on the first
+// mismatch or when the host scratch buffer cannot be allocated. The scratch
+// buffer is released on every path.
+static int verify_coop_arrays(unsigned int loops, unsigned int *host_input,
+                              unsigned int *first_array,
+                              unsigned int *second_array,
+                              unsigned int array_len) {
+  unsigned int *host_first_array = host_input;
+  // Host copy of the second array; calloc gives it an all-zero start.
+  unsigned int *host_second_array = static_cast<unsigned int*>(
+      calloc(array_len, sizeof(unsigned int)));
+  // calloc may legitimately return a null pointer for a zero-length request,
+  // so only a null result for a non-empty array is treated as a failure.
+  if (host_second_array == nullptr && array_len != 0) {
+    std::cerr << "Test failure!" << std::endl;
+    std::cerr << "    Unable to allocate the host verification buffer";
+    std::cerr << std::endl;
+    return -1;
+  }
+
+  // loops and array_len are unsigned, so the indices are unsigned as well to
+  // avoid signed/unsigned comparisons.
+  for (unsigned int i = 0; i < loops; i++) {
+    for (unsigned int offset = 0; offset < array_len; offset++) {
+      host_second_array[offset] += host_first_array[offset];
+    }
+
+    for (unsigned int offset = 0; offset < array_len; offset++) {
+      unsigned int swizzle_offset = array_len - offset - 1;
+      host_first_array[offset] += host_second_array[swizzle_offset];
+    }
+  }
+
+  int ret_val = 0;
+  for (unsigned int i = 0; i < array_len; i++) {
+    if (host_first_array[i] != first_array[i]) {
+      std::cerr << "Test failure!" << std::endl;
+      std::cerr << "    host_first_array[" << i << "] contains the ";
+      std::cerr << "value " << host_first_array[i] << std::endl;
+      std::cerr << "    GPU first_array[" << i << "] contains the ";
+      std::cerr << "value " << first_array[i] << std::endl;
+      ret_val = -1;
+      break;
+    }
+    if (host_second_array[i] != second_array[i]) {
+      std::cerr << "Test failure!" << std::endl;
+      std::cerr << "    host_second_array[" << i << "] contains the ";
+      std::cerr << "value " << host_second_array[i] << std::endl;
+      std::cerr << "    GPU second_array[" << i << "] contains the ";
+      std::cerr << "value " << second_array[i] << std::endl;
+      ret_val = -1;
+      break;
+    }
+  }
+
+  if (ret_val == 0) {
+    std::cout << "Coop test appears to work properly!" << std::endl;
+  }
+  // Single exit point so the scratch buffer is freed on failure as well.
+  free(host_second_array);
+  return ret_val;
+}
+
+// Kernel exercised by this test. For `loops` iterations every thread:
+//   1) adds first_array[offset] into second_array[offset] for each offset it
+//      visits, starting at its rank within the grid and stepping by the
+//      grid size;
+//   2) calls grid.sync();
+//   3) adds second_array[array_len - offset - 1] (the "mirror" element) into
+//      first_array[offset] for the same set of offsets;
+//   4) calls grid.sync() again before the next iteration.
+// Both arrays are expected to hold at least array_len elements.
+__global__ void
+coop_kernel(unsigned int *first_array, unsigned int *second_array,
+            unsigned int loops, unsigned int array_len) {
+  cooperative_groups::grid_group grid = cooperative_groups::this_grid();
+  unsigned int rank = grid.thread_rank();
+  unsigned int grid_size = grid.size();
+
+  for (int i = 0; i < loops; i++) {
+    // The goal of this loop is to directly add in values from
+    // array one into array two, element by element.
+    for (int offset = rank; offset < array_len; offset += grid_size) {
+      second_array[offset] += first_array[offset];
+    }
+
+    // The mirrored reads below touch second_array elements that this thread
+    // did not necessarily write, hence the sync on the whole grid group.
+    grid.sync();
+
+    // The goal of this loop is to pull data from the "mirror" element in
+    // array two and add it back into array one. This causes inter-
+    // thread swizzling.
+    for (int offset = rank; offset < array_len; offset += grid_size) {
+      unsigned int swizzle_offset = array_len - offset - 1;
+      first_array[offset] += second_array[swizzle_offset];
+    }
+
+    // Sync again before the next iteration reads first_array.
+    grid.sync();
+  }
+}
+
+// Test driver. For every device reported by hipGetDeviceCount() it uploads an
+// input buffer, launches coop_kernel cooperatively on streams[0], copies both
+// device arrays back and checks them with verify_coop_arrays(). After the
+// device loop it calls passed() if every verification returned 0, otherwise
+// failed("").
+int main(int argc, char** argv) {
+  hipError_t err;
+  /*************************************************************************/
+  /* Test parameters *******************************************************/
+  // NOTE: argc/argv are not read anywhere in this function; loops and width
+  // keep the fixed values below. device_num receives the device count and
+  // flag is set to 1 as soon as one device fails verification.
+  int device_num = 0, loops = 2, width = 4096, flag = 0;
+  HIPCHECK(hipGetDeviceCount(&device_num));
+  for (int dev = 0; dev < device_num; ++dev) {
+    std::cout << "Device number: " << dev << std::endl;
+    std::cout << "Loops: " << loops << std::endl;
+    std::cout << "Width: " << width << std::endl;
+
+    /*************************************************************************/
+    /* Test whether target device supports cooperative groups ****************/
+    HIPCHECK(hipSetDevice(dev));
+
+    // NOTE(review): presumably passed() ends the process, in which case the
+    // remaining devices are not tested -- confirm against test_common.
+    if (!cooperative_groups_support(dev)) {
+      std::cout << "Skipping the test with Pass result.\n";
+      passed();
+    }
+
+    /*************************************************************************/
+    /* We will launch enough waves to fill up all of the GPU *****************/
+    hipDeviceProp_t device_properties;
+    HIPCHECK(hipGetDeviceProperties(&device_properties, dev));
+
+    int warp_size = device_properties.warpSize;
+    int num_sms = device_properties.multiProcessorCount;
+
+    std::cout << "Device name: " << device_properties.name << std::endl;
+    std::cout << std::endl;
+
+    // Calculate the device occupancy to know how many blocks can be run.
+    // The block size used for the query (and for the launch) is warp_size.
+    int max_blocks_per_sm;
+    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
+                                                          coop_kernel,
+                                                          warp_size, 0));
+
+    int total_blocks = max_blocks_per_sm * num_sms;
+
+    /*************************************************************************/
+    /* Create the streams we will use in this test. **************************/
+    // Two streams are created, but all work below is issued on streams[0].
+    hipStream_t streams[2];
+    for (int i = 0; i < 2; i++) {
+      HIPCHECK(hipStreamCreate(&streams[i]));
+    }
+
+    /*************************************************************************/
+    /* Set up data to pass into the kernel ***********************************/
+
+    // Allocate the host input buffer, and two device-focused buffers that we
+    // will use for our test. The input buffer is filled with 0..width-1.
+    unsigned int *input_buffer = (unsigned int*)calloc(width,
+                                                       sizeof(unsigned int));
+    for (int i = 0; i < width; i++) {
+      input_buffer[i] = i;
+    }
+
+    unsigned int *first_dev_array;
+    HIPCHECK(hipMalloc(reinterpret_cast<void**>(&first_dev_array),
+                       width * sizeof(unsigned int)));
+
+    HIPCHECK(hipMemcpyAsync(first_dev_array, input_buffer,
+                            width * sizeof(unsigned int),
+                            hipMemcpyHostToDevice, streams[0]));
+
+    // The second device array starts out zeroed.
+    unsigned int *second_dev_array;
+    HIPCHECK(hipMalloc(reinterpret_cast<void**>(&second_dev_array),
+                       width * sizeof(unsigned int)));
+    HIPCHECK(hipMemsetAsync(second_dev_array, 0, width * sizeof(unsigned int),
+                            streams[0]));
+
+    /*************************************************************************/
+    /* Launch the kernels ****************************************************/
+    std::cout << "Launching a cooperative kernel with " << total_blocks;
+    std::cout << " thread blocks, each with " << warp_size << " threads";
+    std::cout << std::endl;
+
+    // One pointer per kernel argument, in coop_kernel's parameter order.
+    // NOTE(review): loops and width are int here while coop_kernel declares
+    // them as unsigned int.
+    void *coop_params[4];
+    coop_params[0] = reinterpret_cast<void*>(&first_dev_array);
+    coop_params[1] = reinterpret_cast<void*>(&second_dev_array);
+    coop_params[2] = reinterpret_cast<void*>(&loops);
+    coop_params[3] = reinterpret_cast<void*>(&width);
+    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(coop_kernel),
+                                        total_blocks, warp_size, coop_params,
+                                        0, streams[0]));
+
+    /*************************************************************************/
+    /* Read back the buffers and print out their data ************************/
+    unsigned int *first_array = (unsigned int*)calloc(width,
+                                                      sizeof(unsigned int));
+    unsigned int *second_array = (unsigned int*)calloc(width,
+                                                       sizeof(unsigned int));
+    HIPCHECK(hipMemcpyAsync(first_array, first_dev_array,
+                            width * sizeof(unsigned int),
+                            hipMemcpyDeviceToHost, streams[0]));
+
+    HIPCHECK(hipMemcpyAsync(second_array, second_dev_array,
+                            width * sizeof(unsigned int),
+                            hipMemcpyDeviceToHost, streams[0]));
+
+    std::cout << "Waiting for cooperative work to finish..." << std::endl;
+    std::cout << std::flush;
+
+    // The copies above are asynchronous; wait before reading the host arrays.
+    HIPCHECK(hipStreamSynchronize(streams[0]));
+
+
+    int ret_val = 0;
+
+    std::cout << "Attemping to verify buffers." << std::endl;
+    std::cout << std::flush;
+    // verify_coop_arrays() updates input_buffer in place while it recomputes
+    // the expected result on the host.
+    ret_val = verify_coop_arrays(loops, input_buffer, first_array,
+                                 second_array, width);
+    if (!ret_val) {
+      std::cout << "It appears that inter-thread data sharing at ";
+      std::cout << "grid_group sync points works properly!" << std::endl;
+    } else {
+      flag = 1;
+    }
+    // Per-device cleanup; runs whether or not verification succeeded.
+    for (int k = 0; k < 2; ++k) {
+      HIPCHECK(hipStreamDestroy(streams[k]));
+    }
+    HIPCHECK(hipFree(first_dev_array));
+    HIPCHECK(hipFree(second_dev_array));
+    free(input_buffer);
+    free(first_array);
+    free(second_array);
+  }
+  if (!flag) {
+    passed();
+  } else {
+    failed("");
+  }
+}
@@ -22,7 +22,7 @@ THE SOFTWARE.


 /* HIT_START
- * BUILD: %t %s ../test_common.cpp
+ * BUILD: %t %s ../../test_common.cpp
 * TEST: %t
 * HIT_END
 */
@@ -139,7 +139,11 @@ int main()

  if (!deviceProperties.cooperativeLaunch) {
    std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
-    passed();
+    if (hip_skip_tests_enabled()) {
+      return hip_skip_retcode();
+    } else {
+      passed();
+    }
    return 0;
  }

@@ -22,7 +22,7 @@ THE SOFTWARE.


 /* HIT_START
- * BUILD: %t %s ../test_common.cpp
+ * BUILD: %t %s ../../test_common.cpp
 * TEST: %t
 * HIT_END
 */
@@ -139,7 +139,11 @@ int main()

  if (!deviceProperties.cooperativeLaunch) {
    std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
-    passed();
+    if (hip_skip_tests_enabled()) {
+      return hip_skip_retcode();
+    } else {
+      passed();
+    }
    return 0;
  }

@@ -22,7 +22,7 @@ THE SOFTWARE.


 /* HIT_START
- * BUILD: %t %s ../test_common.cpp
+ * BUILD: %t %s ../../test_common.cpp
 * TEST: %t
 * HIT_END
 */
@@ -139,7 +139,11 @@ int main()

  if (!deviceProperties.cooperativeLaunch) {
    std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
-    passed();
+    if (hip_skip_tests_enabled()) {
+      return hip_skip_retcode();
+    } else {
+      passed();
+    }
    return 0;
  }

@@ -22,7 +22,7 @@ THE SOFTWARE.


 /* HIT_START
- * BUILD: %t %s ../test_common.cpp
+ * BUILD: %t %s ../../test_common.cpp
 * TEST: %t
 * HIT_END
 */
@@ -34,6 +34,8 @@ THE SOFTWARE.
 #include <climits>

 #define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
+#define ASSERT_LE(lhs, rhs) assert(lhs <= rhs)
+#define ASSERT_GE(lhs, rhs) assert(lhs >= rhs)

 using namespace cooperative_groups;

@@ -193,15 +195,27 @@ static void test_cg_multi_grid_group_type(int blockSize)
  }

  // Validate results
+  int gridsSeen[MaxGPUs];
  for (int i = 0; i < nGpu; ++i) {
    for (int j = 0; j < 2 * blockSize; ++j) {
-      //ASSERT_EQUAL(numGridsTestH[i][j], nGpu);
-      //ASSERT_EQUAL(gridRankTestH[i][j], i);
+      ASSERT_EQUAL(numGridsTestH[i][j], nGpu);
+      ASSERT_GE(gridRankTestH[i][j], 0);
+      ASSERT_LE(gridRankTestH[i][j], nGpu-1);
+      ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
      ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
-      ASSERT_EQUAL(thdRankTestH[i][j], (i * 2 * blockSize) + j);
+      int gridRank = gridRankTestH[i][j];
+      ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
      ASSERT_EQUAL(isValidTestH[i][j], 1);
    }
    ASSERT_EQUAL(syncResultD[i+1],  2 * blockSize);
+
+    // Validate uniqueness property of grid rank
+    gridsSeen[i] = gridRankTestH[i][0];
+    for (int k = 0; k < i; ++k) {
+      if (gridsSeen[k] == gridsSeen[i]) {
+        assert (false && "Grid rank in multi-gpu setup should be unique");
+      }
+    }
  }
  ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);

@@ -22,7 +22,7 @@ THE SOFTWARE.


 /* HIT_START
- * BUILD: %t %s ../test_common.cpp
+ * BUILD: %t %s ../../test_common.cpp
 * TEST: %t
 * HIT_END
 */
@@ -34,11 +34,14 @@ THE SOFTWARE.
 #include <climits>

 #define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
+#define ASSERT_LE(lhs, rhs) assert(lhs <= rhs)
+#define ASSERT_GE(lhs, rhs) assert(lhs >= rhs)

 using namespace cooperative_groups;

 static __global__
 void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD,
+                                                   int* gridRankTestD,
                                                   int *thdRankTestD,
                                                   int *isValidTestD,
                                                   int *syncTestD,
@@ -51,6 +54,7 @@ void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD,
  sizeTestD[gIdx] = tg.size();

  // Test thread_rank
+  gridRankTestD[gIdx] = this_multi_grid().grid_rank();
  thdRankTestD[gIdx] = tg.thread_rank();

  // Test is_valid
@@ -110,6 +114,7 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
  // Allocate host and device memory
  int nBytes = sizeof(int) * 2 * blockSize;
  int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
+  int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
  int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
  int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
  int *syncTestD[MaxGPUs], *syncResultD;
@@ -117,11 +122,13 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
    ASSERT_EQUAL(hipSetDevice(i), hipSuccess);

    ASSERT_EQUAL(hipMalloc(&sizeTestD[i], nBytes), hipSuccess);
+    ASSERT_EQUAL(hipMalloc(&gridRankTestD[i], nBytes), hipSuccess);
    ASSERT_EQUAL(hipMalloc(&thdRankTestD[i], nBytes), hipSuccess);
    ASSERT_EQUAL(hipMalloc(&isValidTestD[i], nBytes), hipSuccess);
    ASSERT_EQUAL(hipMalloc(&syncTestD[i], nBytes), hipSuccess);

    ASSERT_EQUAL(hipHostMalloc(&sizeTestH[i], nBytes), hipSuccess);
+    ASSERT_EQUAL(hipHostMalloc(&gridRankTestH[i], nBytes), hipSuccess);
    ASSERT_EQUAL(hipHostMalloc(&thdRankTestH[i], nBytes), hipSuccess);
    ASSERT_EQUAL(hipHostMalloc(&isValidTestH[i], nBytes), hipSuccess);

@@ -135,17 +142,18 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
  }

  // Launch Kernel
-  constexpr int NumKernelArgs = 5;
+  constexpr int NumKernelArgs = 6;
  hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
  void* args[MaxGPUs * NumKernelArgs];
  for (int i = 0; i < nGpu; i++) {
    ASSERT_EQUAL(hipSetDevice(i), hipSuccess);

    args[i * NumKernelArgs    ] = &sizeTestD[i];
-    args[i * NumKernelArgs + 1] = &thdRankTestD[i];
-    args[i * NumKernelArgs + 2] = &isValidTestD[i];
-    args[i * NumKernelArgs + 3] = &syncTestD[i];
-    args[i * NumKernelArgs + 4] = &syncResultD;
+    args[i * NumKernelArgs + 1] = &gridRankTestD[i];
+    args[i * NumKernelArgs + 2] = &thdRankTestD[i];
+    args[i * NumKernelArgs + 3] = &isValidTestD[i];
+    args[i * NumKernelArgs + 4] = &syncTestD[i];
+    args[i * NumKernelArgs + 5] = &syncResultD;

    launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_base_type);
    launchParamsList[i].gridDim = 2;
@@ -164,6 +172,8 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)

    ASSERT_EQUAL(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost),
                 hipSuccess);
+    ASSERT_EQUAL(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost),
+                 hipSuccess);
    ASSERT_EQUAL(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost),
                 hipSuccess);
    ASSERT_EQUAL(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost),
@@ -173,13 +183,26 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
  }

  // Validate results
+  int gridsSeen[MaxGPUs];
  for (int i = 0; i < nGpu; ++i) {
    for (int j = 0; j < 2 * blockSize; ++j) {
      ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
-      ASSERT_EQUAL(thdRankTestH[i][j], (i * 2 * blockSize) + j);
+      ASSERT_GE(gridRankTestH[i][j], 0);
+      ASSERT_LE(gridRankTestH[i][j], nGpu-1);
+      ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
+      int gridRank = gridRankTestH[i][j];
+      ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
      ASSERT_EQUAL(isValidTestH[i][j], 1);
    }
    ASSERT_EQUAL(syncResultD[i+1],  2 * blockSize);
+
+    // Validate uniqueness property of grid rank
+    gridsSeen[i] = gridRankTestH[i][0];
+    for (int k = 0; k < i; ++k) {
+      if (gridsSeen[k] == gridsSeen[i]) {
+        assert (false && "Grid rank in multi-gpu setup should be unique");
+      }
+    }
  }
  ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);

@@ -189,6 +212,7 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
    ASSERT_EQUAL(hipSetDevice(i), hipSuccess);

    ASSERT_EQUAL(hipFree(sizeTestD[i]), hipSuccess);
+    ASSERT_EQUAL(hipFree(gridRankTestD[i]), hipSuccess);
    ASSERT_EQUAL(hipFree(thdRankTestD[i]), hipSuccess);
    ASSERT_EQUAL(hipFree(isValidTestD[i]), hipSuccess);
    ASSERT_EQUAL(hipFree(syncTestD[i]), hipSuccess);
@@ -197,6 +221,7 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
      ASSERT_EQUAL(hipFree(syncResultD), hipSuccess);

    ASSERT_EQUAL(hipHostFree(sizeTestH[i]), hipSuccess);
+    ASSERT_EQUAL(hipHostFree(gridRankTestH[i]), hipSuccess);
    ASSERT_EQUAL(hipHostFree(thdRankTestH[i]), hipSuccess);
    ASSERT_EQUAL(hipHostFree(isValidTestH[i]), hipSuccess);

@@ -22,7 +22,7 @@ THE SOFTWARE.


 /* HIT_START
- * BUILD: %t %s ../test_common.cpp
+ * BUILD: %t %s ../../test_common.cpp
 * TEST: %t
 * HIT_END
 */
@@ -34,11 +34,14 @@ THE SOFTWARE.
 #include <climits>

 #define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
+#define ASSERT_LE(lhs, rhs) assert(lhs <= rhs)
+#define ASSERT_GE(lhs, rhs) assert(lhs >= rhs)

 using namespace cooperative_groups;

 static __global__
 void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD,
+                                                    int* gridRankTestD,
                                                    int *thdRankTestD,
                                                    int *isValidTestD,
                                                    int *syncTestD,
@@ -51,6 +54,7 @@ void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD,
  sizeTestD[gIdx] = group_size(mg);

  // Test thread_rank api
+  gridRankTestD[gIdx] = this_multi_grid().grid_rank();
  thdRankTestD[gIdx] = thread_rank(mg);

  // Test is_valid api
@@ -110,6 +114,7 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
  // Allocate host and device memory
  int nBytes = sizeof(int) * 2 * blockSize;
  int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
+  int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
  int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
  int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
  int *syncTestD[MaxGPUs], *syncResultD;
@@ -117,11 +122,13 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
    ASSERT_EQUAL(hipSetDevice(i), hipSuccess);

    ASSERT_EQUAL(hipMalloc(&sizeTestD[i], nBytes), hipSuccess);
+    ASSERT_EQUAL(hipMalloc(&gridRankTestD[i], nBytes), hipSuccess);
    ASSERT_EQUAL(hipMalloc(&thdRankTestD[i], nBytes), hipSuccess);
    ASSERT_EQUAL(hipMalloc(&isValidTestD[i], nBytes), hipSuccess);
    ASSERT_EQUAL(hipMalloc(&syncTestD[i], nBytes), hipSuccess);

    ASSERT_EQUAL(hipHostMalloc(&sizeTestH[i], nBytes), hipSuccess);
+    ASSERT_EQUAL(hipHostMalloc(&gridRankTestH[i], nBytes), hipSuccess);
    ASSERT_EQUAL(hipHostMalloc(&thdRankTestH[i], nBytes), hipSuccess);
    ASSERT_EQUAL(hipHostMalloc(&isValidTestH[i], nBytes), hipSuccess);

@@ -135,17 +142,18 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
  }

  // Launch Kernel
-  constexpr int NumKernelArgs = 5;
+  constexpr int NumKernelArgs = 6;
  hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
  void* args[MaxGPUs * NumKernelArgs];
  for (int i = 0; i < nGpu; i++) {
    ASSERT_EQUAL(hipSetDevice(i), hipSuccess);

    args[i * NumKernelArgs    ] = &sizeTestD[i];
-    args[i * NumKernelArgs + 1] = &thdRankTestD[i];
-    args[i * NumKernelArgs + 2] = &isValidTestD[i];
-    args[i * NumKernelArgs + 3] = &syncTestD[i];
-    args[i * NumKernelArgs + 4] = &syncResultD;
+    args[i * NumKernelArgs + 1] = &gridRankTestD[i];
+    args[i * NumKernelArgs + 2] = &thdRankTestD[i];
+    args[i * NumKernelArgs + 3] = &isValidTestD[i];
+    args[i * NumKernelArgs + 4] = &syncTestD[i];
+    args[i * NumKernelArgs + 5] = &syncResultD;

    launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_public_api);
    launchParamsList[i].gridDim = 2;
@@ -164,6 +172,8 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)

    ASSERT_EQUAL(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost),
                 hipSuccess);
+    ASSERT_EQUAL(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost),
+                 hipSuccess);
    ASSERT_EQUAL(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost),
                 hipSuccess);
    ASSERT_EQUAL(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost),
@@ -173,13 +183,26 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
  }

  // Validate results
+  int gridsSeen[MaxGPUs];
  for (int i = 0; i < nGpu; ++i) {
    for (int j = 0; j < 2 * blockSize; ++j) {
      ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
-      ASSERT_EQUAL(thdRankTestH[i][j], (i * 2 * blockSize) + j);
+      ASSERT_GE(gridRankTestH[i][j], 0);
+      ASSERT_LE(gridRankTestH[i][j], nGpu-1);
+      ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
+      int gridRank = gridRankTestH[i][j];
+      ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
      ASSERT_EQUAL(isValidTestH[i][j], 1);
    }
    ASSERT_EQUAL(syncResultD[i+1],  2 * blockSize);
+
+    // Validate uniqueness property of grid rank
+    gridsSeen[i] = gridRankTestH[i][0];
+    for (int k = 0; k < i; ++k) {
+      if (gridsSeen[k] == gridsSeen[i]) {
+        assert (false && "Grid rank in multi-gpu setup should be unique");
+      }
+    }
  }
  ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);

@@ -189,6 +212,7 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
    ASSERT_EQUAL(hipSetDevice(i), hipSuccess);

    ASSERT_EQUAL(hipFree(sizeTestD[i]), hipSuccess);
+    ASSERT_EQUAL(hipFree(gridRankTestD[i]), hipSuccess);
    ASSERT_EQUAL(hipFree(thdRankTestD[i]), hipSuccess);
    ASSERT_EQUAL(hipFree(isValidTestD[i]), hipSuccess);
    ASSERT_EQUAL(hipFree(syncTestD[i]), hipSuccess);
@@ -197,6 +221,7 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
      ASSERT_EQUAL(hipFree(syncResultD), hipSuccess);

    ASSERT_EQUAL(hipHostFree(sizeTestH[i]), hipSuccess);
+    ASSERT_EQUAL(hipHostFree(gridRankTestH[i]), hipSuccess);
    ASSERT_EQUAL(hipHostFree(thdRankTestH[i]), hipSuccess);
    ASSERT_EQUAL(hipHostFree(isValidTestH[i]), hipSuccess);

@@ -22,7 +22,7 @@ THE SOFTWARE.


 /* HIT_START
- * BUILD: %t %s ../test_common.cpp
+ * BUILD: %t %s ../../test_common.cpp
 * TEST: %t
 * HIT_END
 */
@@ -166,6 +166,16 @@ int main()
  ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
  int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;

+  if (!deviceProperties.cooperativeLaunch) {
+    std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
+    if (hip_skip_tests_enabled()) {
+      return hip_skip_retcode();
+    } else {
+      passed();
+    }
+    return 0;
+  }
+
  // Test block sizes which are powers of 2
  int i = 0;
  while (true) {
@@ -22,7 +22,7 @@ THE SOFTWARE.


 /* HIT_START
- * BUILD: %t %s ../test_common.cpp
+ * BUILD: %t %s ../../test_common.cpp
 * TEST: %t
 * HIT_END
 */
@@ -135,6 +135,16 @@ int main()
  ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
  int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;

+  if (!deviceProperties.cooperativeLaunch) {
+    std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
+    if (hip_skip_tests_enabled()) {
+      return hip_skip_retcode();
+    } else {
+      passed();
+    }
+    return 0;
+  }
+
  // Test block sizes which are powers of 2
  int i = 0;
  while (true) {
@@ -22,7 +22,7 @@ THE SOFTWARE.


 /* HIT_START
- * BUILD: %t %s ../test_common.cpp
+ * BUILD: %t %s ../../test_common.cpp
 * TEST: %t
 * HIT_END
 */
@@ -135,6 +135,16 @@ int main()
  ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
  int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;

+  if (!deviceProperties.cooperativeLaunch) {
+    std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
+    if (hip_skip_tests_enabled()) {
+      return hip_skip_retcode();
+    } else {
+      passed();
+    }
+    return 0;
+  }
+
  // Test block sizes which are powers of 2
  int i = 0;
  while (true) {
@@ -20,7 +20,7 @@ THE SOFTWARE.
 // Simple test for hipLaunchCooperativeKernelMultiDevice API.

 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60
 * TEST: %t
 * HIT_END
 */
@@ -22,15 +22,14 @@ THE SOFTWARE.
 // Simple test for hipLaunchCooperativeKernel API.

 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
 * TEST: %t
 * HIT_END
 */

 #include "hip/hip_runtime.h"
 #include "hip/hip_runtime_api.h"
-#include "hip/hcc_detail/device_library_decls.h"
-#include "hip/hcc_detail/hip_cooperative_groups.h"
+#include "hip/hip_cooperative_groups.h"
 #include <iostream>
 #include <chrono>
 #include "test_common.h"
@@ -129,7 +128,7 @@ int main() {
    params[3] = (void*)&dC;

    std::cout << "Testing with grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n";
-    HIPCHECK(hipLaunchCooperativeKernel(test_gws, dimGrid, dimBlock, params, dimBlock.x * sizeof(long), stream));
+    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_gws), dimGrid, dimBlock, params, dimBlock.x * sizeof(long), stream));

    HIPCHECK(hipMemcpy(init, dC, sizeof(long), hipMemcpyDeviceToHost));

@@ -0,0 +1,568 @@
+/*
+Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+// Test Description:
+/*The general idea of the application is to test how Cooperative Groups kernel
+launches work when launching too many warps to multiple target devices. This
+tests the following failure modes for hipLaunchCooperativeKernelMultiDevice:
+  1) Do not launch more warps to any device than can fit on that device
+  2) All device targets for the multi-device launch function must be different
+  3) All streams must be explicit (non-NULL)
+  4) The kernels sent in must be identical between devices
+  5) The grid and block sizes must be identical between devices
+  6) The block dimensions must be non-zero
+  7) The dynamic shared memory size must be identical between devices.
+
+This test ensures that the proper error conditions are returned, even if the
+target kernel does not actually use any of the cooperative groups features.
+
+Note that tests 4, 5, and 7 only hold on Nvidia GPUs. AMD GPUs running ROCm
+do not have these constraints. As such, the test checks to see whether they
+should fail or succeed and compares this to what actually happens.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_cooperative_groups.h>
+#include "test_common.h"
+
+// Checks the status of a HIP call and fails the test when it is not
+// hipSuccess, or when it disagrees with what hipGetLastError() reports.
+//   errval - status returned by the HIP call under test
+//   file   - source file of the call site (printed in diagnostics)
+//   line   - source line of the call site (printed in diagnostics)
+static inline void hipCheckAndFail(hipError_t errval,
+                                   const char *file, int line) {
+  // Read the runtime's last error before any other HIP call is issued here.
+  const hipError_t runtime_err = hipGetLastError();
+
+  if (hipSuccess != errval) {
+    std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl;
+    std::cerr << "    Location: " << file << ":" << line << std::endl;
+    failed("");
+  }
+
+  if (errval != runtime_err) {
+    std::cerr << "Error: the return value of a function was not the same "
+              << "as the value returned by hipGetLastError()" << std::endl;
+    std::cerr << "    Location: " << file << ":" << line << std::endl;
+    std::cerr << "    Function returned: " << hipGetErrorString(errval)
+              << " (" << errval << ")" << std::endl;
+    std::cerr << "hipGetLastError() returned: "
+              << hipGetErrorString(runtime_err)
+              << " (" << runtime_err << ")" << std::endl;
+    failed("");
+  }
+}
+// Convenience wrapper that forwards the call site's file and line.
+#define hipCheckErr(errval) \
+  do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)
+
+// Reports whether `device_id` advertises cooperative launch and multi-device
+// cooperative launch, checking both the device attributes and the matching
+// hipDeviceProp_t fields.
+// Returns 1 when all four checks pass; otherwise writes the first missing
+// capability to stderr and returns 0.
+static int cooperative_groups_support(int device_id) {
+  int cooperative_attribute;
+  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
+           hipDeviceAttributeCooperativeLaunch, device_id));
+  if (!cooperative_attribute) {
+    std::cerr << "Cooperative launch support not available in ";
+    std::cerr << "the device attribute for device " << device_id;
+    std::cerr << std::endl;
+    return 0;
+  }
+
+  int multi_gpu_cooperative_attribute;
+  HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute,
+           hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id));
+
+  if (!multi_gpu_cooperative_attribute) {
+    std::cerr << "Multi-GPU cooperative launch support not available in ";
+    std::cerr << "the device attribute for device " << device_id;
+    std::cerr << std::endl;
+    return 0;
+  }
+
+  // The same capabilities are also exposed through the properties struct;
+  // both views must agree for the device to be accepted.
+  hipDeviceProp_t device_properties;
+  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
+  if (device_properties.cooperativeLaunch == 0) {
+    std::cerr << "Cooperative group support not available in ";
+    std::cerr << "device properties." << std::endl;
+    return 0;
+  }
+  if (device_properties.cooperativeMultiDeviceLaunch == 0) {
+    std::cerr << "Multi-GPU cooperative group support not available in ";
+    std::cerr << "device properties." << std::endl;
+    return 0;
+  }
+  return 1;
+}
+
+// Reports whether `device_id` allows a multi-device cooperative launch to use
+// a different kernel function on each device.
+// Returns 1 only when both the device attribute and the hipDeviceProp_t field
+// cooperativeMultiDeviceUnmatchedFunc are non-zero; otherwise 0.
+static int support_for_separate_kernels(int device_id) {
+  int separate_kernel_supported;
+  HIPCHECK(hipDeviceGetAttribute(&separate_kernel_supported,
+           hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc,
+           device_id));
+  if (!separate_kernel_supported) {
+    return 0;
+  }
+
+  hipDeviceProp_t device_properties;
+  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
+  if (device_properties.cooperativeMultiDeviceUnmatchedFunc == 0) {
+    return 0;
+  }
+  return 1;
+}
+
+// Reports whether `device_id` allows a multi-device cooperative launch to use
+// a different grid dimension on each device.
+// Returns 1 only when both the device attribute and the hipDeviceProp_t field
+// cooperativeMultiDeviceUnmatchedGridDim are non-zero; otherwise 0.
+static int support_for_separate_grid_sizes(int device_id) {
+  int separate_sizes_supported;
+  HIPCHECK(hipDeviceGetAttribute(&separate_sizes_supported,
+           hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim,
+           device_id));
+  if (!separate_sizes_supported) {
+    return 0;
+  }
+
+  hipDeviceProp_t device_properties;
+  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
+  if (device_properties.cooperativeMultiDeviceUnmatchedGridDim == 0) {
+    return 0;
+  }
+  return 1;
+}
+
+// Reports whether `device_id` allows a multi-device cooperative launch to use
+// a different block dimension on each device.
+// Returns 1 only when both the device attribute and the hipDeviceProp_t field
+// cooperativeMultiDeviceUnmatchedBlockDim are non-zero; otherwise 0.
+static int support_for_separate_block_dims(int device_id) {
+  int separate_sizes_supported;
+  HIPCHECK(hipDeviceGetAttribute(&separate_sizes_supported,
+           hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim,
+           device_id));
+  if (!separate_sizes_supported) {
+    return 0;
+  }
+
+  hipDeviceProp_t device_properties;
+  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
+  if (device_properties.cooperativeMultiDeviceUnmatchedBlockDim == 0) {
+    return 0;
+  }
+  return 1;
+}
+
+// Reports whether `device_id` allows a multi-device cooperative launch to use
+// a different dynamic shared memory size on each device.
+// Returns 1 only when both the device attribute and the hipDeviceProp_t field
+// cooperativeMultiDeviceUnmatchedSharedMem are non-zero; otherwise 0.
+static int support_for_separate_shared_sizes(int device_id) {
+  int separate_sizes_supported;
+  HIPCHECK(hipDeviceGetAttribute(&separate_sizes_supported,
+           hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem,
+           device_id));
+  if (!separate_sizes_supported) {
+    return 0;
+  }
+
+  hipDeviceProp_t device_properties;
+  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
+  if (device_properties.cooperativeMultiDeviceUnmatchedSharedMem == 0) {
+    return 0;
+  }
+  return 1;
+}
+
+// Kernel used as the launch target: each thread adds the current clock64()
+// reading to the array element at its flattened x-dimension index.
+__global__ void test_kernel(long long *array) {
+    const unsigned int block_base = blockIdx.x * blockDim.x;
+    const unsigned int slot = block_base + threadIdx.x;
+    array[slot] += clock64();
+}
+
+// Second kernel with the same body as test_kernel; it exists so the test can
+// hand a different function pointer to each device.
+__global__ void second_test_kernel(long long *array) {
+    const unsigned int first_in_block = blockIdx.x * blockDim.x;
+    const unsigned int element = first_in_block + threadIdx.x;
+    array[element] += clock64();
+}
+
+// Logs the outcome of one launch scenario and reports whether the status
+// returned by hipLaunchCooperativeKernelMultiDevice matched the expectation.
+//   err           - status actually returned by the launch
+//   expected      - status the scenario is required to produce
+//   expected_name - spelling of `expected` used in the log output
+//   scenario      - text completing the sentence "Tried to launch a
+//                   multi-GPU cooperative kernel ..."
+// Returns true when err == expected, false otherwise.
+static bool check_launch_status(hipError_t err, hipError_t expected,
+                                const char *expected_name,
+                                const char *scenario) {
+  if (err == expected) {
+    std::cout << "\tProperly saw this return ";
+    std::cout << expected_name << std::endl;
+    return true;
+  }
+  std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
+  std::cerr << scenario << std::endl;
+  if (expected == hipSuccess) {
+    std::cerr << "This SHOULD have succeeded with hipSuccess (";
+  } else {
+    std::cerr << "This SHOULD have failed with the error ";
+    std::cerr << expected_name << " (";
+  }
+  std::cerr << expected << ")." << std::endl;
+  std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
+  std::cerr << " (" << err << ")" << std::endl;
+  return false;
+}
+
+// Checks a scenario whose expected status depends on a device capability:
+// hipSuccess when `supported` is true, hipErrorInvalidValue otherwise.
+// Returns true when the launch status matched that expectation.
+static bool check_optional_launch(hipError_t err, bool supported,
+                                  const char *scenario) {
+  if (supported) {
+    return check_launch_status(err, hipSuccess, "hipSuccess", scenario);
+  }
+  return check_launch_status(err, hipErrorInvalidValue,
+                             "hipErrorInvalidValue", scenario);
+}
+
+// Runs the negative multi-device cooperative launch scenarios on every pair
+// of adjacent devices (dev, dev + 1). Any scenario whose launch status does
+// not match the expectation sets FailFlag; the pair's resources are still
+// released before the loop stops and the test is reported as failed.
+// argc/argv are accepted but not used.
+int main(int argc, char** argv) {
+  hipError_t err;
+  int device_num, FailFlag = 0;
+  HIPCHECK(hipGetDeviceCount(&device_num));
+  if (device_num < 2) {
+    std::cout << "This test requires atleast two gpus but the system has ";
+    std::cout << " only "<< device_num <<std::endl;
+    std::cout << "The test is skipping with Pass result" << std::endl;
+    // NOTE(review): presumably passed() ends the process; confirm in
+    // test_common.h.
+    passed();
+  }
+  for (int dev = 0; dev < (device_num-1); ++dev) {
+    std::cout << "First device number: " << dev << std::endl;
+    std::cout << "Second device number: " << (dev + 1) << std::endl;
+
+    /*************************************************************************/
+    /* Test whether target devices support cooperative groups ****************/
+    for (int i = 0; i < 2; i++) {
+      if (!cooperative_groups_support((dev + i))) {
+        std::cout << "Skipping the test with Pass result.\n";
+        passed();
+      }
+    }
+
+    /*************************************************************************/
+    /* We will try to launch more waves than the GPUs can fit. ***************/
+    // Use the smaller warp size and multiprocessor count of the two devices
+    // so that the computed launch geometry is valid for both of them.
+    hipDeviceProp_t device_properties[2];
+    int warp_size = INT_MAX;
+    int num_sm = INT_MAX;
+    for (int i = 0; i < 2; i++) {
+      HIPCHECK(hipGetDeviceProperties(&device_properties[i], (dev + i)));
+      if (device_properties[i].warpSize < warp_size) {
+        warp_size = device_properties[i].warpSize;
+      }
+      if (device_properties[i].multiProcessorCount < num_sm) {
+        num_sm = device_properties[i].multiProcessorCount;
+      }
+      std::cout << "Device " << (dev + i);
+      std::cout << " name: " << device_properties[i].name << std::endl;
+    }
+    std::cout << std::endl;
+
+    // Calculate the device occupancy to know how many blocks can be run.
+    int max_blocks_per_sm = INT_MAX;
+    for (int i = 0; i < 2; i++) {
+      int blocks_per_sm = 0;
+      HIPCHECK(hipSetDevice((dev + i)));
+      HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
+               &blocks_per_sm, test_kernel, warp_size, 0));
+      if (blocks_per_sm < max_blocks_per_sm) {
+        max_blocks_per_sm = blocks_per_sm;
+      }
+    }
+
+    int desired_blocks = max_blocks_per_sm * num_sm;
+
+    /*************************************************************************/
+    /* Create the streams we will use in this test. **************************/
+    hipStream_t streams[2];
+    for (int i = 0; i < 2; i++) {
+      HIPCHECK(hipSetDevice((dev + i)));
+      HIPCHECK(hipStreamCreate(&streams[i]));
+    }
+
+    /*************************************************************************/
+    /* Set up data to pass into the kernel ***********************************/
+
+    // Allocate two device buffers per GPU: one sized for the largest launch
+    // that fits ("good") and one sized for the oversubscribed launch ("bad").
+    // The element type matches the kernels' `long long *` parameter.
+    long long *good_dev_array[2];
+    long long *bad_dev_array[2];
+    const size_t good_size =
+        static_cast<size_t>(desired_blocks) * warp_size * sizeof(long long);
+    const size_t bad_size = 2 * good_size;
+    for (int i = 0; i < 2; i++) {
+      HIPCHECK(hipSetDevice((dev + i)));
+      HIPCHECK(hipMalloc(reinterpret_cast<void**>(&good_dev_array[i]),
+                         good_size));
+      HIPCHECK(hipMemsetAsync(good_dev_array[i], 0, good_size, streams[i]));
+      HIPCHECK(hipMalloc(reinterpret_cast<void**>(&bad_dev_array[i]),
+                         bad_size));
+      HIPCHECK(hipMemsetAsync(bad_dev_array[i], 0, bad_size, streams[i]));
+    }
+    // NOTE(review): this waits on the device made current by the last
+    // hipSetDevice above (dev + 1) - confirm that is sufficient for the
+    // memset queued on the first device's stream.
+    HIPCHECK(hipDeviceSynchronize());
+
+    /*************************************************************************/
+    /* Launch the kernels ****************************************************/
+    std::cout << "Launching a multi-GPU cooperative kernel with too many ";
+    std::cout << "warps..." << std::endl;
+
+    void *dev_params[2][1];
+    hipLaunchParams md_params[2];
+    for (int i = 0; i < 2; i++) {
+      dev_params[i][0] = reinterpret_cast<void*>(&bad_dev_array[i]);
+
+      md_params[i].func = reinterpret_cast<void*>(test_kernel);
+      md_params[i].gridDim = 2 * desired_blocks;
+      md_params[i].blockDim = warp_size;
+      md_params[i].sharedMem = 0;
+      md_params[i].stream = streams[i];
+      md_params[i].args = dev_params[i];
+    }
+
+    err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
+    if (!check_launch_status(err, hipErrorCooperativeLaunchTooLarge,
+                             "hipErrorCooperativeLaunchTooLarge",
+                             "with too many warps.")) {
+      FailFlag = 1;
+    }
+    HIPCHECK(hipDeviceSynchronize());
+
+    std::cout << "Launching a multi-GPU cooperative kernel to the same ";
+    std::cout << "device twice..." << std::endl;
+    for (int i = 0; i < 2; i++) {
+      dev_params[i][0] = reinterpret_cast<void*>(&good_dev_array[i]);
+      md_params[i].gridDim = desired_blocks;
+      // Both entries use the first device's stream on purpose.
+      md_params[i].stream = streams[0];
+    }
+    err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
+    if (!check_launch_status(err, hipErrorInvalidDevice,
+                             "hipErrorInvalidDevice",
+                             "to the same device twice.")) {
+      FailFlag = 1;
+    }
+    HIPCHECK(hipDeviceSynchronize());
+
+    std::cout << "Launching a multi-GPU cooperative kernel to the NULL ";
+    std::cout << "stream" << std::endl;
+    for (int i = 0; i < 2; i++) {
+      md_params[i].stream = NULL;
+    }
+    err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
+    if (!check_launch_status(err, hipErrorInvalidResourceHandle,
+                             "hipErrorInvalidResourceHandle",
+                             "to the NULL stream.")) {
+      FailFlag = 1;
+    }
+    HIPCHECK(hipDeviceSynchronize());
+
+    std::cout << "Launching a multi-GPU cooperative kernel with two ";
+    std::cout << "different kernels." << std::endl;
+    bool supports_sep_kernels = true;
+    for (int i = 0; i < 2; i++) {
+      md_params[i].stream = streams[i];
+      if (!support_for_separate_kernels((dev + i))) {
+        supports_sep_kernels = false;
+      }
+    }
+    md_params[1].func = reinterpret_cast<void*>(second_test_kernel);
+    err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
+    if (!check_optional_launch(err, supports_sep_kernels,
+                               "with two different kernels.")) {
+      FailFlag = 1;
+    }
+    HIPCHECK(hipDeviceSynchronize());
+
+    std::cout << "Launching a multi-GPU cooperative kernel with two ";
+    std::cout << "different grid sizes." << std::endl;
+    bool supports_sep_grids = true;
+    for (int i = 0; i < 2; i++) {
+      md_params[i].func = reinterpret_cast<void*>(test_kernel);
+      md_params[i].gridDim = i+1;
+      if (!support_for_separate_grid_sizes((dev + i))) {
+        supports_sep_grids = false;
+      }
+    }
+    err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
+    if (!check_optional_launch(err, supports_sep_grids,
+                               "with two different grid sizes.")) {
+      FailFlag = 1;
+    }
+    HIPCHECK(hipDeviceSynchronize());
+
+    std::cout << "Launching a multi-GPU cooperative kernel with two ";
+    std::cout << "different block dimensions." << std::endl;
+    bool supports_sep_blocks = true;
+    for (int i = 0; i < 2; i++) {
+      md_params[i].gridDim = desired_blocks;
+      md_params[i].blockDim = i+1;
+      if (!support_for_separate_block_dims((dev + i))) {
+        supports_sep_blocks = false;
+      }
+    }
+    err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
+    if (!check_optional_launch(err, supports_sep_blocks,
+                               "with two different block dimensions.")) {
+      FailFlag = 1;
+    }
+    HIPCHECK(hipDeviceSynchronize());
+
+    std::cout << "Launching a multi-GPU cooperative kernel with block ";
+    std::cout << "dimensions of zero." << std::endl;
+    for (int i = 0; i < 2; i++) {
+      md_params[i].blockDim = 0;
+    }
+    err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
+    if (!check_launch_status(err, hipErrorInvalidConfiguration,
+                             "hipErrorInvalidConfiguration",
+                             "with block dimensions of zero.")) {
+      FailFlag = 1;
+    }
+    HIPCHECK(hipDeviceSynchronize());
+
+    std::cout << "Launching a multi-GPU cooperative kernel with two ";
+    std::cout << "different shared memory sizes." << std::endl;
+    bool supports_sep_shared = true;
+    for (int i = 0; i < 2; i++) {
+      md_params[i].blockDim = warp_size;
+      md_params[i].sharedMem = i;
+      if (!support_for_separate_shared_sizes((dev + i))) {
+        supports_sep_shared = false;
+      }
+    }
+    err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
+    if (!check_optional_launch(err, supports_sep_shared,
+                               "with two different shared memory sizes.")) {
+      FailFlag = 1;
+    }
+    HIPCHECK(hipDeviceSynchronize());
+
+    // Final positive case: identical, in-range parameters on both devices.
+    std::cout << "Launching a multi-GPU cooperative kernel with maximum ";
+    std::cout << "number of warps..." << std::endl;
+    for (int i = 0; i < 2; i++) {
+      md_params[i].sharedMem = 0;
+    }
+    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
+    std::cout << "\tProperly launched." << std::endl;
+
+    HIPCHECK(hipDeviceSynchronize());
+    for (int m = 0; m < 2; ++m) {
+      HIPCHECK(hipFree(good_dev_array[m]));
+      HIPCHECK(hipFree(bad_dev_array[m]));
+      HIPCHECK(hipStreamDestroy(streams[m]));
+    }
+    // Stop at the first failing device pair, after its resources are freed.
+    if (FailFlag == 1) {
+      break;
+    }
+  }
+  if (FailFlag == 1) {
+    failed("");
+  } else {
+    passed();
+  }
+}
@@ -0,0 +1,581 @@
+/*
+Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+// Test Description:
+/*The general idea of the application is to test how multi-GPU Cooperative
+Groups kernel launches to a stream interact with other things that may be
+simultaneously running in the same streams.
+
+The HIP specification says that a multi-GPU cooperative launch will wait
+until all of the streams it's using finish their work. Only then will the
+cooperative kernel be launched to all of the devices. Then no other work
+can take place in any of the streams until all of the multi-GPU
+cooperative work is done.
+
+However, there are flags that allow you to disable each of these
+serialization points: hipCooperativeLaunchMultiDeviceNoPreSync and
+hipCooperativeLaunchMultiDeviceNoPostSync.
+
+As such, this benchmark tests the following five situations launching
+to two GPUs (and thus two streams):
+
+    1. Normal multi-GPU cooperative kernel:
+        This should result in the following pattern:
+        Stream 0: Cooperative
+        Stream 1: Cooperative
+    2. Regular kernel launches and multi-GPU cooperative kernel launches
+       with the default flags, resulting in the following pattern:
+        Stream 0: Regular --> Cooperative
+        Stream 1:         --> Cooperative --> Regular
+
+    3. Regular kernel launches and multi-GPU cooperative kernel launches
+       that turn off "pre-sync". This should allow a cooperative kernel
+       to launch even if work is already in a stream pointing to
+       another GPU.
+        This should result in the following pattern:
+        Stream 0: Regular --> Cooperative
+        Stream 1: Cooperative            --> Regular
+
+    4. Regular kernel launches and multi-GPU cooperative kernel launches
+       that turn off "post-sync". This should allow a new kernel to enter
+       a GPU even if another GPU still has a cooperative kernel on it.
+        This should result in the following pattern:
+        Stream 0: Regular --> Cooperative
+        Stream 1:         --> Cooperative--> Regular
+
+    5. Regular kernel launches and multi-GPU cooperative kernel launches
+       that turn off both pre- and post-sync. This should allow any of
+       the kernels to launch to their GPU regardless of the status of
+       other kernels in other multi-GPU stream groups.
+        This should result in the following pattern:
+        Stream 0: Regular --> Cooperative
+        Stream 1: Cooperative --> Regular
+
+We time how long it takes to run each of these benchmarks and print it as
+the output of the benchmark. The kernels themselves are just useless time-
+wasting code so that the kernel takes a meaningful amount of time on the
+GPU before it exits. We only launch a single wavefront for each kernel, so
+any serialization should not be because of GPU occupancy concerns.
+
+If tests 2, 3, and 4 take roughly 3x as long as #1, that implies that
+cooperative kernels are serialized as expected.
+
+If test #5 takes roughly twice as long as #1, that implies that the
+overlap-allowing flags work as expected.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <chrono>
+#include <hip/hip_runtime.h>
+#include <hip/hip_cooperative_groups.h>
+#include "test_common.h"
+
+// Fails the test when a HIP call did not return hipSuccess, or when its
+// return value differs from the value reported by hipGetLastError().
+//   errval - status returned by the HIP call under test
+//   file   - source file of the call site (printed in diagnostics)
+//   line   - source line of the call site (printed in diagnostics)
+static inline void hipCheckAndFail(hipError_t errval,
+                                   const char *file, int line) {
+  // Capture the runtime's last error up front, before further HIP calls.
+  const hipError_t reported = hipGetLastError();
+
+  if (hipSuccess != errval) {
+    std::cerr << "hip error: " << hipGetErrorString(errval) << std::endl;
+    std::cerr << "    Location: " << file << ":" << line << std::endl;
+    failed("");
+  }
+
+  if (errval != reported) {
+    std::cerr << "Error: the return value of a function was not the same "
+              << "as the value returned by hipGetLastError()" << std::endl;
+    std::cerr << "    Location: " << file << ":" << line << std::endl;
+    std::cerr << "    Function returned: " << hipGetErrorString(errval)
+              << " (" << errval << ")" << std::endl;
+    std::cerr << "hipGetLastError() returned: "
+              << hipGetErrorString(reported)
+              << " (" << reported << ")" << std::endl;
+    failed("");
+  }
+}
+// Convenience wrapper that forwards the call site's file and line.
+#define hipCheckErr(errval) \
+  do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)
+
+// Reports whether `device_id` advertises cooperative launch and multi-device
+// cooperative launch, checking both the device attributes and the matching
+// hipDeviceProp_t fields.
+// Returns 1 when all four checks pass; otherwise writes the first missing
+// capability to stderr and returns 0.
+static int cooperative_groups_support(int device_id) {
+  int cooperative_attribute;
+  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
+          hipDeviceAttributeCooperativeLaunch, device_id));
+  if (!cooperative_attribute) {
+    std::cerr << "Cooperative launch support not available in ";
+    std::cerr << "the device attribute for device " << device_id;
+    std::cerr << std::endl;
+    return 0;
+  }
+
+  int multi_gpu_cooperative_attribute;
+  HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute,
+           hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id));
+  if (!multi_gpu_cooperative_attribute) {
+    std::cerr << "Multi-GPU cooperative launch support not available in ";
+    std::cerr << "the device attribute for device " << device_id;
+    std::cerr << std::endl;
+    return 0;
+  }
+
+  // The same capabilities are also exposed through the properties struct;
+  // both views must agree for the device to be accepted.
+  hipDeviceProp_t device_properties;
+  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
+  if (device_properties.cooperativeLaunch == 0) {
+    std::cerr << "Cooperative group support not available in ";
+    std::cerr << "device properties." << std::endl;
+    return 0;
+  }
+  if (device_properties.cooperativeMultiDeviceLaunch == 0) {
+    std::cerr << "Multi-GPU cooperative group support not available in ";
+    std::cerr << "device properties." << std::endl;
+    return 0;
+  }
+  return 1;
+}
+
+// Time-wasting kernel for multi-device cooperative launches.
+//   loops    - number of busy-wait iterations each thread performs
+//   array    - per-thread accumulator, indexed by flattened x-dimension index
+//   fast_gpu - multi-grid rank of the grid that should return immediately
+__global__ void test_coop_kernel(unsigned int loops, long long *array,
+                                 int fast_gpu) {
+  cooperative_groups::multi_grid_group mgrid =
+      cooperative_groups::this_multi_grid();
+  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Threads of the grid designated as "fast" exit without doing any work.
+  if (mgrid.grid_rank() == fast_gpu) {
+    return;
+  }
+
+  // The counter is unsigned to match `loops`, avoiding a signed/unsigned
+  // comparison.
+  for (unsigned int i = 0; i < loops; i++) {
+    long long start_clock = clock64();
+    // Spin until clock64() has advanced by 1000000 from this iteration's
+    // start value.
+    while (clock64() < (start_clock+1000000)) {}
+    array[rank] += clock64();
+  }
+}
+
+// Time-wasting kernel for regular (non-cooperative) launches.
+//   loops - number of busy-wait iterations each thread performs
+//   array - per-thread accumulator, indexed by flattened x-dimension index
+__global__ void test_kernel(uint32_t loops, unsigned long long *array) {
+  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // The counter is unsigned to match `loops`, avoiding a signed/unsigned
+  // comparison.
+  for (uint32_t i = 0; i < loops; i++) {
+    long long start_clock = clock64();
+    // Spin until clock64() has advanced by 1000000 from this iteration's
+    // start value.
+    while (clock64() < (start_clock+1000000)) {}
+    array[rank] += clock64();
+  }
+}
+
+// Entry point of the multi-GPU cooperative-launch overlap test.
+//
+// For each adjacent device pair (dev, dev + 1) the test times six launch
+// patterns: one multi-device cooperative launch alone (Test 0, the baseline)
+// and five "standard kernel -> cooperative launch -> standard kernel"
+// sequences that differ in which grid exits early (fast_gpu) and in the
+// NoPreSync / NoPostSync launch flags. Every measured time is then compared
+// against a multiple of the baseline; any value outside its window sets
+// FailFlag. The program ends through passed() or failed("").
+int main(int argc, char** argv) {
+  hipError_t err;
+  int device_num, FailFlag = 0;
+  uint32_t loops = 2000;
+  // Passed to test_coop_kernel by address, so the value current at launch
+  // time is the one used. -1 presumably matches no grid rank, i.e. no grid
+  // exits early.
+  int32_t fast_gpu = -1;
+  HIPCHECK(hipGetDeviceCount(&device_num));
+  if (device_num < 2) {
+    std::cout << "This test requires atleast two gpus but the system has ";
+    std::cout << " only "<< device_num <<std::endl;
+    std::cout << "The test is skipping with Pass result" << std::endl;
+    // NOTE(review): this relies on passed() terminating the process --
+    // confirm in test_common.h.
+    passed();
+  }
+  for (int dev = 0; dev < (device_num-1); ++dev) {
+    std::cout << "First device number: " << dev << std::endl;
+    std::cout << "Second device number: " << (dev + 1) << std::endl;
+    std::cout << "Loops: " << loops << std::endl;
+
+    /*************************************************************************/
+    /* Test whether target devices support cooperative groups ****************/
+    for (int i = 0; i < 2; i++) {
+      if (!cooperative_groups_support(dev + i)) {
+        std::cout << "Skipping the test with Pass result.\n";
+        passed();
+      }
+    }
+
+    /*************************************************************************/
+    /* We will launch enough waves to fill up all of the GPU *****************/
+    // Use the smaller warp size and SM count of the two devices so the same
+    // launch geometry fits on both.
+    int warp_sizes[2];
+    int num_sms[2];
+    hipDeviceProp_t device_properties[2];
+    int warp_size = INT_MAX;
+    int num_sm = INT_MAX;
+    for (int i = 0; i < 2; i++) {
+      HIPCHECK(hipGetDeviceProperties(&device_properties[i], (dev + i)));
+      warp_sizes[i] = device_properties[i].warpSize;
+      if (warp_sizes[i] < warp_size) {
+        warp_size = warp_sizes[i];
+      }
+      num_sms[i] = device_properties[i].multiProcessorCount;
+      if (num_sms[i] < num_sm) {
+        num_sm = num_sms[i];
+      }
+      std::cout << "Device " << (i + 1);
+      std::cout << " name: " << device_properties[i].name << std::endl;
+    }
+    std::cout << std::endl;
+
+    // Calculate the device occupancy to know how many blocks can be run.
+    int max_blocks_per_sm_arr[2];
+    int max_blocks_per_sm = INT_MAX;
+    for (int i = 0; i < 2; i++) {
+      HIPCHECK(hipSetDevice(dev + i));
+      HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
+               &max_blocks_per_sm_arr[i], test_kernel, warp_size, 0));
+      if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
+        max_blocks_per_sm = max_blocks_per_sm_arr[i];
+      }
+    }
+    int desired_blocks = 1;
+
+    if (desired_blocks > max_blocks_per_sm * num_sm) {
+      std::cerr << "The requested number of blocks will not fit on the GPU";
+      std::cerr << std::endl;
+      std::cerr << "You requested " << desired_blocks << " but we can only ";
+      std::cerr << "fit " << (max_blocks_per_sm * num_sm) << std::endl;
+      failed("");
+    }
+
+    /*************************************************************************/
+    /* Create the streams we will use in this test. **************************/
+    hipStream_t streams[2];
+    for (int i = 0; i < 2; i++) {
+      HIPCHECK(hipSetDevice(dev + i));
+      HIPCHECK(hipStreamCreate(&streams[i]));
+    }
+
+    /*************************************************************************/
+    /* Set up data to pass into the kernels **********************************/
+
+    // Allocate one zero-initialized device buffer per GPU; each thread of the
+    // kernels accumulates into its own slot.
+    unsigned long long *dev_array[2];
+    for (int i = 0; i < 2; i++) {
+      int good_size = desired_blocks * warp_size * sizeof(long long);
+      HIPCHECK(hipSetDevice(dev + i));
+      HIPCHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]), good_size));
+      HIPCHECK(hipMemsetAsync(dev_array[i], 0, good_size, streams[i]));
+    }
+    for (int i = 0; i < 2; i++) {
+      HIPCHECK(hipSetDevice(dev + i));
+      HIPCHECK(hipDeviceSynchronize());
+    }
+
+    /*************************************************************************/
+    /* Launch the kernels ****************************************************/
+    void *dev_params[2][3];
+    hipLaunchParams md_params[2];
+    std::chrono::time_point<std::chrono::system_clock> start_time[6];
+    std::chrono::time_point<std::chrono::system_clock> end_time[6];
+
+    std::cout << "Test 0: Launching a multi-GPU cooperative kernel...\n";
+    std::cout << "This should result in the following pattern:" << std::endl;
+    std::cout << "GPU " << dev << ": Long Coop Kernel" << std::endl;
+    std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel" << std::endl;
+
+    // The launch descriptors are built once and reused by every test below.
+    for (int i = 0; i < 2; i++) {
+      dev_params[i][0] = reinterpret_cast<void*>(&loops);
+      dev_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
+      dev_params[i][2] = reinterpret_cast<void*>(&fast_gpu);
+      md_params[i].func = reinterpret_cast<void*>(test_coop_kernel);
+      md_params[i].gridDim = desired_blocks;
+      md_params[i].blockDim = warp_size;
+      md_params[i].sharedMem = 0;
+      md_params[i].stream = streams[i];
+      md_params[i].args = dev_params[i];
+    }
+
+    start_time[0] = std::chrono::system_clock::now();
+    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
+    for (int i = 0; i < 2; i++) {
+      HIPCHECK(hipSetDevice(dev + i));
+      HIPCHECK(hipDeviceSynchronize());
+    }
+    end_time[0] = std::chrono::system_clock::now();
+
+    std::cout << std::endl;
+    std::cout << "Test 1: Launching a multi-GPU cooperative kernel with the ";
+    std::cout << "following pattern:" << std::endl;
+    std::cout << "GPU " << dev << ": Standard  Kernel --> Long Coop Kernel\n";
+    std::cout << "GPU " << (dev + 1) << ":                  --> Coop        ";
+    std::cout << "--> Standard  Kernel\n";
+    fast_gpu = 1;
+    start_time[1] = std::chrono::system_clock::now();
+    HIPCHECK(hipSetDevice(dev));
+    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+                       streams[0], loops, dev_array[0]);
+    HIPCHECK(hipGetLastError());
+    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
+    HIPCHECK(hipSetDevice(dev + 1));
+    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+                       streams[1], loops, dev_array[1]);
+    HIPCHECK(hipGetLastError());
+    for (int i = 0; i < 2; i++) {
+      HIPCHECK(hipSetDevice(dev + i));
+      HIPCHECK(hipDeviceSynchronize());
+    }
+    end_time[1] = std::chrono::system_clock::now();
+    fast_gpu = -1;
+
+    std::cout << std::endl;
+    std::cout << "Test 2: Launching a multi-GPU cooperative kernel with the ";
+    std::cout << "following pattern:" << std::endl;
+    std::cout << "GPU " << dev << ": Standard  Kernel --> Coop" << std::endl;
+    std::cout << "GPU " << (dev + 1) << ":                  --> Long Coop";
+    std::cout << " Kernel --> ";
+    std::cout << "Standard  Kernel\n";
+    fast_gpu = 0;
+    start_time[2] = std::chrono::system_clock::now();
+    HIPCHECK(hipSetDevice(dev));
+    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+                       streams[0], loops, dev_array[0]);
+    HIPCHECK(hipGetLastError());
+    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
+    HIPCHECK(hipSetDevice(dev + 1));
+    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+                       streams[1], loops, dev_array[1]);
+    HIPCHECK(hipGetLastError());
+    for (int i = 0; i < 2; i++) {
+      HIPCHECK(hipSetDevice(dev + i));
+      HIPCHECK(hipDeviceSynchronize());
+    }
+    end_time[2] = std::chrono::system_clock::now();
+    fast_gpu = -1;
+
+    std::cout << std::endl;
+    std::cout << "Test 3: Launching a multi-GPU cooperative kernel with the ";
+    std::cout << "ability to overlap regular and cooperative kernels ";
+    std::cout << "only at the beginning." << std::endl;
+    std::cout << "This should result in the following pattern:" << std::endl;
+    std::cout << "GPU " << dev << ": Standard  Kernel --> Coop" << std::endl;
+    std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel -->      Standard";
+    std::cout<< "  Kernel\n";
+    fast_gpu = 0;
+    start_time[3] = std::chrono::system_clock::now();
+    HIPCHECK(hipSetDevice(dev));
+    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+                       streams[0], loops, dev_array[0]);
+    HIPCHECK(hipGetLastError());
+    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
+             hipCooperativeLaunchMultiDeviceNoPreSync));
+    HIPCHECK(hipSetDevice(dev + 1));
+    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+                       streams[1], loops, dev_array[1]);
+    HIPCHECK(hipGetLastError());
+    for (int i = 0; i < 2; i++) {
+      HIPCHECK(hipSetDevice(dev + i));
+      HIPCHECK(hipDeviceSynchronize());
+    }
+    end_time[3] = std::chrono::system_clock::now();
+    fast_gpu = -1;
+
+    std::cout << std::endl;
+    std::cout << "Test 4: Launching a multi-GPU cooperative kernel with the ";
+    std::cout << "ability to overlap regular and cooperative kernels ";
+    std::cout << "only at the end." << std::endl;
+    std::cout << "This should result in the following pattern:" << std::endl;
+    std::cout << "GPU " << dev << ": Standard  Kernel --> Long Coop Kernel\n";
+    std::cout << "GPU " << (dev + 1) << ":                  --> Coop --> ";
+    std::cout << "Standard  Kernel\n";
+    fast_gpu = 1;
+    start_time[4] = std::chrono::system_clock::now();
+    HIPCHECK(hipSetDevice(dev));
+    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+                       streams[0], loops, dev_array[0]);
+    HIPCHECK(hipGetLastError());
+    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
+             hipCooperativeLaunchMultiDeviceNoPostSync));
+    HIPCHECK(hipSetDevice(dev + 1));
+    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+                       streams[1], loops, dev_array[1]);
+    // Check this launch as well, as every other test does; otherwise a
+    // failed launch would only show up as a confusing timing mismatch.
+    HIPCHECK(hipGetLastError());
+    for (int i = 0; i < 2; i++) {
+      HIPCHECK(hipSetDevice(dev + i));
+      HIPCHECK(hipDeviceSynchronize());
+    }
+    end_time[4] = std::chrono::system_clock::now();
+    fast_gpu = -1;
+
+    std::cout << std::endl;
+    std::cout << "Test 5: Launching a multi-GPU cooperative kernel with the ";
+    std::cout << "ability to overlap regular and cooperative kernels";
+    std::cout << std::endl;
+    std::cout << "This should result in the following pattern:" << std::endl;
+    std::cout << "GPU " << dev << ": Standard  Kernel --> Long Coop Kernel\n";
+    std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel --> Standard";
+    std::cout << "  Kernel\n";
+    start_time[5] = std::chrono::system_clock::now();
+    HIPCHECK(hipSetDevice(dev));
+    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+                       streams[0], loops, dev_array[0]);
+    HIPCHECK(hipGetLastError());
+    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
+             hipCooperativeLaunchMultiDeviceNoPreSync |
+             hipCooperativeLaunchMultiDeviceNoPostSync));
+    HIPCHECK(hipSetDevice(dev + 1));
+    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+                       streams[1], loops, dev_array[1]);
+    HIPCHECK(hipGetLastError());
+    for (int i = 0; i < 2; i++) {
+      HIPCHECK(hipSetDevice(dev + i));
+      HIPCHECK(hipDeviceSynchronize());
+    }
+    end_time[5] = std::chrono::system_clock::now();
+
+    // Convert every measured interval to seconds (double).
+    std::chrono::duration<double> single_kernel_time =
+                                  (end_time[0] - start_time[0]);
+    std::chrono::duration<double> serialized_gpu0_time =
+                                  (end_time[1] - start_time[1]);
+    std::chrono::duration<double> serialized_gpu1_time =
+                                  (end_time[2] - start_time[2]);
+    std::chrono::duration<double> pre_overlapped_time =
+                                  (end_time[3] - start_time[3]);
+    std::chrono::duration<double> post_overlapped_time =
+                                  (end_time[4] - start_time[4]);
+    std::chrono::duration<double> overlapped_time =
+                                  (end_time[5] - start_time[5]);
+
+    std::cout << "Test 0: A single kernel on both GPUs took:" << std::endl;
+    std::cout << "    " << single_kernel_time.count();
+    std::cout << " seconds" << std::endl;
+    std::cout << std::endl;
+    std::cout << "Test 1: Serialized set of three kernels with GPU0";
+    std::cout << " being long took:" << std::endl;
+    std::cout << "    " << serialized_gpu0_time.count();
+    std::cout << " seconds" << std::endl;
+    std::cerr << "Expect between " << (2.7 * single_kernel_time.count());
+    std::cerr << " and ";
+    std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n";
+    std::cout << std::endl;
+    std::cout << "Test 2: Serialized set of three kernels with GPU1";
+    std::cout << " being long took:" << std::endl;
+    std::cout << "    " << serialized_gpu1_time.count();
+    std::cout << " seconds" << std::endl;
+    std::cerr << "Expect between " << (2.7 * single_kernel_time.count());
+    std::cerr << " and ";
+    std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n";
+    std::cout << std::endl;
+    std::cout << "Test 3: Multiple kernels with pre-overlap allowed took:\n";
+    std::cout << "    " << pre_overlapped_time.count();
+    std::cout << " seconds" << std::endl;
+    std::cerr << "Expect between " << (1.7 * single_kernel_time.count());
+    std::cerr << " and ";
+    std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n";
+    std::cout << std::endl;
+    std::cout << "Test 4: Multiple kernels with post-overlap allowed took:\n";
+    std::cout << "    " << post_overlapped_time.count();
+    std::cout << " seconds" << std::endl;
+    std::cerr << "Expect between " << (1.7 * single_kernel_time.count());
+    std::cerr << " and ";
+    std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n";
+    std::cout << std::endl;
+    std::cout << "Test 5: Multiple kernels with overlap allowed took:\n";
+    std::cout << "    " << overlapped_time.count();
+    std::cout << " seconds" << std::endl;
+    std::cerr << "Expect between " << (1.8 * single_kernel_time.count());
+    std::cerr << " and ";
+    std::cerr << (2.2 * single_kernel_time.count()) << " seconds.\n";
+
+    // Test that fully not-overlapped kernels take roughly 3x as long as one
+    // cooperative kernel.
+    if (serialized_gpu0_time > 3.3 * single_kernel_time ||
+        serialized_gpu0_time < 2.7 * single_kernel_time) {
+      std::cerr << "ERROR!" << std::endl;
+      std::cerr << "Test 1, the first case where all kernels should be ";
+      std::cerr << "serialized, had a runtime that was very different ";
+      std::cerr << "than what was expected." << std::endl;
+      std::cerr << "Was " << serialized_gpu0_time.count() << " seconds.\n";
+      std::cerr << "Expected between ";
+      std::cerr << (2.7 * single_kernel_time.count()) << " and ";
+      std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n";
+      std::cerr << "Were they truly serialized?" << std::endl;
+      FailFlag = 1;
+    }
+
+    // Test that fully not-overlapped kernels take roughly 3x as long as one
+    // cooperative kernel.
+    if (serialized_gpu1_time > 3.3 * single_kernel_time ||
+        serialized_gpu1_time < 2.7 * single_kernel_time) {
+      std::cerr << "ERROR!" << std::endl;
+      std::cerr << "Test 2, the second case where all kernels should be ";
+      std::cerr << "serialized, had a runtime that was very different ";
+      std::cerr << "than what was expected." << std::endl;
+      std::cerr << "Was " << serialized_gpu1_time.count();
+      std::cerr << " seconds." << std::endl;
+      std::cerr << "Expected between ";
+      std::cerr << (2.7 * single_kernel_time.count()) << " and ";
+      std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n";
+      std::cerr << "Were they truly serialized?" << std::endl;
+      FailFlag = 1;
+    }
+
+    // Test that a sequence which may overlap only before the cooperative
+    // launch takes roughly 2x as long as one cooperative kernel.
+    if (pre_overlapped_time > 2.3 * single_kernel_time ||
+        pre_overlapped_time < 1.7 * single_kernel_time) {
+      std::cerr << "ERROR!" << std::endl;
+      std::cerr << "Test 3, the case where the last kernel is serialized, had ";
+      std::cerr << "a runtime that was very different than what was ";
+      std::cerr << "expected." << std::endl;
+      std::cerr << "Was " << pre_overlapped_time.count() << " seconds.\n";
+      std::cerr << "Expected between ";
+      std::cerr << (1.7 * single_kernel_time.count()) << " and ";
+      std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n";
+      FailFlag = 1;
+    }
+
+    // Test that a sequence which may overlap only after the cooperative
+    // launch takes roughly 2x as long as one cooperative kernel.
+    if (post_overlapped_time > 2.3 * single_kernel_time ||
+        post_overlapped_time < 1.7 * single_kernel_time) {
+      std::cerr << "ERROR!" << std::endl;
+      std::cerr << "Test 4, the case where the first kernel is ";
+      std::cerr << "serialized, had a runtime that was very different ";
+      std::cerr << "than what was expected." << std::endl;
+      std::cerr << "Was " << post_overlapped_time.count() << " seconds.\n";
+      std::cerr << "Expected between ";
+      std::cerr << (1.7 * single_kernel_time.count()) << " and ";
+      std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n";
+      FailFlag = 1;
+    }
+
+    // Test that, with the right flags on the kernel launch, that we prevent
+    // incomplete launches from serializing the cooperative launch streams.
+    if (overlapped_time > 2.2 * single_kernel_time ||
+        overlapped_time < 1.8 * single_kernel_time) {
+      std::cerr << "ERROR!" << std::endl;
+      std::cerr << "Test 5, the case where normal and cooperative kernel ";
+      std::cerr << "launches should overlap, does not appear to have done so.";
+      std::cerr << std::endl;
+      std::cerr << "Was " << overlapped_time.count() << " seconds.\n";
+      std::cerr << "Expected between ";
+      std::cerr << (1.8 * single_kernel_time.count()) << " and ";
+      std::cerr << (2.2 * single_kernel_time.count()) << " seconds.\n";
+      std::cerr << "Is the normal kernel being serialized with the ";
+      std::cerr << "cooperative kernels on different streams?" << std::endl;
+      FailFlag = 1;
+    }
+    // Release this pair's buffers and streams before moving to the next pair.
+    for (int k = 0; k < 2; ++k) {
+      HIPCHECK(hipFree(dev_array[k]));
+      HIPCHECK(hipStreamDestroy(streams[k]));
+    }
+    // Stop at the first failing device pair.
+    if (FailFlag == 1) {
+      break;
+    }
+  }
+  if (FailFlag == 1) {
+    failed("");
+  } else {
+    passed();
+  }
+}
@@ -0,0 +1,374 @@
+/*
+Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// Test Description:
+/*The general idea of the application is to launch N warps to all GPUs detected
+in the HIP system. N is a command-line parameter, but the user should set N
+small enough that all warps can be on each of the GPUs at the same time.
+
+All of the warps do a "work loop". Within the work loop, every warp
+atomically increments a global variable that is shared between both of the
+target GPUs. The value returned from this atomic increment entirely depends
+on the order the warps from the GPUs arrive at the atomic instruction. Each
+warp then stores the result into a global array based on its warp ID.
+
+We also add a sleep/wait loop into the code so that the last warp runs much
+slower than everyone else. As such, it should store much larger values than
+all the other warps.
+
+If there is no barrier within the loop, then the last warp will likely get to
+the global variable the first time after all the other warps have each
+incremented it many times. If the barrier properly works, then each warp
+will increment the variable once per time through the loop, and all threads
+will sleep on the barrier waiting for the last warp to finally catch up.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -rdc=true -gencode arch=compute_60,code=sm_60
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_cooperative_groups.h>
+#include "test_common.h"
+
+// Reports whether device `device_id` advertises both single-device and
+// multi-device cooperative launch support. The capability is queried twice:
+// through hipDeviceGetAttribute and through the hipDeviceProp_t fields.
+// Returns 1 when every query reports support; otherwise prints the reason
+// to std::cerr and returns 0.
+static int cooperative_groups_support(int device_id) {
+  hipError_t err;
+
+  // Attribute query: cooperative launch on a single device.
+  int coop_attr;
+  HIPCHECK(hipDeviceGetAttribute(&coop_attr,
+           hipDeviceAttributeCooperativeLaunch, device_id));
+  if (coop_attr == 0) {
+    std::cerr << "Cooperative launch support not available in "
+              << "the device attribute for device " << device_id
+              << std::endl;
+    return 0;
+  }
+
+  // Attribute query: cooperative launch spanning several devices.
+  int multi_coop_attr;
+  HIPCHECK(hipDeviceGetAttribute(&multi_coop_attr,
+           hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id));
+  if (multi_coop_attr == 0) {
+    std::cerr << "Multi-GPU cooperative launch support not available in "
+              << "the device attribute for device " << device_id
+              << std::endl;
+    return 0;
+  }
+
+  // The device-properties structure must report the same two capabilities.
+  hipDeviceProp_t props;
+  HIPCHECK(hipGetDeviceProperties(&props, device_id));
+  if (!props.cooperativeLaunch) {
+    std::cerr << "Cooperative group support not available in "
+              << "device properties." << std::endl;
+    return 0;
+  }
+  if (!props.cooperativeMultiDeviceLaunch) {
+    std::cerr << "Multi-GPU cooperative group support not available in "
+              << "device properties." << std::endl;
+    return 0;
+  }
+  return 1;
+}
+
+// Checks the values recorded by one device's kernel run.
+//   loops       - number of loop iterations recorded in the buffer.
+//   warps       - number of entries recorded per iteration.
+//   host_buffer - loops * warps recorded values, iteration-major.
+//   num_devs    - number of devices; scales the per-iteration bound.
+// An entry recorded in iteration i must not exceed
+// (i + 1) * warps * num_devs. Returns 0 when every entry respects its
+// bound, or -1 (after printing the offending entry) on the first violation.
+static int verify_barrier_buffer(unsigned int loops, unsigned int warps,
+                                 unsigned int *host_buffer,
+                                 unsigned int num_devs) {
+  unsigned int upper_bound = 0;
+  for (unsigned int iter = 0; iter < loops; ++iter) {
+    upper_bound += (warps * num_devs);
+    for (unsigned int w = 0; w < warps; ++w) {
+      if (host_buffer[iter*warps+w] <= upper_bound) {
+        continue;
+      }
+      std::cerr << "Barrier failure!" << std::endl;
+      std::cerr << "    Buffer entry " << iter*warps+w
+                << " contains the value " << host_buffer[iter*warps+w]
+                << " but it should not be more than "
+                << upper_bound << std::endl;
+      return -1;
+    }
+  }
+  std::cout << "\tBarriers work properly!" << std::endl;
+  return 0;
+}
+
+// Checks one entry of the array shared by all GPUs.
+//   loops     - number of kernel loop iterations that were run.
+//   array_val - value read back from the shared array.
+// The expected value starts at 0 and is updated once per iteration: +2 on
+// even iterations, *2 on odd ones. Returns 0 when array_val matches, or -1
+// after printing a diagnostic.
+// NOTE(review): test_kernel in this file selects add vs. double with
+// `i % mgrid.num_grids()`, which equals this even/odd rule only for two
+// grids or for loops <= 2 -- confirm before raising `loops` on systems
+// with more than two GPUs.
+static int verify_multi_gpu_buffer(unsigned int loops, unsigned int array_val) {
+  unsigned int desired_val = 0;
+  // Unsigned counter: `loops` is unsigned, so a signed counter would be
+  // converted on every comparison and overflow for loops > INT_MAX.
+  for (unsigned int i = 0; i < loops; i++) {
+    if (i % 2 == 0) {
+      desired_val += 2;
+    } else {
+      desired_val *= 2;
+    }
+  }
+  std::cout << "Desired value is " << desired_val << std::endl;
+  if (array_val != desired_val) {
+    std::cerr << "ERROR! Multi-grid barrier does not appear to work.";
+    std::cerr << std::endl;
+    std::cerr << "Expected the multi-GPUs to work together to produce ";
+    std::cerr << "the value " << desired_val << std::endl;
+    std::cerr << "However, the entry returned from the multi-GPU ";
+    std::cerr << "kernel was " << array_val << std::endl;
+    return -1;
+  }
+  std::cout << "\tMulti-GPU barriers appear to work here." << std::endl;
+  return 0;
+}
+
+// Cooperative kernel exercising both the grid-wide and the multi-grid barrier.
+//   atomic_val   - counter incremented by thread 0 of every block once per
+//                  loop iteration.
+//   global_array - array indexed by grid rank; main() in this file allocates
+//                  it once and passes the same pointer to every device.
+//   array        - output buffer; each block stores the value returned by
+//                  its atomicInc at array[blockIdx.x + i * gridDim.x].
+//   loops        - number of iterations of the work loop.
+// Each iteration does: delayed atomic increment -> grid.sync() -> delayed
+// update of global_array -> mgrid.sync().
+__global__ void
+test_kernel(unsigned int *atomic_val, unsigned int *global_array,
+            unsigned int *array, uint32_t loops) {
+  cooperative_groups::grid_group grid = cooperative_groups::this_grid();
+  cooperative_groups::multi_grid_group mgrid =
+                      cooperative_groups::this_multi_grid();
+  unsigned rank = grid.thread_rank();
+  unsigned global_rank = mgrid.thread_rank();
+
+  // Output slot for this block; advanced by gridDim.x after every iteration.
+  int offset = blockIdx.x;
+  for (int i = 0; i < loops; i++) {
+    // Make the last thread run way behind everyone else.
+    // If the grid barrier below fails, then the other threads may hit the
+    // atomicInc instruction many times before the last thread ever gets
+    // to it.
+    // As such, without the barrier, the last array entry will eventually
+    // contain a very large value, defined by however many times the other
+    // wavefronts make it through this loop.
+    // If the barrier works, then it will likely contain some number
+    // near "total number of blocks". It will be the last wavefront to
+    // reach the atomicInc, but everyone will have only hit the atomic once.
+    if (rank == (grid.size() - 1)) {
+      long long start_clock = clock64();
+      while (clock64() < (start_clock+1000000)) {}
+    }
+    // One thread per block records the value returned by the atomic.
+    if (threadIdx.x == 0) {
+      array[offset] = atomicInc(atomic_val, UINT_MAX);
+    }
+    grid.sync();
+
+    // Make the last thread in the entire multi-grid run way behind
+    // everyone else.
+    // If the mgrid barrier below fails, then the global_array entries
+    // will end up being out of sync, because the intermingling of adds
+    // and multiplies will not be aligned between the GPUs.
+    if (global_rank == (mgrid.size() - 1)) {
+      long long start_clock = clock64();
+      while (clock64() < (start_clock+100000000)) {}
+    }
+    // When i is a multiple of the number of grids, add 2 to this grid's own
+    // array entry. On every other iteration, double the entry at
+    // inter_gpu_offset, which belongs to a different grid.
+    // Only the last thread of each grid performs the update.
+    unsigned grid_rank = mgrid.grid_rank();
+    unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids();
+    if (rank == (grid.size() - 1)) {
+      if (i % mgrid.num_grids() == 0) {
+        global_array[grid_rank] += 2;
+      } else {
+        global_array[inter_gpu_offset] *= 2;
+      }
+    }
+    mgrid.sync();
+    offset += gridDim.x;
+  }
+}
+
+// Entry point of the multi-GPU cooperative barrier test.
+//
+// Launches test_kernel cooperatively on every device in the system, reads
+// back each device's output buffer and the host-resident shared array, and
+// checks both with verify_barrier_buffer / verify_multi_gpu_buffer. The
+// program ends through passed() or failed("").
+int main(int argc, char** argv) {
+  hipError_t err;
+  int num_devices = 0;
+  uint32_t loops = 2;
+  uint32_t warps = 10;
+  uint32_t block_size = 1;
+
+  std::cout << "Loops: " << loops << std::endl;
+  std::cout << "Warps: " << warps << std::endl;
+  std::cout << "Block size: " << block_size << std::endl;
+
+  HIPCHECK(hipGetDeviceCount(&num_devices));
+  if (num_devices < 2) {
+    std::cout << "Not enough GPUs to run test." << std::endl;
+    std::cout << "We require at least 2 GPUs, but only found ";
+    std::cout << num_devices << std::endl;
+    std::cout << "Skipping the test with PASSED result\n";
+    // NOTE(review): this relies on passed() terminating the process --
+    // confirm in test_common.h.
+    passed();
+  }
+
+  uint32_t device_num[num_devices];
+
+  /*************************************************************************/
+  /* Test whether target device supports cooperative groups ****************/
+  for (int i = 0; i < num_devices; i++) {
+    device_num[i] = i;
+    if (!cooperative_groups_support(device_num[i])) {
+      std::cout << "Skipping the test with Pass result.\n";
+      passed();
+    }
+  }
+
+  /*************************************************************************/
+  /* Test whether the requested size will fit on the GPU *******************/
+  // Use the smallest warp size and SM count over all devices so that one
+  // launch geometry fits everywhere.
+  int warp_sizes[num_devices];
+  int num_sms[num_devices];
+  hipDeviceProp_t device_properties[num_devices];
+  int warp_size = INT_MAX;
+  int num_sm = INT_MAX;
+  for (int i = 0; i < num_devices; i++) {
+    HIPCHECK(hipGetDeviceProperties(&device_properties[i], device_num[i]));
+    warp_sizes[i] = device_properties[i].warpSize;
+    if (warp_sizes[i] < warp_size) {
+      warp_size = warp_sizes[i];
+    }
+    num_sms[i] = device_properties[i].multiProcessorCount;
+    if (num_sms[i] < num_sm) {
+      num_sm = num_sms[i];
+    }
+    std::cout << "Device " << (i + 1);
+    std::cout << " name: " << device_properties[i].name << std::endl;
+  }
+  std::cout << std::endl;
+
+  int num_threads_in_block = block_size * warp_size;
+
+  // Calculate the device occupancy to know how many blocks can be run.
+  int max_blocks_per_sm_arr[num_devices];
+  int max_blocks_per_sm = INT_MAX;
+  for (int i = 0; i < num_devices; i++) {
+    HIPCHECK(hipSetDevice(device_num[i]));
+    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
+            &max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block, 0));
+    if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
+      max_blocks_per_sm = max_blocks_per_sm_arr[i];
+    }
+  }
+
+  int requested_blocks = warps / block_size;
+  if (requested_blocks > max_blocks_per_sm * num_sm) {
+    std::cerr << "Requesting to run " << requested_blocks << " blocks, ";
+    std::cerr << "but we can only guarantee to simultaneously run ";
+    std::cerr << (max_blocks_per_sm * num_sm) << std::endl;
+    failed("");
+  }
+
+  /*************************************************************************/
+  /* Set up data to pass into the kernel ***********************************/
+  // Each block will output a single value per loop.
+  uint32_t total_buffer_len = requested_blocks*loops;
+
+  // Allocate, per device: a zeroed host buffer, a device output buffer
+  // initialized from it, a zeroed device atomic counter, and a stream.
+  unsigned int *host_buffer[num_devices];
+  unsigned int *kernel_buffer[num_devices];
+  unsigned int *kernel_atomic[num_devices];
+  hipStream_t streams[num_devices];
+  for (int i = 0; i < num_devices; i++) {
+    host_buffer[i] = (unsigned int*)calloc(total_buffer_len,
+                                           sizeof(unsigned int));
+    HIPCHECK(hipSetDevice(device_num[i]));
+    HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_buffer[i]),
+                       total_buffer_len * sizeof(unsigned int)));
+    HIPCHECK(hipMemcpy(kernel_buffer[i], host_buffer[i],
+                       total_buffer_len * sizeof(unsigned int),
+                       hipMemcpyHostToDevice));
+    HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_atomic[i]),
+                       sizeof(unsigned int)));
+    HIPCHECK(hipMemset(kernel_atomic[i], 0, sizeof(unsigned int)));
+    HIPCHECK(hipStreamCreate(&streams[i]));
+  }
+
+  // One array shared by all devices (one entry per device); it lives in
+  // host memory obtained from hipHostMalloc.
+  unsigned int* global_array;
+  HIPCHECK(hipHostMalloc(reinterpret_cast<void**>(&global_array),
+                         num_devices * sizeof(unsigned int), 0));
+  HIPCHECK(hipMemset(global_array, 0, num_devices * sizeof(unsigned int)));
+
+  /*************************************************************************/
+  /* Launch the kernels ****************************************************/
+  std::cout << "Launching a kernel with " << warps << " warps ";
+  std::cout << "in " << requested_blocks << " thread blocks.";
+  std::cout << std::endl;
+
+  void *dev_params[num_devices][4];
+  hipLaunchParams md_params[num_devices];
+  for (int i = 0; i < num_devices; i++) {
+    dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
+    dev_params[i][1] = reinterpret_cast<void*>(&global_array);
+    dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
+    dev_params[i][3] = reinterpret_cast<void*>(&loops);
+    md_params[i].func = reinterpret_cast<void*>(test_kernel);
+    md_params[i].gridDim = requested_blocks;
+    md_params[i].blockDim = num_threads_in_block;
+    md_params[i].sharedMem = 0;
+    md_params[i].stream = streams[i];
+    md_params[i].args = dev_params[i];
+  }
+
+  HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, num_devices, 0));
+  // hipDeviceSynchronize() waits only for the current device, so wait on
+  // every device that took part in the launch before the host reads results.
+  // The loop ends on the last device, which was already the current one.
+  for (int i = 0; i < num_devices; i++) {
+    HIPCHECK(hipSetDevice(device_num[i]));
+    HIPCHECK(hipDeviceSynchronize());
+  }
+
+  /*************************************************************************/
+  /* Read back the buffers and print out its data **************************/
+  for (int dev = 0; dev < num_devices; dev++) {
+    HIPCHECK(hipMemcpy(host_buffer[dev], kernel_buffer[dev],
+                       total_buffer_len * sizeof(unsigned int),
+                       hipMemcpyDeviceToHost));
+  }
+
+  for (unsigned int i = 0; i < loops; i++) {
+    for (int dev = 0; dev < num_devices; dev++) {
+      std::cout << "+++++++++++++++++ Device " << dev;
+      std::cout << "+++++++++++++++++" << std::endl;
+      for (int j = 0; j < requested_blocks; j++) {
+        // Print the same index that is read, so label and value always
+        // agree even when block_size != 1.
+        const unsigned int entry = i*requested_blocks+j;
+        std::cout << "Buffer entry " << entry;
+        std::cout << " (written by warp " << j << ")";
+        std::cout << " is " << host_buffer[dev][entry];
+        std::cout << std::endl;
+      }
+    }
+    std::cout << "==========================\n";
+  }
+  for (int dev = 0; dev < num_devices; dev++) {
+    std::cout << "Testing output from device " << dev << std::endl;
+    int local_ret_val = verify_barrier_buffer(loops, requested_blocks,
+                                              host_buffer[dev], num_devices);
+    if (local_ret_val) {
+      failed("");
+    }
+  }
+
+  std::cout << std::endl << "The multi-GPU shared updates contain:\n";
+  for (int i = 0; i < num_devices; i++) {
+    std::cout << "Entry " << i << ": ";
+    std::cout << global_array[i] << std::endl;
+  }
+  int flag = 0;
+  for (int dev = 0; dev < num_devices; dev++) {
+    std::cout << "Testing multi-GPU output for entry " << dev << std::endl;
+    int local_ret_val = verify_multi_gpu_buffer(loops, global_array[dev]);
+    if (local_ret_val) {
+      flag = 1;
+    }
+  }
+  // Release everything allocated above, including the shared host array.
+  for (int k = 0; k < num_devices; ++k) {
+    HIPCHECK(hipFree(kernel_buffer[k]));
+    HIPCHECK(hipFree(kernel_atomic[k]));
+    HIPCHECK(hipStreamDestroy(streams[k]));
+    free(host_buffer[k]);
+  }
+  HIPCHECK(hipHostFree(global_array));
+  if (flag == 1) {
+    failed("");
+  } else {
+    passed();
+  }
+}
@@ -0,0 +1,233 @@
+/*
+Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// Test Description:
+/*The general idea of the application is to launch N warps. N is hard-coded in
+main() (the `warps` variable) and must be small enough that all warps can be
+resident on the GPU at the same time.
+
+All of the warps do a "work loop". Within the work loop, every warp
+atomically increments a global variable. The value returned from this atomic
+increment entirely depends on the order the threads arrive at the atomic
+instruction. Each warp then stores the result into a global array based on its
+warp ID.
+
+We also add a sleep/wait loop into the code so that the last warp runs much
+slower than everyone else. As such, it should store much larger values than
+all the other warps.
+
+If there is no barrier within the loop, then the last warp will likely get to
+the global variable the first time after all the other warps have each
+incremented it many times. If the barrier properly works, then each warp
+will increment the variable once per time through the loop, and all threads
+will sleep on the barrier waiting for the last warp to finally catch up.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_cooperative_groups.h>
+#include "test_common.h"
+
+// Reports whether the given device can run cooperative-launch kernels.
+//
+// Two independent sources are consulted and both must agree:
+//   1. the hipDeviceAttributeCooperativeLaunch device attribute, and
+//   2. the cooperativeLaunch field of the device's hipDeviceProp_t.
+//
+// device_id: ordinal of the device to query.
+// Returns 1 when cooperative launch is reported by both sources, otherwise 0
+// after printing the reason to stderr.
+static int cooperative_groups_support(int device_id) {
+  // Zero-initialised so a query that leaves the value untouched reads as
+  // "not supported" instead of as garbage.
+  int cooperative_attribute = 0;
+  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
+           hipDeviceAttributeCooperativeLaunch, device_id));
+  if (!cooperative_attribute) {
+    std::cerr << "Cooperative launch support not available in ";
+    std::cerr << "the device attribute for device " << device_id;
+    std::cerr << std::endl;
+    return 0;
+  }
+
+  hipDeviceProp_t device_properties;
+  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
+  if (device_properties.cooperativeLaunch == 0) {
+    std::cerr << "Cooperative group support not available in ";
+    std::cerr << "device properties." << std::endl;
+    return 0;
+  }
+  return 1;
+}
+
+// Validates the values the kernel recorded for every (iteration, block) pair.
+//
+// host_buffer holds `loops` consecutive groups of `warps` entries. An entry in
+// group i may be at most (i + 1) * warps; anything larger is reported on
+// stderr as a barrier failure.
+//
+// Returns 0 when every entry is within its bound, -1 on the first violation.
+static int verify_barrier_buffer(unsigned int loops, unsigned int warps,
+                                 unsigned int *host_buffer) {
+  unsigned int ceiling = 0;
+  for (unsigned int round = 0; round < loops; ++round) {
+    // Each completed round may raise the shared counter by at most `warps`.
+    ceiling += warps;
+    for (unsigned int lane = 0; lane < warps; ++lane) {
+      const unsigned int slot = round * warps + lane;
+      const unsigned int observed = host_buffer[slot];
+      if (observed <= ceiling) {
+        continue;
+      }
+      std::cerr << "Barrier failure!" << std::endl;
+      std::cerr << "    Buffer entry " << slot
+                << " contains the value " << observed
+                << " but it should not be more than "
+                << ceiling << std::endl;
+      return -1;
+    }
+  }
+  std::cout << "Barriers work properly!" << std::endl;
+  return 0;
+}
+
+// Kernel that exercises grid.sync() as a grid-wide barrier.
+//
+// atomic_val: counter that thread 0 of every block bumps once per iteration
+//             with atomicInc.
+// array:      output buffer; for iteration i, the block with index b writes
+//             the value returned by atomicInc to array[i * gridDim.x + b].
+// loops:      number of increment-then-barrier iterations to run.
+__global__ void
+test_kernel(unsigned int *atomic_val, unsigned int *array,
+            unsigned int loops) {
+  cooperative_groups::grid_group grid = cooperative_groups::this_grid();
+  unsigned rank = grid.thread_rank();
+
+  // First output slot of this block; advanced by gridDim.x every iteration.
+  int offset = blockIdx.x;
+  for (int i = 0; i < loops; i++) {
+    // Make the last thread run way behind everyone else.
+    // If the barrier below fails, then the other threads may hit the
+    // atomicInc instruction many times before the last thread ever gets
+    // to it.
+    // As such, without the barrier, the last array entry will eventually
+    // contain a very large value, defined by however many times the other
+    // wavefronts make it through this loop.
+    // If the barrier works, then it will likely contain some number
+    // near "total number of blocks". It will be the last wavefront to
+    // reach the atomicInc, but everyone will have only hit the atomic once.
+    if (rank == (grid.size() - 1)) {
+      // Busy-wait until clock64() has advanced by 1000000 ticks.
+      long long start_clock = clock64();
+      while (clock64() < (start_clock+1000000)) {}
+    }
+
+    // Only one thread per block touches the counter and the output buffer.
+    if (threadIdx.x == 0) {
+      array[offset] = atomicInc(&atomic_val[0], UINT_MAX);
+    }
+    // Grid-wide barrier: nobody starts the next iteration early.
+    grid.sync();
+    offset += gridDim.x;
+  }
+}
+
+// Runs the single-device grid-barrier test on every visible device.
+//
+// For each device that reports cooperative-launch support the kernel is
+// launched with `warps / block_size` blocks for `loops` iterations, the values
+// it recorded are copied back, printed and validated with
+// verify_barrier_buffer().
+//
+// The verdict is reported exactly once, after the device loop. passed() and
+// failed() are used throughout these tests as terminating calls, so invoking
+// passed() inside the loop would end the run after the first device.
+// NOTE(review): passed()/failed() come from test_common.h, which is not
+// visible here - confirm that they terminate the process.
+int main(int argc, char** argv) {
+  int device_num = 0;
+  bool any_failure = false;
+  uint32_t loops = 2;
+  uint32_t warps = 10;
+  uint32_t block_size = 1;
+  HIPCHECK(hipGetDeviceCount(&device_num));
+  for (int dev = 0; dev < device_num; ++dev) {
+    std::cout << "Device number: " << dev << std::endl;
+    std::cout << "Loops: " << loops << std::endl;
+    std::cout << "Warps: " << warps << std::endl;
+    std::cout << "Block size: " << block_size << std::endl;
+
+    /*************************************************************************/
+    /* Test whether target device supports cooperative groups ****************/
+    HIPCHECK(hipSetDevice(dev));
+    if (!cooperative_groups_support(dev)) {
+      // Skip just this device so the remaining devices are still exercised.
+      std::cout << "Skipping the test with Pass result.\n";
+      continue;
+    }
+
+    /*************************************************************************/
+    /* Test whether the requested size will fit on the GPU *******************/
+    hipDeviceProp_t device_properties;
+    HIPCHECK(hipGetDeviceProperties(&device_properties, dev));
+    const int warp_size = device_properties.warpSize;
+    const int num_sms = device_properties.multiProcessorCount;
+
+    std::cout << "Device name: " << device_properties.name << std::endl;
+    std::cout << std::endl;
+
+    const int num_threads_in_block = block_size * warp_size;
+
+    // Calculate the device occupancy to know how many blocks can be run.
+    int max_blocks_per_sm = 0;
+    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
+             test_kernel, num_threads_in_block, 0));
+
+    const int requested_blocks = warps / block_size;
+    const int max_resident_blocks = max_blocks_per_sm * num_sms;
+    if (requested_blocks > max_resident_blocks) {
+      std::cerr << "Requesting to run " << requested_blocks << " blocks, ";
+      std::cerr << "but we can only guarantee to simultaneously run ";
+      std::cerr << max_resident_blocks << std::endl;
+      failed("");
+    }
+
+    /*************************************************************************/
+    /* Set up data to pass into the kernel ***********************************/
+    // Each block will output a single value per loop.
+    const uint32_t total_buffer_len = requested_blocks * loops;
+    const size_t buffer_bytes = total_buffer_len * sizeof(unsigned int);
+
+    // Zero-initialised host copy of the kernel's output buffer.
+    unsigned int *host_buffer = static_cast<unsigned int*>(
+        calloc(total_buffer_len, sizeof(unsigned int)));
+    if (host_buffer == nullptr) {
+      failed("Unable to allocate the host buffer\n");
+    }
+
+    unsigned int *kernel_buffer;
+    HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_buffer),
+                       buffer_bytes));
+    HIPCHECK(hipMemcpy(kernel_buffer, host_buffer, buffer_bytes,
+                       hipMemcpyHostToDevice));
+
+    // Counter that the kernel increments atomically; starts at zero.
+    unsigned int *kernel_atomic;
+    HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_atomic),
+                       sizeof(unsigned int)));
+    HIPCHECK(hipMemset(kernel_atomic, 0, sizeof(unsigned int)));
+
+    /*************************************************************************/
+    /* Launch the kernel *****************************************************/
+    std::cout << "Launching a kernel with " << warps << " warps ";
+    std::cout << "in " << requested_blocks << " thread blocks.";
+    std::cout << std::endl;
+
+    // Kernel arguments are passed as an array of pointers to each argument.
+    void *params[3];
+    params[0] = reinterpret_cast<void*>(&kernel_atomic);
+    params[1] = reinterpret_cast<void*>(&kernel_buffer);
+    params[2] = reinterpret_cast<void*>(&loops);
+    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
+                                        requested_blocks,
+                                        num_threads_in_block, params, 0, NULL));
+    // Wait for the kernel so execution errors surface here, not in the copy.
+    HIPCHECK(hipDeviceSynchronize());
+
+    /*************************************************************************/
+    /* Read back the buffer and print out its data****************************/
+    HIPCHECK(hipMemcpy(host_buffer, kernel_buffer, buffer_bytes,
+                       hipMemcpyDeviceToHost));
+
+    for (unsigned int i = 0; i < loops; i++) {
+      for (int j = 0; j < requested_blocks; j++) {
+        // Print the same index that is read, so label and value always match.
+        const unsigned int entry = i * requested_blocks + j;
+        std::cout << "Buffer entry " << entry;
+        std::cout << " (written by warp " << j << ")";
+        std::cout << " is " << host_buffer[entry];
+        std::cout << std::endl;
+      }
+      std::cout << "==========================\n";
+    }
+    const int ret_val = verify_barrier_buffer(loops, requested_blocks,
+                                              host_buffer);
+    HIPCHECK(hipFree(kernel_buffer));
+    HIPCHECK(hipFree(kernel_atomic));
+    free(host_buffer);
+    if (ret_val == -1) {
+      any_failure = true;
+    }
+  }
+
+  if (any_failure) {
+    failed("");
+  } else {
+    passed();
+  }
+}
@@ -0,0 +1,374 @@
+/*
+Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// Test Description:
+/*The general idea of the application is to launch N warps to each of two GPUs.
+N is hard-coded in main() (the `warps` variable) and must be small enough that
+all warps can be resident on each of the GPUs at the same time.
+
+All of the warps do a "work loop". Within the work loop, every warp
+atomically increments a global variable that is shared between both of the
+target GPUs. The value returned from this atomic increment entirely depends
+on the order the warps from the GPUs arrive at the atomic instruction. Each
+warp then stores the result into a global array based on its warp ID.
+
+We also add a sleep/wait loop into the code so that the last warp runs much
+slower than everyone else. As such, it should store much larger values than
+all the other warps.
+
+If there is no barrier within the loop, then warp 0 will likely get to the
+global variable the first time while all the other warps have each
+incremented it many times. If the barrier properly works, then each warp
+will increment the variable once per time through the loop, and all threads
+will sleep on the barrier waiting for the last warp to finally catch up.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -rdc=true -gencode arch=compute_60,code=sm_60
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_cooperative_groups.h>
+#include "test_common.h"
+
+// Reports whether the given device can take part in a multi-device
+// cooperative launch.
+//
+// Four indicators are consulted and all of them must be non-zero:
+//   - device attribute hipDeviceAttributeCooperativeLaunch
+//   - device attribute hipDeviceAttributeCooperativeMultiDeviceLaunch
+//   - hipDeviceProp_t::cooperativeLaunch
+//   - hipDeviceProp_t::cooperativeMultiDeviceLaunch
+//
+// device_id: ordinal of the device to query.
+// Returns 1 when every indicator reports support, otherwise 0 after printing
+// the first missing capability to stderr.
+static int cooperative_groups_support(int device_id) {
+  // Zero-initialised so a query that leaves the value untouched reads as
+  // "not supported" instead of as garbage.
+  int cooperative_attribute = 0;
+  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
+           hipDeviceAttributeCooperativeLaunch, device_id));
+  if (!cooperative_attribute) {
+    std::cerr << "Cooperative launch support not available in ";
+    std::cerr << "the device attribute for device " << device_id;
+    std::cerr << std::endl;
+    return 0;
+  }
+
+  int multi_gpu_cooperative_attribute = 0;
+  HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute,
+           hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id));
+  if (!multi_gpu_cooperative_attribute) {
+    std::cerr << "Multi-GPU cooperative launch support not available in ";
+    std::cerr << "the device attribute for device " << device_id;
+    std::cerr << std::endl;
+    return 0;
+  }
+
+  hipDeviceProp_t device_properties;
+  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
+  if (device_properties.cooperativeLaunch == 0) {
+    std::cerr << "Cooperative group support not available in ";
+    std::cerr << "device properties." << std::endl;
+    return 0;
+  }
+  if (device_properties.cooperativeMultiDeviceLaunch == 0) {
+    std::cerr << "Multi-GPU cooperative group support not available in ";
+    std::cerr << "device properties." << std::endl;
+    return 0;
+  }
+  return 1;
+}
+
+// Validates the values one device recorded for every (iteration, block) pair.
+//
+// host_buffer holds `loops` consecutive groups of `warps` entries. An entry in
+// group i may be at most (i + 1) * warps * num_devs; anything larger is
+// reported on stderr as a barrier failure.
+//
+// Returns 0 when every entry is within its bound, -1 on the first violation.
+static int verify_barrier_buffer(unsigned int loops, unsigned int warps,
+                                 unsigned int *host_buffer,
+                                 unsigned int num_devs) {
+  const unsigned int per_round_growth = warps * num_devs;
+  unsigned int ceiling = 0;
+  for (unsigned int round = 0; round < loops; ++round) {
+    ceiling += per_round_growth;
+    for (unsigned int lane = 0; lane < warps; ++lane) {
+      const unsigned int slot = round * warps + lane;
+      const unsigned int observed = host_buffer[slot];
+      if (observed <= ceiling) {
+        continue;
+      }
+      std::cerr << "Barrier failure!" << std::endl;
+      std::cerr << "    Buffer entry " << slot
+                << " contains the value " << observed
+                << " but it should not be more than "
+                << ceiling << std::endl;
+      return -1;
+    }
+  }
+  std::cout << "\tBarriers work properly!" << std::endl;
+  return 0;
+}
+
+// Checks one entry of the array that the GPUs update together.
+//
+// The expected value is rebuilt on the host by replaying the update sequence:
+// starting from 0, add 2 on even iterations and double on odd iterations, for
+// `loops` iterations in total.
+//
+// loops:     number of iterations the kernel executed.
+// array_val: value read back from the shared array.
+// Returns 0 when array_val equals the replayed value, -1 otherwise.
+static int verify_multi_gpu_buffer(unsigned int loops, unsigned int array_val) {
+  unsigned int expected = 0;
+  unsigned int step = 0;
+  while (step < loops) {
+    expected = (step % 2 == 0) ? (expected + 2) : (expected * 2);
+    ++step;
+  }
+  std::cout << "Desired value is " << expected << std::endl;
+
+  if (array_val == expected) {
+    std::cout << "\tMulti-GPU barriers appear to work here." << std::endl;
+    return 0;
+  }
+
+  std::cerr << "ERROR! Multi-grid barrier does not appear to work."
+            << std::endl;
+  std::cerr << "Expected the multi-GPUs to work together to produce "
+            << "the value " << expected << std::endl;
+  std::cerr << "However, the entry returned from the multi-GPU "
+            << "kernel was " << array_val << std::endl;
+  return -1;
+}
+
+// Kernel that exercises both grid.sync() (within one device's grid) and
+// mgrid.sync() (across all grids of a multi-device cooperative launch).
+//
+// atomic_val:   counter that thread 0 of every block bumps once per iteration
+//               with atomicInc.
+// global_array: array indexed by grid rank that the participating grids
+//               update in turn; see the add/double sequence below.
+// array:        output buffer; for iteration i, the block with index b writes
+//               the value returned by atomicInc to array[i * gridDim.x + b].
+// loops:        number of iterations to run.
+__global__ void
+test_kernel(unsigned int *atomic_val, unsigned int *global_array,
+            unsigned int *array, uint32_t loops) {
+  cooperative_groups::grid_group grid = cooperative_groups::this_grid();
+  cooperative_groups::multi_grid_group mgrid =
+                      cooperative_groups::this_multi_grid();
+  unsigned rank = grid.thread_rank();
+  unsigned global_rank = mgrid.thread_rank();
+
+  // First output slot of this block; advanced by gridDim.x every iteration.
+  int offset = blockIdx.x;
+  for (int i = 0; i < loops; i++) {
+    // Make the last thread run way behind everyone else.
+    // If the grid barrier below fails, then the other threads may hit the
+    // atomicInc instruction many times before the last thread ever gets
+    // to it.
+    // As such, without the barrier, the last array entry will eventually
+    // contain a very large value, defined by however many times the other
+    // wavefronts make it through this loop.
+    // If the barrier works, then it will likely contain some number
+    // near "total number of blocks". It will be the last wavefront to
+    // reach the atomicInc, but everyone will have only hit the atomic once.
+    if (rank == (grid.size() - 1)) {
+      // Busy-wait until clock64() has advanced by 1000000 ticks.
+      long long start_clock = clock64();
+      while (clock64() < (start_clock + 1000000)) {}
+    }
+    // Only one thread per block touches the counter and the output buffer.
+    if (threadIdx.x == 0) {
+      array[offset] = atomicInc(atomic_val, UINT_MAX);
+    }
+    grid.sync();
+
+    // Make the last thread in the entire multi-grid run way behind
+    // everyone else.
+    // If the mgrid barrier below fails, then the two global_array entries
+    // will end up being out of sync, because the intermingling of adds
+    // and multiplies will not be aligned between the two GPUs.
+    if (global_rank == (mgrid.size() - 1)) {
+      long long start_clock = clock64();
+      while (clock64() < (start_clock + 100000000)) {}
+    }
+    // When i is a multiple of the number of grids (the even iterations when
+    // two grids take part), add 2 to this grid's own array entry.
+    // Otherwise double the entry at (grid_rank + i) % num_grids, which is the
+    // partner's entry when two grids take part.
+    // Only the last thread of each grid performs the update.
+    unsigned grid_rank = mgrid.grid_rank();
+    unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids();
+    if (rank == (grid.size() - 1)) {
+      if (i % mgrid.num_grids() == 0) {
+        global_array[grid_rank] += 2;
+      } else {
+        global_array[inter_gpu_offset] *= 2;
+      }
+    }
+    // Multi-grid barrier: every grid finishes its update before any grid
+    // starts the next iteration.
+    mgrid.sync();
+    offset += gridDim.x;
+  }
+}
+
+// Runs the multi-device cooperative-launch barrier test on every pair of
+// adjacent devices (d, d + 1).
+//
+// For each pair the kernel is launched on both devices with
+// hipLaunchCooperativeKernelMultiDevice, the per-device output buffers are
+// copied back and checked with verify_barrier_buffer(), and the array the
+// GPUs update together is checked with verify_multi_gpu_buffer().
+//
+// All per-pair arrays below have exactly two elements and are indexed by the
+// position inside the pair (0 or 1), never by the absolute device ordinal.
+//
+// The verdict is reported once, after all pairs. passed() and failed() are
+// used throughout these tests as terminating calls, so a pair that has to be
+// skipped uses `continue` and cannot hide a failure recorded earlier.
+// NOTE(review): passed()/failed() come from test_common.h, which is not
+// visible here - confirm that they terminate the process.
+int main(int argc, char** argv) {
+  int device_num = 0, flag = 0;
+  uint32_t loops = 2;
+  uint32_t warps = 10;
+  uint32_t block_size = 1;
+  HIPCHECK(hipGetDeviceCount(&device_num));
+  if (device_num < 2) {
+    std::cout << "This test needs atleast two gpus but found only ";
+    std::cout << device_num << std::endl;
+    std::cout << "Hence skipping the test with pass result\n";
+    passed();
+  }
+
+  for (int d = 0; d < (device_num - 1); ++d) {
+    std::cout << "First device number: " << d << std::endl;
+    std::cout << "Second device number: " << (d + 1) << std::endl;
+    std::cout << "Loops: " << loops << std::endl;
+    std::cout << "Warps: " << warps << std::endl;
+    std::cout << "Block size: " << block_size << std::endl;
+
+    /*************************************************************************/
+    /* Test whether target device supports cooperative groups ****************/
+    bool pair_supported = true;
+    for (int i = 0; i < 2; i++) {
+      if (!cooperative_groups_support(d + i)) {
+        pair_supported = false;
+        break;
+      }
+    }
+    if (!pair_supported) {
+      // Skip only this pair; results of other pairs must still be reported.
+      std::cout << "Skipping the test with Pass result.\n";
+      continue;
+    }
+
+    /*************************************************************************/
+    /* Test whether the requested size will fit on the GPU *******************/
+    // Use the smaller warp size and SM count of the two devices so that the
+    // same launch configuration fits on both.
+    int warp_size = INT_MAX;
+    int num_sm = INT_MAX;
+    for (int i = 0; i < 2; i++) {
+      hipDeviceProp_t device_properties;
+      HIPCHECK(hipGetDeviceProperties(&device_properties, (d + i)));
+      if (device_properties.warpSize < warp_size) {
+        warp_size = device_properties.warpSize;
+      }
+      if (device_properties.multiProcessorCount < num_sm) {
+        num_sm = device_properties.multiProcessorCount;
+      }
+      std::cout << "Device " << (d + i);
+      std::cout << " name: " << device_properties.name << std::endl;
+    }
+    std::cout << std::endl;
+
+    const int num_threads_in_block = block_size * warp_size;
+
+    // Calculate the device occupancy to know how many blocks can be run.
+    int max_blocks_per_sm = INT_MAX;
+    for (int i = 0; i < 2; i++) {
+      int blocks_on_this_device = 0;
+      HIPCHECK(hipSetDevice((d + i)));
+      HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
+              &blocks_on_this_device, test_kernel, num_threads_in_block,
+              0));
+      if (blocks_on_this_device < max_blocks_per_sm) {
+        max_blocks_per_sm = blocks_on_this_device;
+      }
+    }
+
+    const int requested_blocks = warps / block_size;
+    if (requested_blocks > max_blocks_per_sm * num_sm) {
+      std::cerr << "Requesting to run " << requested_blocks << " blocks, ";
+      std::cerr << "but we can only guarantee to simultaneously run ";
+      std::cerr << (max_blocks_per_sm * num_sm) << std::endl;
+      failed("");
+    }
+
+    /*************************************************************************/
+    /* Set up data to pass into the kernel ***********************************/
+    // Each block will output a single value per loop.
+    const uint32_t total_buffer_len = requested_blocks * loops;
+    const size_t buffer_bytes = total_buffer_len * sizeof(unsigned int);
+
+    // Per-device resources, indexed by position inside the pair (0 or 1).
+    unsigned int *host_buffer[2];
+    unsigned int *kernel_buffer[2];
+    unsigned int *kernel_atomic[2];
+    hipStream_t streams[2];
+    for (int i = 0; i < 2; i++) {
+      host_buffer[i] = static_cast<unsigned int*>(
+          calloc(total_buffer_len, sizeof(unsigned int)));
+      if (host_buffer[i] == nullptr) {
+        failed("Unable to allocate the host buffer\n");
+      }
+      HIPCHECK(hipSetDevice((d + i)));
+      HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_buffer[i]),
+                         buffer_bytes));
+      HIPCHECK(hipMemcpy(kernel_buffer[i], host_buffer[i], buffer_bytes,
+                         hipMemcpyHostToDevice));
+      HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_atomic[i]),
+                         sizeof(unsigned int)));
+      HIPCHECK(hipMemset(kernel_atomic[i], 0, sizeof(unsigned int)));
+      HIPCHECK(hipStreamCreate(&streams[i]));
+    }
+
+    // Array updated by both devices; allocated with hipHostMalloc so that a
+    // single pointer can be handed to both kernels.
+    unsigned int* global_array;
+    HIPCHECK(hipHostMalloc(reinterpret_cast<void**>(&global_array),
+                           2 * sizeof(unsigned int), 0));
+    HIPCHECK(hipMemset(global_array, 0, 2 * sizeof(unsigned int)));
+
+    /*************************************************************************/
+    /* Launch the kernels ****************************************************/
+    std::cout << "Launching a kernel with " << warps << " warps ";
+    std::cout << "in " << requested_blocks << " thread blocks.";
+    std::cout << std::endl;
+
+    void *dev_params[2][4];
+    hipLaunchParams md_params[2];
+    for (int i = 0; i < 2; i++) {
+      dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
+      dev_params[i][1] = reinterpret_cast<void*>(&global_array);
+      dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
+      dev_params[i][3] = reinterpret_cast<void*>(&loops);
+      md_params[i].func = reinterpret_cast<void*>(test_kernel);
+      md_params[i].gridDim = requested_blocks;
+      md_params[i].blockDim = num_threads_in_block;
+      md_params[i].sharedMem = 0;
+      md_params[i].stream = streams[i];
+      md_params[i].args = dev_params[i];
+    }
+
+    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
+    // Wait for BOTH devices; hipDeviceSynchronize() only covers the device
+    // that is current when it is called.
+    for (int dev = 0; dev < 2; dev++) {
+      HIPCHECK(hipSetDevice(d + dev));
+      HIPCHECK(hipDeviceSynchronize());
+    }
+
+    /*************************************************************************/
+    /* Read back the buffers and print out its data **************************/
+    for (int dev = 0; dev < 2; dev++) {
+      // Index by position in the pair: the arrays only have two elements.
+      HIPCHECK(hipMemcpy(host_buffer[dev], kernel_buffer[dev], buffer_bytes,
+                         hipMemcpyDeviceToHost));
+    }
+
+    for (unsigned int i = 0; i < loops; i++) {
+      for (int dev = 0; dev < 2; dev++) {
+        std::cout << "+++++++++++++++++ Device " << (d + dev);
+        std::cout << "+++++++++++++++++" << std::endl;
+        for (int j = 0; j < requested_blocks; j++) {
+          // Print the same index that is read, so label and value match.
+          const unsigned int entry = i * requested_blocks + j;
+          std::cout << "Buffer entry " << entry;
+          std::cout << " (written by warp " << j << ")";
+          std::cout << " is " << host_buffer[dev][entry];
+          std::cout << std::endl;
+        }
+      }
+      std::cout << "==========================\n";
+    }
+    for (int dev = 0; dev < 2; dev++) {
+      std::cout << "Testing output from device " << (d + dev) << std::endl;
+      int local_ret_val = verify_barrier_buffer(loops, requested_blocks,
+                                                host_buffer[dev], 2);
+      if (local_ret_val == -1) {
+        flag = 1;
+      }
+    }
+
+    std::cout << std::endl << "The multi-GPU shared updates contain:";
+    std::cout << std::endl;
+    for (int i = 0; i < 2; i++) {
+      std::cout << "Entry " << i << ": ";
+      std::cout << global_array[i] << std::endl;
+    }
+    for (int dev = 0; dev < 2; dev++) {
+      std::cout << "Testing multi-GPU output for entry " << (d + dev);
+      std::cout << std::endl;
+      int local_ret_val = verify_multi_gpu_buffer(loops, global_array[dev]);
+      if (local_ret_val) {
+        flag = 1;
+      }
+    }
+
+    // Release everything owned by this pair before moving to the next one.
+    for (int k = 0; k < 2; ++k) {
+      HIPCHECK(hipSetDevice(d + k));
+      HIPCHECK(hipFree(kernel_buffer[k]));
+      HIPCHECK(hipFree(kernel_atomic[k]));
+      HIPCHECK(hipStreamDestroy(streams[k]));
+      free(host_buffer[k]);
+    }
+    HIPCHECK(hipHostFree(global_array));
+  }
+
+  if (flag == 1) {
+    failed("");
+  } else {
+    passed();
+  }
+}
@@ -1,173 +1,173 @@
-/*
- * Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-/*
- * Test to compare
- * 1.pciBusID from hipDeviceGetPCIBusId and hipDeviceGetAttribute **
- * 2.{pciDomainID, pciBusID, pciDeviceID} values hipDeviceGetPCIBusId vs lspci **
- */
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
- * TEST_NAMED: %t  hipDeviceGetPCIBusId-vs-hipDeviceGetAttribute --tests 0x1
- * TEST_NAMED: %t  hipDeviceGetPCIBusId-vs-lspci --tests 0x2 EXCLUDE_HIP_PLATFORM nvcc
- * HIT_END
- */
-
-#include "test_common.h"
-#define MAX_DEVICE_LENGTH 20
-
-static bool getPciBusId(int deviceCount, char hipDeviceList[][MAX_DEVICE_LENGTH]) {
-  for (int i = 0; i < deviceCount; i++) {
-    HIPCHECK(hipDeviceGetPCIBusId(hipDeviceList[i], MAX_DEVICE_LENGTH, i));
-  }
-  return true;
-}
-
-bool comparePciBusIDWithHipDeviceGetAttribute() {
-  bool testResult = true;
-  int deviceCount = 0;
-  HIPCHECK(hipGetDeviceCount(&deviceCount));
-  HIPASSERT(deviceCount != 0);
-  printf("No.of gpus in the system: %d\n", deviceCount);
-  char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
-  char pciDeviceList[deviceCount][MAX_DEVICE_LENGTH];
-
-  getPciBusId(deviceCount, hipDeviceList);
-
-  for (int i = 0; i < deviceCount; i++) {
-    int pciBusID = -1;
-    int pciDeviceID = -1;
-    int pciDomainID = -1;
-    int tempPciBusId = -1;
-    sscanf(hipDeviceList[i], "%04x:%02x:%02x", &pciDomainID, &pciBusID,
-           &pciDeviceID);
-    HIPCHECK(hipDeviceGetAttribute(&tempPciBusId, hipDeviceAttributePciBusId, i));
-    if (pciBusID != tempPciBusId) {
-      testResult = false;
-      printf("pciBusID from hipDeviceGetPCIBusId mismatched to that from "
-             "hipDeviceGetAttribute for gpu %d\n", i);
-    }
-  }
-
-  printf("pciBusID output of both hipDeviceGetPCIBusId and"
-         " hipDeviceGetAttribute matched for all gpus\n");
-  return testResult;
-}
-
-bool compareHipDeviceGetPCIBusIdWithLspci() {
-  FILE *fpipe;
-  bool testResult = false;
-
-  {
-    // Check if lspci is installed, if not, don't proceed
-    char const *cmd = "lspci --version";
-    char *lspciCheck;
-    char temp[20];
-    fpipe = popen(cmd, "r");
-
-    if (fpipe == nullptr) {
-      printf("Unable to create command file\n");
-      return testResult;
-    }
-
-    lspciCheck = fgets(temp, 20, fpipe);
-    pclose(fpipe);
-
-    if (!lspciCheck) {
-      printf("lspci not found. Skipping the test\n");
-      return true;
-    }
-  }
-
-  int deviceCount = 0;
-  HIPCHECK(hipGetDeviceCount(&deviceCount));
-  HIPASSERT(deviceCount != 0);
-  printf("No.of gpus in the system: %d\n", deviceCount);
-  char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
-  char pciDeviceList[deviceCount][MAX_DEVICE_LENGTH];
-
-  getPciBusId(deviceCount, hipDeviceList);
-
-  // Get lspci device list and compare with hip device list
-#if defined(__CUDA_ARCH__)
-  char const *command = "lspci -D | grep controller | grep NVIDIA | "
-                        "cut -d ' ' -f 1";
-#else
-  char const *command = "lspci -D | grep controller | grep AMD/ATI | "
-                        "cut -d ' ' -f 1";
-#endif
-  fpipe = popen(command, "r");
-
-  if (fpipe == nullptr) {
-    printf("Unable to create command file\n");
-    return testResult;
-  }
-
-  int index = 0;
-  int deviceMatchCount = 0;
-
-  while (fgets(pciDeviceList[index], sizeof(pciDeviceList[index]), fpipe)) {
-    bool bMatchFound = false;
-    for (int deviceNo = 0; deviceNo < deviceCount; deviceNo++) {
-      if (!strncmp(pciDeviceList[index], hipDeviceList[deviceNo], 10)) {
-        deviceMatchCount++;
-        bMatchFound = true;
-      }
-    }
-    if (bMatchFound == false) {
-      printf("PCI device: %s is not reported by HIP\n", pciDeviceList[index]);
-    }
-    index++;
-  }
-
-  pclose(fpipe);
-
-  if (deviceMatchCount == deviceCount) {
-    printf("hip and lspci output for {pciDomainID, pciBusID, pciDeviceID} "
-           "matched for all gpus\n");
-    testResult = true;
-  } else {
-    printf("Mismatch in number GPUs reported by HIP with lscpi\n");
-  }
-  return testResult;
-}
-
-int main(int argc, char* argv[]) {
-  bool testResult = true;
-  HipTest::parseStandardArguments(argc, argv, true);
-
-  if (p_tests & 0x1) {
-    testResult &= comparePciBusIDWithHipDeviceGetAttribute();
-  }
-
-  if (p_tests & 0x2) {
-#ifdef __unix__
-    testResult &= compareHipDeviceGetPCIBusIdWithLspci();
-#else
-    printf("Detected non-linux OS. Skipping the test\n");
-#endif
-  }
-
-  if (testResult) {
-    passed();
-  } else {
-    failed("one or more tests failed\n");
-  }
-}
+/*
+ * Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/*
+ * Test to compare
+ * 1.pciBusID from hipDeviceGetPCIBusId and hipDeviceGetAttribute **
+ * 2.{pciDomainID, pciBusID, pciDeviceID} values hipDeviceGetPCIBusId vs lspci **
+ */
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
+ * TEST_NAMED: %t  hipDeviceGetPCIBusId-vs-hipDeviceGetAttribute --tests 0x1
+ * TEST_NAMED: %t  hipDeviceGetPCIBusId-vs-lspci --tests 0x2 EXCLUDE_HIP_PLATFORM nvcc
+ * HIT_END
+ */
+
+#include "test_common.h"
+#define MAX_DEVICE_LENGTH 20
+
+// Queries hipDeviceGetPCIBusId for every device ordinal below deviceCount
+// and stores the resulting string in the matching row of hipDeviceList.
+// Each query is wrapped in HIPCHECK; the function itself always returns true.
+static bool getPciBusId(int deviceCount, char hipDeviceList[][MAX_DEVICE_LENGTH]) {
+  int i = 0;
+  while (i < deviceCount) {
+    HIPCHECK(hipDeviceGetPCIBusId(hipDeviceList[i], MAX_DEVICE_LENGTH, i));
+    ++i;
+  }
+  return true;
+}
+
+// Compares, for every device, the bus number parsed out of the string
+// returned by hipDeviceGetPCIBusId with the value reported by
+// hipDeviceGetAttribute(hipDeviceAttributePciBusId).
+//
+// Returns true only when the two values agree for every device. The
+// "matched for all gpus" summary is printed only in that case.
+bool comparePciBusIDWithHipDeviceGetAttribute() {
+  bool testResult = true;
+  int deviceCount = 0;
+  HIPCHECK(hipGetDeviceCount(&deviceCount));
+  HIPASSERT(deviceCount != 0);
+  printf("No.of gpus in the system: %d\n", deviceCount);
+  char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
+
+  getPciBusId(deviceCount, hipDeviceList);
+
+  for (int i = 0; i < deviceCount; i++) {
+    int pciBusID = -1;
+    int pciDeviceID = -1;
+    int pciDomainID = -1;
+    int tempPciBusId = -1;
+    // The string is parsed as "<domain>:<bus>:<device>" in hexadecimal.
+    const int parsedFields = sscanf(hipDeviceList[i], "%04x:%02x:%02x",
+                                    &pciDomainID, &pciBusID, &pciDeviceID);
+    if (parsedFields != 3) {
+      // Without all three fields pciBusID cannot be trusted.
+      testResult = false;
+      printf("Unable to parse hipDeviceGetPCIBusId output '%s' for gpu %d\n",
+             hipDeviceList[i], i);
+      continue;
+    }
+    HIPCHECK(hipDeviceGetAttribute(&tempPciBusId, hipDeviceAttributePciBusId, i));
+    if (pciBusID != tempPciBusId) {
+      testResult = false;
+      printf("pciBusID from hipDeviceGetPCIBusId mismatched to that from "
+             "hipDeviceGetAttribute for gpu %d\n", i);
+    }
+  }
+
+  // Report success only when no device disagreed.
+  if (testResult) {
+    printf("pciBusID output of both hipDeviceGetPCIBusId and"
+           " hipDeviceGetAttribute matched for all gpus\n");
+  }
+  return testResult;
+}
+
+// Cross-checks the PCI ids reported by hipDeviceGetPCIBusId against the
+// device list printed by lspci.
+//
+// Returns true when lspci is not available (test skipped) or when the number
+// of lspci entries that match a HIP device equals the HIP device count.
+// Returns false when a pipe cannot be opened or when the counts differ.
+bool compareHipDeviceGetPCIBusIdWithLspci() {
+  FILE *fpipe;
+  bool testResult = false;
+
+  {
+    // Check if lspci is installed, if not, don't proceed
+    char const *cmd = "lspci --version";
+    char *lspciCheck;
+    char temp[20];
+    fpipe = popen(cmd, "r");
+
+    if (fpipe == nullptr) {
+      printf("Unable to create command file\n");
+      return testResult;
+    }
+
+    // Any output at all is taken as proof that lspci ran.
+    lspciCheck = fgets(temp, 20, fpipe);
+    pclose(fpipe);
+
+    if (!lspciCheck) {
+      printf("lspci not found. Skipping the test\n");
+      return true;
+    }
+  }
+
+  int deviceCount = 0;
+  HIPCHECK(hipGetDeviceCount(&deviceCount));
+  HIPASSERT(deviceCount != 0);
+  printf("No.of gpus in the system: %d\n", deviceCount);
+  char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
+
+  getPciBusId(deviceCount, hipDeviceList);
+
+  // Get lspci device list and compare with hip device list
+  // NOTE(review): __CUDA_ARCH__ is normally defined only during device
+  // compilation passes, so host code presumably always takes the AMD/ATI
+  // branch - confirm which platform macro is intended here.
+#if defined(__CUDA_ARCH__)
+  char const *command = "lspci -D | grep controller | grep NVIDIA | "
+                        "cut -d ' ' -f 1";
+#else
+  char const *command = "lspci -D | grep controller | grep AMD/ATI | "
+                        "cut -d ' ' -f 1";
+#endif
+  fpipe = popen(command, "r");
+
+  if (fpipe == nullptr) {
+    printf("Unable to create command file\n");
+    return testResult;
+  }
+
+  // A single line buffer is reused for every lspci entry. lspci may list more
+  // entries than HIP reports devices, so the lines must not be stored in an
+  // array sized by deviceCount.
+  char pciDevice[MAX_DEVICE_LENGTH];
+  int deviceMatchCount = 0;
+
+  while (fgets(pciDevice, sizeof(pciDevice), fpipe)) {
+    bool bMatchFound = false;
+    for (int deviceNo = 0; deviceNo < deviceCount; deviceNo++) {
+      // Only the first 10 characters of both strings are compared.
+      if (!strncmp(pciDevice, hipDeviceList[deviceNo], 10)) {
+        deviceMatchCount++;
+        bMatchFound = true;
+      }
+    }
+    if (bMatchFound == false) {
+      printf("PCI device: %s is not reported by HIP\n", pciDevice);
+    }
+  }
+
+  pclose(fpipe);
+
+  if (deviceMatchCount == deviceCount) {
+    printf("hip and lspci output for {pciDomainID, pciBusID, pciDeviceID} "
+           "matched for all gpus\n");
+    testResult = true;
+  } else {
+    printf("Mismatch in number GPUs reported by HIP with lscpi\n");
+  }
+  return testResult;
+}
+
+// Entry point: bit 0 of p_tests selects the hipDeviceGetAttribute comparison,
+// bit 1 selects the lspci comparison (only built when __unix__ is defined).
+int main(int argc, char* argv[]) {
+  HipTest::parseStandardArguments(argc, argv, true);
+  bool allPassed = true;
+
+  if ((p_tests & 0x1) != 0) {
+    if (!comparePciBusIDWithHipDeviceGetAttribute()) {
+      allPassed = false;
+    }
+  }
+
+  if ((p_tests & 0x2) != 0) {
+#ifdef __unix__
+    if (!compareHipDeviceGetPCIBusIdWithLspci()) {
+      allPassed = false;
+    }
+#else
+    printf("Detected non-linux OS. Skipping the test\n");
+#endif
+  }
+
+  if (!allPassed) {
+    failed("one or more tests failed\n");
+  } else {
+    passed();
+  }
+}
@@ -25,7 +25,7 @@
 */

 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
 * TEST_NAMED: %t hipSetGetDevice-invalidDevice
 * TEST_NAMED: %t hipSetGetDevice-allValidDevice
 * TEST_NAMED: %t hipSetGetDevice-validDev1 --computeDevCnt 1
@@ -0,0 +1,227 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <semaphore.h>
+#include <unistd.h>
+#include "test_common.h"
+
+#ifdef __linux__
+// Named semaphores used for the parent/child handshake in
+// IpcMemHandleTest::Test(): the parent posts sem_ob1 and waits on sem_ob2,
+// the child waits on sem_ob1 and posts sem_ob2.
+sem_t *sem_ob1 = NULL, *sem_ob2 = NULL;
+// Record stored in the mmap()ed region that both processes read and write.
+typedef struct mem_handle {
+  int device;                   // device ordinal the parent allocated on
+  hipIpcMemHandle_t memHandle;  // handle filled in by hipIpcGetMemHandle
+  bool IfTestPassed;            // set to false when a check or call fails
+} hip_ipc_t;
+
+// Test fixture: the constructor sets up semaphores, the shared record and the
+// host buffers; Test() forks and runs the parent/child exchange.
+class IpcMemHandleTest {
+ public:
+  // Cleared by the constructor when any resource setup step fails.
+  bool InitFlag = true;
+  // Shared record obtained from mmap() in the constructor.
+  hip_ipc_t *shrd_mem = NULL;
+  // Result of fork() in Test().
+  pid_t pid;
+  // Element count and byte size of every buffer used by the test.
+  size_t N = 1024;
+  size_t Nbytes = N * sizeof(int);
+  // A_d: device buffer allocated by the parent; out: last sem_* return value.
+  int *A_d = NULL, out = 0;
+  // A_h: host source buffer filled with 123; C_h: host read-back buffer.
+  int *A_h, *C_h;
+  int Num_devices = 0, Data_mismatch, CanAccessPeer = 0;
+  // Ad1: pointer returned by hipIpcOpenMemHandle in the child;
+  // Ad2: device buffer allocated by the child.
+  int *Ad1 = NULL, *Ad2 = NULL;
+  IpcMemHandleTest();
+  bool Test();
+  ~IpcMemHandleTest();
+};
+
+
+// Forks a child and exchanges one IPC memory handle per device with it.
+//
+// Parent: for every device, allocates A_d, publishes its IPC handle and the
+// device ordinal in shrd_mem, copies A_h into A_d, posts sem_ob1 and waits on
+// sem_ob2 before freeing A_d.
+// Child: for every handle, opens it on each device, copies the data when
+// hipDeviceCanAccessPeer reports access, verifies that every element is 123
+// and records a failure in shrd_mem->IfTestPassed.
+//
+// Returns the final value of shrd_mem->IfTestPassed, or false when the
+// constructor could not set up its resources.
+bool IpcMemHandleTest::Test() {
+  if (InitFlag == false) {
+    // Abort the test if the initialization fails
+    printf("Resource initialization failed. Hence test skipped!");
+    return false;
+  }
+  pid = fork();
+  if (pid < 0) {
+    // No child exists, so the handshake must be skipped: waiting on sem_ob2
+    // would otherwise block forever.
+    printf("fork() call failed. Hence test aborted!\n");
+    shrd_mem->IfTestPassed = false;
+  } else if (pid != 0) {
+    // Parent process
+    HIPCHECK(hipGetDeviceCount(&Num_devices));
+    for (int i = 0; i < Num_devices; ++i) {
+      if (shrd_mem->IfTestPassed == true) {
+        HIPCHECK(hipSetDevice(i));
+        HIPCHECK(hipMalloc(&A_d, Nbytes));
+        HIPCHECK(hipIpcGetMemHandle((hipIpcMemHandle_t *) &shrd_mem->memHandle,
+                                    A_d));
+        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
+        shrd_mem->device = i;
+        if ((out=sem_post(sem_ob1)) == -1) {
+          // Need to use inline function to release resources.
+          shrd_mem->IfTestPassed = false;
+          failed("sem_post() call failed in parent process.");
+        }
+        if ((out=sem_wait(sem_ob2)) == -1) {
+          shrd_mem->IfTestPassed = false;
+          failed("sem_wait() call failed in parent process.");
+        }
+        HIPCHECK(hipFree(A_d));
+      }
+    }
+  } else {
+    // Child process
+    HIPCHECK(hipGetDeviceCount(&Num_devices));
+    for (int j = 0; j < Num_devices; ++j) {
+      if ((out=sem_wait(sem_ob1)) == -1) {
+        shrd_mem->IfTestPassed = false;
+        printf("sem_wait() call failed in child process.");
+        if ((out=sem_post(sem_ob2)) == -1) {
+          printf("sem_post() call on sem_ob2 failed");
+          exit(1);
+        }
+        // The handle in shrd_mem is not known to be valid; stop here.
+        break;
+      }
+      for (int i = 0; i < Num_devices; ++i) {
+        Data_mismatch = 0;
+        HIPCHECK(hipSetDevice(i));
+        HIPCHECK(hipMalloc(&Ad2, Nbytes));
+        HIPCHECK(hipIpcOpenMemHandle((void **) &Ad1, shrd_mem->memHandle,
+                                     hipIpcMemLazyEnablePeerAccess));
+        HIPCHECK(hipDeviceCanAccessPeer(&CanAccessPeer, i, shrd_mem->device));
+        if (CanAccessPeer == 1) {
+          HIPCHECK(hipMemcpy(Ad2, Ad1, Nbytes, hipMemcpyDeviceToDevice));
+          // C_h is a host buffer, so this copy is device-to-host.
+          HIPCHECK(hipMemcpy(C_h, Ad2, Nbytes, hipMemcpyDeviceToHost));
+          for (size_t k = 0; k < N; ++k) {
+            if (C_h[k] != 123)
+              Data_mismatch++;
+          }
+          if (Data_mismatch != 0) {
+            printf("Data mismatch found when data copied from Ipc memhandle");
+            printf(" to Device: %d\n", i);
+            shrd_mem->IfTestPassed = false;
+          }
+          memset(reinterpret_cast<void*>(C_h), 0, Nbytes);
+          // Count the second check on its own so a failure of the first
+          // check is not reported a second time.
+          Data_mismatch = 0;
+          // Checking if the data obtained from Ipc shared memory is consistent
+          HIPCHECK(hipMemcpy(C_h, Ad1, Nbytes, hipMemcpyDeviceToHost));
+          for (size_t k = 0; k < N; ++k) {
+            if (C_h[k] != 123)
+              Data_mismatch++;
+          }
+          if (Data_mismatch != 0) {
+            printf("Data mismatch found when data copied from Ipc memhandle");
+            printf(" Host.\n");
+            shrd_mem->IfTestPassed = false;
+          }
+        }
+        HIPCHECK(hipIpcCloseMemHandle(reinterpret_cast<void*>(Ad1)));
+        // Ad2 is allocated once per device, so it is released once per device.
+        HIPCHECK(hipFree(Ad2));
+        Ad2 = NULL;
+      }
+      if ((out=sem_post(sem_ob2)) == -1) {
+        shrd_mem->IfTestPassed = false;
+        printf("sem_post() call on sem_ob2 failed");
+        exit(1);
+      }
+      if (shrd_mem->IfTestPassed == false) {
+        // The parent stops posting sem_ob1 once the flag is cleared, so
+        // waiting for another round would never return.
+        break;
+      }
+    }
+    exit(0);
+  }
+
+  if ((out = sem_unlink("/my-sem-object1")) == -1) {
+    printf("sem_unlink() call on /my-sem-object1 failed");
+  }
+  if ((out = sem_unlink("/my-sem-object2")) == -1) {
+    printf("sem_unlink() call on /my-sem-object2 failed");
+  }
+  if (pid > 0) {
+    // Reap the child only when one was actually created.
+    int status;
+    waitpid(pid, &status, 0);
+  }
+  return shrd_mem->IfTestPassed;
+}
+
+// Prepares everything Test() needs: removes stale semaphore files, creates
+// the two named semaphores, maps the shared record and allocates the host
+// buffers (A_h is filled with 123). Any failure clears InitFlag so that
+// Test() refuses to run.
+IpcMemHandleTest::IpcMemHandleTest() {
+  std::string cmd_line = "rm -rf /dev/shm/sem.my-sem-object*";
+  int res = system(cmd_line.c_str());
+  if (res == -1) {
+    InitFlag = false;
+    printf("System call to remove existing shared objects failed!");
+  }
+  if ((sem_ob1 = sem_open ("/my-sem-object1", O_CREAT|O_EXCL, 0660, 0)) ==
+      SEM_FAILED) {
+    InitFlag = false;
+    printf("Initialization of 1st semaphore object failed");
+  }
+  if ((sem_ob2 = sem_open ("/my-sem-object2", O_CREAT|O_EXCL, 0660, 0)) ==
+      SEM_FAILED) {
+    InitFlag = false;
+    printf("Initialization of 2nd semaphore object failed");
+  }
+
+  // mmap() signals failure with MAP_FAILED, not NULL. An anonymous mapping
+  // takes fd -1 for portability.
+  void *mapping = mmap(NULL, sizeof(hip_ipc_t),
+                       PROT_READ | PROT_WRITE,
+                       MAP_SHARED | MAP_ANONYMOUS,
+                       -1, 0);
+  if (mapping == MAP_FAILED) {
+    InitFlag = false;
+    shrd_mem = NULL;
+    printf("mmap() call failed!");
+  } else {
+    shrd_mem = static_cast<hip_ipc_t *>(mapping);
+    shrd_mem->IfTestPassed = true;
+  }
+  A_h = reinterpret_cast<int*>(malloc(Nbytes));
+  C_h = reinterpret_cast<int*>(malloc(Nbytes));
+  if (A_h == NULL || C_h == NULL) {
+    // Do not touch the buffers; Test() bails out on InitFlag == false.
+    InitFlag = false;
+    printf("malloc() call failed!");
+    return;
+  }
+  for (size_t i = 0; i < N; i++) {
+    A_h[i] = 123;
+  }
+}
+
+// Unmaps the shared record, frees the host buffers and passes the three
+// device pointers held by the object to hipFree.
+// NOTE(review): Test() already calls hipFree(A_d) inside the parent loop, and
+// Ad1 comes from hipIpcOpenMemHandle rather than hipMalloc, so these hipFree
+// calls may receive pointers that were already released - confirm.
+IpcMemHandleTest::~IpcMemHandleTest() {
+  munmap(shrd_mem, sizeof(hip_ipc_t));
+  HIPCHECK(hipFree((A_d)));
+  free(A_h);
+  free(C_h);
+  HIPCHECK(hipFree((Ad1)));
+  HIPCHECK(hipFree((Ad2)));
+}
+#endif
+
+// Entry point of the IPC memory handle test.
+//
+// On Linux the test forks:
+//  - the parent walks over every device, allocates memory, publishes the
+//    hipIpcMemHandle through the mmap()ed record, releases the child with
+//    sem_post() and waits until the child releases it again;
+//  - the child opens the handle with hipIpcOpenMemHandle, walks over every
+//    gpu doing device to device copies, checks the data, closes the handle
+//    with hipIpcCloseMemHandle and releases the parent.
+// On any other platform the test is reported as skipped.
+int main() {
+#ifdef __linux__
+  IpcMemHandleTest obj;
+  bool testOk = obj.Test();
+#else
+  printf("This is not a Linux platform. Hence Skipping the test!\n");
+  bool testOk = true;
+#endif
+  if (!testOk) {
+    failed("");
+  }
+  passed();
+}
@@ -0,0 +1,487 @@
+/*
+Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+Testcase Scenarios :
+
+ (TestCase 1)::
+ 1) Test hipMalloc() api passing zero size and confirming *ptr returning
+ nullptr. Also pass nullptr to hipFree() api.
+ 2) Pass maximum value of size_t for hipMalloc() api and make sure appropriate
+ error is returned.
+ 3) Check for hipMalloc() error code, passing invalid/null pointer.
+
+ (TestCase 2)::
+ 4) Regress hipMalloc()/hipFree() in loop for bigger chunk of allocation
+ with adequate number of iterations and later test for kernel execution on
+ default gpu.
+ 5) Regress hipMalloc()/hipFree() in loop while allocating smaller chunks
+ keeping maximum number of iterations and then run kernel code on default
+ gpu, perform data validation.
+
+ (TestCase 3)::
+ 6) Check hipMalloc() api adaptability when app creates small chunks of memory
+ continuously, stores it for later use and then frees it at later point
+ of time.
+
+ (TestCase 4)::
+ 7) Run hipMalloc() api/kernel code on same gpu in parallel from parent and child
+ processes, validate the results.
+
+ (TestCase 5)::
+ 8) Execute hipMalloc() api simultaneously on all the gpus by spawning multiple
+ child processes. Validate buffers allocated after running kernel code.
+
+ (TestCase 6)::
+ 9) Multithread Scenario : Exercise hipMalloc() api in parallel on all gpus from
+ multiple threads and regress the api.
+
+ (TestCases 2, 3, 4, 5, 6)::
+ 10) Validate memory usage with hipMemGetInfo() while regressing hipMalloc()
+ api. Check for any possible memory leaks.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
+ * TEST_NAMED: %t hipMalloc_ArgValidation  --tests 1
+ * TEST_NAMED: %t hipMalloc_LoopRegression_AllocFreeCycle --tests 2
+ * TEST_NAMED: %t hipMalloc_LoopRegression_AllocPool --tests 3
+ * TEST_NAMED: %t hipMallocChild_Concurrency_DefaultGpu --tests 4
+ * TEST_NAMED: %t hipMallocChild_Concurrency_MultiGpu --tests 5
+ * TEST_NAMED: %t hipMalloc_MultiThreaded_MultiGpu --tests 6
+ * HIT_END
+ */
+
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <vector>
+#include <limits>
+#include <atomic>
+
+#include "test_common.h"
+
+/* Max alloc/free iterations for bigger chunks */
+#define MAX_ALLOCFREE_BC (10000)
+
+/* Buffer size for alloc/free cycles */
+#define BUFF_SIZE_AF (5*1024*1024)
+
+/* Max alloc/free iterations for smaller chunks */
+#define MAX_ALLOCFREE_SC (5000000)
+
+/* Max alloc and pool iterations (TBD) */
+#define MAX_ALLOCPOOL_ITER (2000000)
+
+/**
+ * Runs a vector-add round trip on the given gpu and checks the result.
+ *
+ * The free/total memory reported by hipMemGetInfo is sampled before the
+ * buffers are created and again after they are released; any difference is
+ * reported as a possible leak.
+ *
+ * Returns true when the data check succeeds and both samples agree.
+ */
+bool validateMemoryOnGPU(int gpu) {
+  int *A_d, *B_d, *C_d;
+  int *A_h, *B_h, *C_h;
+  size_t prevAvl, prevTot;
+  size_t curAvl, curTot;
+  const size_t Nbytes = N * sizeof(int);
+  bool TestPassed = true;
+
+  HIPCHECK(hipSetDevice(gpu));
+  HIPCHECK(hipMemGetInfo(&prevAvl, &prevTot));
+  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+
+  HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
+  HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
+
+  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
+                  0, 0, static_cast<const int*>(A_d),
+                  static_cast<const int*>(B_d), C_d, N);
+
+  HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
+
+  // A result that converts to false from checkVectorADD is treated as success.
+  if (HipTest::checkVectorADD(A_h, B_h, C_h, N)) {
+    printf("%s : Validation FAILED for gpu %d from pid %d\n",
+        __func__, gpu, getpid());
+    TestPassed = false;
+  } else {
+    printf("Validation PASSED for gpu %d from pid %d\n", gpu, getpid());
+  }
+
+  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
+  HIPCHECK(hipMemGetInfo(&curAvl, &curTot));
+
+  const bool memoryRestored = (prevAvl == curAvl) && (prevTot == curTot);
+  if (!memoryRestored) {
+    printf("%s : Memory allocation mismatch observed."
+        "Possible memory leak.", __func__);
+    TestPassed = false;
+  }
+
+  return TestPassed;
+}
+
+/**
+ * Fetches Gpu device count.
+ *
+ * On Linux the count is queried in a forked child and sent back over a pipe;
+ * ROCR_VISIBLE_DEVICES and HIP_VISIBLE_DEVICES are removed from the
+ * environment before the fork. When the pipe or the fork cannot be created,
+ * *pdevCnt is set to 1. When the child does not report a complete value,
+ * *pdevCnt is set to 0.
+ *
+ * On other platforms hipGetDeviceCount is called directly.
+ */
+void getDeviceCount(int *pdevCnt) {
+#ifdef __linux__
+  int fd[2], val = 0;
+  pid_t childpid;
+
+  // disable visible_devices env from shell
+  unsetenv("ROCR_VISIBLE_DEVICES");
+  unsetenv("HIP_VISIBLE_DEVICES");
+
+  // create pipe descriptors
+  if (pipe(fd) == -1) {
+    // Without a pipe the child could not report back; use the same
+    // fallback as a failed fork().
+    *pdevCnt = 1;
+    return;
+  }
+
+  childpid = fork();
+
+  if (childpid > 0) {  // Parent
+    close(fd[1]);
+    // parent will wait to read the device cnt
+    if (read(fd[0], &val, sizeof(val)) != static_cast<ssize_t>(sizeof(val))) {
+      // Short read or error: discard whatever partial bytes arrived.
+      val = 0;
+    }
+
+    // close the read-descriptor
+    close(fd[0]);
+
+    // wait for exactly the child spawned above
+    waitpid(childpid, NULL, 0);
+
+    *pdevCnt = val;
+  } else if (!childpid) {  // Child
+    int devCnt = 1;
+    // writing only, no need for read-descriptor
+    close(fd[0]);
+
+    HIPCHECK(hipGetDeviceCount(&devCnt));
+    // send the value on the write-descriptor:
+    const bool sent = (write(fd[1], &devCnt, sizeof(devCnt)) ==
+                       static_cast<ssize_t>(sizeof(devCnt)));
+
+    // close the write descriptor:
+    close(fd[1]);
+    exit(sent ? 0 : 1);
+  } else {  // failure
+    // Release both pipe ends so the descriptors are not leaked.
+    close(fd[0]);
+    close(fd[1]);
+    *pdevCnt = 1;
+    return;
+  }
+
+#else
+  HIPCHECK(hipGetDeviceCount(pdevCnt));
+#endif
+}
+
+/**
+ * Stresses hipMalloc/hipFree on the given gpu in two phases.
+ *
+ * Phase 1 allocates and frees BUFF_SIZE_AF bytes up to MAX_ALLOCFREE_BC times
+ * and stops at the first iteration where the drop in free memory reported by
+ * hipMemGetInfo differs from the requested size.
+ * Phase 2 allocates and frees 16 bytes MAX_ALLOCFREE_SC times and then checks
+ * that hipMemGetInfo reports the same values as before the phase.
+ *
+ * Returns true when neither phase detected a discrepancy.
+ */
+bool regressAllocInLoop(int gpu) {
+  size_t tot, avail, ptot, pavail;
+  int *ptr;
+  bool TestPassed = true;
+
+  HIPCHECK(hipSetDevice(gpu));
+
+  // Phase 1: bigger chunks, accounting verified on every iteration
+  for (int iter = 0; iter < MAX_ALLOCFREE_BC; ++iter) {
+    size_t numBytes = BUFF_SIZE_AF;
+
+    HIPCHECK(hipMemGetInfo(&pavail, &ptot));
+    HIPCHECK(hipMalloc(&ptr, numBytes));
+    HIPCHECK(hipMemGetInfo(&avail, &tot));
+
+    const bool sizeMismatch = (pavail-avail != numBytes);
+    if (sizeMismatch) {
+      printf("LoopAllocation : Memory allocation of %6.2fMB"
+             "not matching with hipMemGetInfo - FAIL\n",
+              numBytes/(1024.0*1024.0));
+      TestPassed = false;
+    }
+
+    // The buffer is released on both paths before deciding whether to stop.
+    HIPCHECK(hipFree(ptr));
+    if (sizeMismatch) {
+      break;
+    }
+  }
+
+  // Phase 2: smaller chunks and max iters, accounting verified once at the end
+  HIPCHECK(hipMemGetInfo(&pavail, &ptot));
+
+  int remaining = MAX_ALLOCFREE_SC;
+  while (remaining > 0) {
+    size_t numBytes = 16;
+
+    HIPCHECK(hipMalloc(&ptr, numBytes));
+
+    HIPCHECK(hipFree(ptr));
+    --remaining;
+  }
+
+  HIPCHECK(hipMemGetInfo(&avail, &tot));
+
+  const bool memoryRestored = (pavail == avail) && (ptot == tot);
+  if (!memoryRestored) {
+    printf("LoopAllocation : Memory allocation mismatch observed."
+        "Possible memory leak.");
+    TestPassed = false;
+  }
+
+  return TestPassed;
+}
+
+/*
+ * Thread func to regress alloc and check data consistency
+ */
+
+// Shared verdict of all worker threads. It starts as true and is only ever
+// written with false, so one thread can never overwrite another thread's
+// failure with a stale "true".
+std::atomic<bool> g_thTestPassed(true);
+
+// Runs both the allocation regression and the data validation on the given
+// gpu and records a failure of either in g_thTestPassed.
+void threadFunc(int gpu) {
+  // Both checks always run, regardless of the first result.
+  const bool allocOk = regressAllocInLoop(gpu);
+  const bool dataOk = validateMemoryOnGPU(gpu);
+
+  if (!(allocOk && dataOk)) {
+    g_thTestPassed.store(false);
+  }
+
+  printf("thread execution status on gpu(%d) : %d\n", gpu, g_thTestPassed.load());
+}
+
+// Entry point: p_tests selects one of six scenarios.
+//   1 - argument validation of hipMalloc/hipFree
+//   2 - alloc/free loop regression followed by a kernel run on gpu 0
+//   3 - many small allocations kept alive, then released together
+//   4 - parent and forked child validate gpu 0 concurrently (Linux only)
+//   5 - one forked child per gpu validates its gpu (Linux only)
+//   6 - one thread per gpu runs the regression and validation
+// For scenarios 4 and 5 a child counts as successful only when it exited
+// normally with status 0; a child terminated by a signal is a failure.
+int main(int argc, char* argv[]) {
+  HipTest::parseStandardArguments(argc, argv, true);
+
+  if (p_tests == 1) {  // Arg validation
+    // Test hipMalloc for zero size
+    bool TestPassed = true;
+    int *ptr;
+
+    HIPCHECK(hipMalloc(&ptr, 0));
+
+    // ptr expected to be reset to null ptr
+    if (ptr) {
+      printf("ArgValidation : Failed in zero size test\n");
+      TestPassed &= false;
+    }
+
+    // Free null ptr
+    HIPCHECK(hipFree(ptr));
+
+    // Test hipMalloc for invalid arguments
+    hipError_t ret;
+
+    if ((ret = hipMalloc(NULL, 100)) != hipErrorInvalidValue) {
+      printf("ArgValidation : Inappropritate error value returned"
+          " for invalid argument. Error: '%s'(%d)\n",
+          hipGetErrorString(ret), ret);
+      TestPassed &= false;
+    }
+
+    // Test hipMalloc for Maximum value of size_t
+    if ((ret = hipMalloc(&ptr, std::numeric_limits<std::size_t>::max()))
+        != hipErrorMemoryAllocation) {
+      printf("ArgValidation : Invalid error returned for max size_t."
+          " Error: '%s'(%d)\n", hipGetErrorString(ret), ret);
+      TestPassed &= false;
+    }
+
+    if (TestPassed) {
+      passed();
+    } else {
+      failed("hipMalloc ArgumentValidation Failure!");
+    }
+
+  } else if (p_tests == 2) {  // Loop Regression Alloc/Free Cycle
+    bool TestPassed = true;
+
+    TestPassed &= regressAllocInLoop(0);
+    TestPassed &= validateMemoryOnGPU(0);
+
+    if (TestPassed) {
+      passed();
+    } else {
+      failed("hipMalloc_LoopRegression_AllocFreeCycle Failure!");
+    }
+
+  } else if (p_tests == 3) {  // Loop Regression Alloc and Pool
+    size_t avail, tot, pavail, ptot;
+    bool TestPassed = true;
+    hipError_t err;
+    int *ptr;
+
+    std::vector<int *> ptrlist;
+
+    HIPCHECK(hipMemGetInfo(&pavail, &ptot));
+
+    // Allocate small chunks of memory million times
+    for (int i = 0; i < MAX_ALLOCPOOL_ITER; i++) {  // Iterations TBD
+      if ((err = hipMalloc(&ptr, 10)) != hipSuccess) {
+        HIPCHECK(hipMemGetInfo(&avail, &tot));
+
+        printf("Loop regression pool allocation failure. "
+        "Total gpu memory : %6.2fMB, Free memory %6.2fMB iter %d error '%s'\n",
+        tot/(1024.0*1024.0), avail/(1024.0*1024.0), i, hipGetErrorString(err));
+
+        TestPassed &= false;
+        break;
+      }
+
+      // Store pointers allocated to emulate memory pool of app
+      ptrlist.push_back(ptr);
+    }
+
+    // Free ptrs at later point of time
+    for ( auto &t : ptrlist ) {
+      HIPCHECK(hipFree(t));
+    }
+
+    HIPCHECK(hipMemGetInfo(&avail, &tot));
+
+    TestPassed &= validateMemoryOnGPU(0);
+
+    if ((pavail != avail) || (ptot != tot)) {
+      printf("%s : Memory allocation mismatch observed. Possible memory leak.",
+          __func__);
+      TestPassed &= false;
+    }
+
+    if (TestPassed) {
+      passed();
+    } else {
+      failed("hipMalloc_LoopRegression_AllocPool failure!");
+    }
+
+  } else if (p_tests == 4) {
+    bool TestPassed = true;
+
+#ifdef __linux__
+    // Parallel execution of parent and child on gpu0
+    int pid;
+
+    if ((pid = fork()) < 0) {
+      printf("Child_Concurrency_Gpu0 : fork() returned error %d.", pid);
+      TestPassed &= false;
+
+    } else if (!pid) {   // Child process
+      bool TestPassedChild = true;
+
+      TestPassedChild = validateMemoryOnGPU(0);
+
+      if (TestPassedChild) {
+        exit(0);  // child exit with success status
+      } else {
+        printf("Child_Concurrency_Gpu0 : childpid %d failed\n", getpid());
+        exit(1);  // child exit with failure status
+      }
+
+    } else {  // Parent process
+      int exitStatus = 0;
+      TestPassed = validateMemoryOnGPU(0);
+
+      pid = wait(&exitStatus);
+      // exitStatus is meaningful only when wait() succeeded, and
+      // WEXITSTATUS only when the child exited normally.
+      if ( ( pid < 0 ) || !WIFEXITED(exitStatus) || WEXITSTATUS(exitStatus) )
+        TestPassed &= false;
+    }
+#else
+    printf("Test hipMallocChild_Concurrency_DefaultGpu skipped on non-linux\n");
+#endif
+
+    // TC scenarios specific to linux
+    // are treated as pass in windows.
+    if (TestPassed) {
+      passed();
+    } else {
+      failed("hipMallocChild_Concurrency_DefaultGpu Failed!");
+    }
+
+  } else if (p_tests == 5) {
+    bool TestPassed = true;
+#ifdef __linux__
+    // Parallel execution on multiple gpus from different child processes
+    int devCnt = 1, pid = 0, cumStatus = 0;
+
+    // Get GPU count
+    getDeviceCount(&devCnt);
+
+    // Spawn child for each GPU
+    for (int gpu = 0; gpu < devCnt; gpu++) {
+      if ((pid = fork()) < 0) {
+         printf("Child_Concurrency_MultiGpu : fork() returned error %d\n", pid);
+         failed("Test Failed!");
+
+      } else if (!pid) {  // Child process
+         bool TestPassedChild = true;
+         TestPassedChild = validateMemoryOnGPU(gpu);
+
+         if (TestPassedChild) {
+            exit(0);  // child exit with success status
+         } else {
+            printf("Child_Concurrency_MultiGpu : childpid %d failed\n",
+                getpid());
+            exit(1);  // child exit with failure status
+         }
+      }
+    }
+
+    // Parent shall wait for child to complete
+    for (int i = 0; i < devCnt; i++) {
+      int pidwait = 0, exitStatus = 0;
+      pidwait = wait(&exitStatus);
+
+      if (pidwait < 0) {
+        TestPassed &= false;
+        break;
+      }
+
+      // A child that did not exit normally (e.g. killed by a signal) is
+      // counted as a failure, as is a non-zero exit status.
+      if (!WIFEXITED(exitStatus) || WEXITSTATUS(exitStatus)) {
+        cumStatus |= 1;
+      }
+    }
+
+    // Cumulative status of all child
+    if (cumStatus) {
+       TestPassed &= false;
+    }
+
+#else
+    printf("Test hipMallocChild_Concurrency_MultiGpu skipped on non-linux\n");
+#endif
+
+
+    // TC scenarios specific to linux
+    // are treated as pass in windows.
+    if (TestPassed) {
+      passed();
+    } else {
+      failed("hipMallocChild_Concurrency_MultiGpu Failed!");
+    }
+
+  } else if (p_tests == 6) {  // Multithreaded multiple gpu execution
+    std::vector<std::thread> threadlist;
+    int devCnt = 1;
+
+    // Get GPU count
+    getDeviceCount(&devCnt);
+
+
+    for (int i = 0; i < devCnt; i++) {
+      threadlist.push_back(std::thread(threadFunc, i));
+    }
+
+    for (auto &t : threadlist) {
+      t.join();
+    }
+
+    if (g_thTestPassed) {
+      passed();
+    } else {
+      failed("hipMalloc_MultiThreaded_MultiGpu Failed!");
+    }
+  } else {
+    failed("Didnt receive any valid option. Try options 1 to 6\n");
+  }
+}
+
@@ -0,0 +1,423 @@
+/*
+Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* Test 6 is disabled */
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
+ * TEST_NAMED: %t hipMallocManaged1 --tests 1
+ * TEST_NAMED: %t hipMallocManaged2 --tests 2
+ * TEST_NAMED: %t hipMallocManagedNegativeTests --tests 3
+ * TEST_NAMED: %t hipMallocManagedMultiChunkSingleDevice --tests 4
+ * TEST_NAMED: %t hipMallocManagedMultiChunkMultiDevice --tests 5 EXCLUDE_HIP_PLATFORM nvcc
+ * TEST_NAMED: %t hipMallocManagedOversubscription --tests 6 EXCLUDE_HIP_PLATFORM rocclr nvcc
+ * HIT_END
+ */
+
+#include <atomic>
+#include "test_common.h"
+#define N 1048576  // equals to (1024*1024)
+#define INIT_VAL 123
+
+/*
+ * Kernel: every thread strides over the input; Ad2[i] = Ad1[i] + Ad1[i].
+ */
+template <typename T>
+__global__ void
+vector_sum(T *Ad1, T *Ad2, size_t NUM_ELMTS) {
+    const size_t totalThreads = blockDim.x * gridDim.x;
+    size_t idx = (blockIdx.x * blockDim.x + threadIdx.x);
+    while (idx < NUM_ELMTS) {
+        Ad2[idx] = Ad1[idx] + Ad1[idx];
+        idx += totalThreads;
+    }
+}
+
+// Scenario: one hipMallocManaged() buffer (Hmm) holding NumDevices chunks of
+// NUM_ELMS floats is allocated; chunk i is filled with INIT_VAL + i on host.
+// Each device then runs vector_sum on its own chunk through its own stream,
+// writing into a per-device hipMalloc() buffer that is copied back and checked.
+// Returns true when every element equals twice its chunk's initial value.
+bool MultiChunkMultiDevice(int NumDevices) {
+  std::atomic<int> DataMismatch{0};
+  unsigned int NUM_ELMS = (1024 * 1024);
+  float *Ad[NumDevices], *Hmm = NULL, *Ah = new float[NUM_ELMS];
+  hipStream_t stream[NumDevices];
+  for (int Oloop = 0; Oloop < NumDevices; ++Oloop) {
+    HIPCHECK(hipSetDevice(Oloop));
+    HIPCHECK(hipMalloc(&Ad[Oloop], NUM_ELMS * sizeof(float)));
+    HIPCHECK(hipMemset(Ad[Oloop], 0, NUM_ELMS * sizeof(float)));
+    HIPCHECK(hipStreamCreate(&stream[Oloop]));
+  }
+  HIPCHECK(hipMallocManaged(&Hmm, (NumDevices * NUM_ELMS * sizeof(float))));
+  for (int i = 0; i < NumDevices; ++i) {
+    for (unsigned int j = 0; j < NUM_ELMS; ++j) {
+      Hmm[static_cast<size_t>(i) * NUM_ELMS + j] = INIT_VAL + i;
+    }
+  }
+  const unsigned threadsPerBlock = 256;
+  const unsigned blocks = (NUM_ELMS + 255)/256;
+  for (int Klaunch = 0; Klaunch < NumDevices; ++Klaunch) {
+    HIPCHECK(hipSetDevice(Klaunch));  // stream[Klaunch] was created there
+    vector_sum<float> <<<blocks, threadsPerBlock, 0, stream[Klaunch]>>>
+                      (&Hmm[Klaunch * NUM_ELMS], Ad[Klaunch], NUM_ELMS);
+  }
+  // hipDeviceSynchronize() only covers the current device, so wait per stream.
+  for (int s = 0; s < NumDevices; ++s) {
+    HIPCHECK(hipStreamSynchronize(stream[s]));
+  }
+  for (int m = 0; m < NumDevices; ++m) {
+    HIPCHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float),
+                       hipMemcpyDeviceToHost));
+    for (unsigned int n = 0; n < NUM_ELMS; ++n) {
+      if (Ah[n] != ((INIT_VAL + m) * 2)) {
+        DataMismatch++;
+      }
+    }
+  }
+  if (DataMismatch.load() != 0) {
+    printf("MultiChunkMultiDevice: Mismatch observed!\n");
+  }
+  for (int i = 0; i < NumDevices; ++i) {
+    HIPCHECK(hipFree(Ad[i]));
+    HIPCHECK(hipStreamDestroy(stream[i]));
+  }
+  HIPCHECK(hipFree(Hmm));
+  delete[] Ah;  // allocated with new[], so free() would be undefined behavior
+  return DataMismatch.load() == 0;
+}
+
+// Scenario: one hipMallocManaged() buffer (Hmm) holding four chunks of
+// NUM_ELMS floats is allocated; chunk i is filled with INIT_VAL + i on host.
+// vector_sum is launched once per chunk, each launch on its own stream of the
+// current device, writing into a separate hipMalloc() buffer that is copied
+// back and checked. NumDevices is accepted for signature parity but unused.
+// Returns true when every element equals twice its chunk's initial value.
+bool MultiChunkSingleDevice(int NumDevices) {
+  std::atomic<int> DataMismatch{0};
+  const int Chunks = 4;  // compile-time constant keeps the arrays standard C++
+  unsigned int NUM_ELMS = (1024 * 1024);
+  float *Ad[Chunks], *Hmm = NULL, *Ah = new float[NUM_ELMS];
+  hipStream_t stream[Chunks];
+  for (int i = 0; i < Chunks; ++i) {
+    HIPCHECK(hipMalloc(&Ad[i], NUM_ELMS * sizeof(float)));
+    HIPCHECK(hipMemset(Ad[i], 0, NUM_ELMS * sizeof(float)));
+    HIPCHECK(hipStreamCreate(&stream[i]));
+  }
+  HIPCHECK(hipMallocManaged(&Hmm, (Chunks * NUM_ELMS * sizeof(float))));
+  for (int i = 0; i < Chunks; ++i) {
+    for (unsigned int j = 0; j < NUM_ELMS; ++j) {
+      Hmm[static_cast<size_t>(i) * NUM_ELMS + j] = (INIT_VAL + i);
+    }
+  }
+  const unsigned threadsPerBlock = 256;
+  const unsigned blocks = (NUM_ELMS + 255)/256;
+  // One launch per chunk, each on a different stream.
+  for (int k = 0; k < Chunks; ++k) {
+    vector_sum<float> <<<blocks, threadsPerBlock, 0, stream[k]>>>
+                      (&Hmm[k * NUM_ELMS], Ad[k], NUM_ELMS);
+  }
+  // No device switch happens in this function, so one device-wide sync suffices.
+  HIPCHECK(hipDeviceSynchronize());
+  for (int m = 0; m < Chunks; ++m) {
+    HIPCHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float),
+                       hipMemcpyDeviceToHost));
+    for (unsigned int n = 0; n < NUM_ELMS; ++n) {
+      if (Ah[n] != ((INIT_VAL + m) * 2)) {
+        DataMismatch++;
+      }
+    }
+  }
+  if (DataMismatch.load() != 0) {
+    printf("MultiChunkSingleDevice: Mismatch observed!\n");
+  }
+  for (int i = 0; i < Chunks; ++i) {
+    HIPCHECK(hipFree(Ad[i]));
+    HIPCHECK(hipStreamDestroy(stream[i]));
+  }
+  HIPCHECK(hipFree(Hmm));
+  delete[] Ah;  // allocated with new[], so free() would be undefined behavior
+  return DataMismatch.load() == 0;
+}
+
+// Disabled oversubscription check: asks hipMallocManaged() for free + 1 bytes
+// and returns true only when the call reports hipErrorOutOfMemory.
+bool TestOversubscriptionMallocManaged(int NumDevices) {
+  bool IfTestPassed = true;
+  void *A = NULL;
+  size_t totalMem = 0, freeMem = 0;  // renamed so ::free is not shadowed
+  HIPCHECK(hipMemGetInfo(&freeMem, &totalMem));
+  // ToDo: In case of HMM, memory over-subscription is allowed.  Hence, relook
+  // into how out of memory can be tested.
+  hipError_t err = hipMallocManaged(&A, (freeMem + 1), hipMemAttachGlobal);
+  if (hipErrorOutOfMemory != err) {
+    printf("hipMallocManaged: Returned %s for size value > device memory\n",
+           hipGetErrorString(err));
+    IfTestPassed = false;
+  }
+  if (hipSuccess == err) {
+    HIPCHECK(hipFree(A));  // do not leak the unexpectedly granted allocation
+  }
+  return IfTestPassed;
+}
+
+// Negative tests for hipMallocManaged(). Each call below passes an invalid
+// argument combination and is compared against the error code this test
+// expects for it. A mismatch is printed and recorded, and the remaining
+// cases still run.
+// Returns true only when every call reports its expected error code.
+bool NegativeTestsMallocManaged(int NumDevices) {
+  bool IfTestPassed = true;
+  void *A = NULL;
+  size_t totalMem = 0, freeMem = 0;
+  // NOTE(review): the reported sizes are not used below; the query looks like
+  // a leftover - confirm before removing it.
+  HIPCHECK(hipMemGetInfo(&freeMem, &totalMem));
+
+  // got:  error code returned by the hipMallocManaged() call under test.
+  // want: error code this case expects.
+  // what: message tail describing the case, printed on a mismatch.
+  // If a call unexpectedly succeeded, its buffer is released so it neither
+  // leaks nor leaves a stale pointer in A for the next case.
+  auto expect = [&](hipError_t got, hipError_t want, const char *what) {
+    if (got != want) {
+      printf("hipMallocManaged: Returned %s %s\n",
+             hipGetErrorString(got), what);
+      IfTestPassed = false;
+    }
+    if (got == hipSuccess && A != NULL) {
+      HIPCHECK(hipFree(A));
+      A = NULL;
+    }
+  };
+
+  // Null output pointer.
+  expect(hipMallocManaged(NULL, 1024, hipMemAttachGlobal),
+         hipErrorInvalidValue, "when devPtr is null");
+
+  // Zero-byte request.
+  expect(hipMallocManaged(&A, 0, hipMemAttachGlobal),
+         hipErrorInvalidValue, "when size is 0");
+
+  // Null output pointer combined with a zero-byte request.
+  expect(hipMallocManaged(NULL, 0, hipMemAttachGlobal),
+         hipErrorInvalidValue, "when devPtr & size is null & 0");
+
+#ifdef __HIP_PLATFORM_HCC__
+  // The flag hipMemAttachHost is currently not supported therefore
+  // api should return "hipErrorInvalidValue" for now
+  expect(hipMallocManaged(&A, 1024, hipMemAttachHost),
+         hipErrorInvalidValue, "for 'hipMemAttachHost' flag");
+#endif  // __HIP_PLATFORM_HCC__
+
+  // Null output pointer, zero size and zero flags.
+  expect(hipMallocManaged(NULL, 0, 0),
+         hipErrorInvalidValue, "when params are null, 0, 0");
+
+  // Arbitrary numeric flag value.
+  expect(hipMallocManaged(&A, 1024, 145),
+         hipErrorInvalidValue, "when flag param is numerical 145");
+
+  // -10 converted to size_t becomes a huge request; the cast makes that
+  // wrap-around explicit. This case expects hipErrorOutOfMemory.
+  expect(hipMallocManaged(&A, static_cast<size_t>(-10), hipMemAttachGlobal),
+         hipErrorOutOfMemory, "for negative size value.");
+
+  return IfTestPassed;
+}
+
+
+// For each device: allocates two managed buffers of N elements, fills the
+// first with its own indices and zeroes the second on the host, runs
+// vector_sum on the raw managed pointers and verifies the result in place
+// on the host, without any explicit hipMemcpy.
+template <typename T>
+bool TestMallocManaged2(int NumDevices) {
+  const unsigned threadsPerBlock = 256;
+  const unsigned blocks = (N + 255)/256;
+  bool allDevicesOk = true;
+  T *input = NULL, *output = NULL;
+
+  for (int dev = 0; dev < NumDevices; ++dev) {
+    HIPCHECK(hipSetDevice(dev));
+    std::atomic<int> badCount{0};
+    HIPCHECK(hipMallocManaged(&input, N * sizeof(T)));
+    HIPCHECK(hipMallocManaged(&output, N * sizeof(T)));
+    // Host-side initialisation straight through the managed pointers.
+    for (int idx = 0; idx < N; ++idx) {
+      input[idx] = idx;
+      output[idx] = 0;
+    }
+    vector_sum <<<blocks, threadsPerBlock>>> (input, output, N);
+    HIPCHECK(hipDeviceSynchronize());
+    // Every output element must be twice its index.
+    for (int idx = 0; idx < N; ++idx) {
+      if (output[idx] != (idx + idx)) {
+        badCount++;
+      }
+    }
+    allDevicesOk = allDevicesOk && (badCount.load() == 0);
+    HIPCHECK(hipFree(input));
+    HIPCHECK(hipFree(output));
+  }
+  return allDevicesOk;
+}
+
+// A managed buffer is allocated with hipMallocManaged() while device Oloop is
+// current, and is then exercised with every device Iloop made current in turn:
+// host-to-managed copy with host-side verification, a device-to-device copy
+// into hipMalloc() memory, and a vector_sum launch that reads the managed
+// buffer. Returns true only if no mismatch is seen for any device pair.
+template <typename T>
+bool TestMallocManaged1(int NumDevices) {
+  std::atomic<unsigned int> DataMismatch{0};
+  bool TestPassed = true;
+  T *Ah1 = new T[N], *Ah2 = new T[N], *Ad = NULL, *Hmm = NULL;
+
+  for (int i = 0; i < N; ++i) {
+    Ah1[i] = INIT_VAL;
+    Ah2[i] = 0;
+  }
+  for (int Oloop = 0; Oloop < NumDevices; ++Oloop) {
+    DataMismatch = 0;
+    HIPCHECK(hipSetDevice(Oloop));
+    HIPCHECK(hipMallocManaged(&Hmm, N * sizeof(T)));
+    for (int Iloop = 0; Iloop < NumDevices; ++Iloop) {
+      HIPCHECK(hipSetDevice(Iloop));
+      HIPCHECK(hipMalloc(&Ad, N * sizeof(T)));
+      // Copy data from host to hipMallocManaged memory and verify
+      HIPCHECK(hipMemcpy(Hmm, Ah1, N * sizeof(T), hipMemcpyHostToDevice));
+      for (int v = 0; v < N; ++v) {
+        if (Hmm[v] != INIT_VAL) {
+          DataMismatch++;
+        }
+      }
+      if (DataMismatch.load() != 0) {
+        printf("Mismatch is observed with host data at device %d", Iloop);
+        printf(" while hipMallocManaged memory set to the device %d\n", Oloop);
+        TestPassed = false;
+        DataMismatch = 0;
+      }
+      // Executing D2D transfer with hipMallocManaged memory and verify
+      HIPCHECK(hipMemcpy(Ad, Hmm, N * sizeof(T), hipMemcpyDeviceToDevice));
+      HIPCHECK(hipMemcpy(Ah2, Ad, N * sizeof(T), hipMemcpyDeviceToHost));
+      for (int k = 0; k < N; ++k) {
+        if (Ah2[k] != INIT_VAL) {
+          DataMismatch++;
+        }
+      }
+      if (DataMismatch.load() != 0) {
+        printf("Mismatch is observed with D2D transfer at device %d", Iloop);
+        printf(" while hipMallocManaged memory set to the device %d\n", Oloop);
+        TestPassed = false;
+        DataMismatch = 0;
+      }
+      HIPCHECK(hipMemset(Ad, 0, N * sizeof(T)));
+      const unsigned threadsPerBlock = 256;
+      const unsigned blocks = (N + 255)/256;
+      // Launching the kernel to check if there is any access issue with
+      // hipMallocManaged memory and local device's memory
+      vector_sum <<<blocks, threadsPerBlock>>> (Hmm, Ad, N);
+      HIPCHECK(hipDeviceSynchronize());  // do not ignore a failed sync
+      HIPCHECK(hipMemcpy(Ah2, Ad, N * sizeof(T), hipMemcpyDeviceToHost));
+      for (int m = 0; m < N; ++m) {
+        if (Ah2[m] != (INIT_VAL * 2)) {  // vector_sum doubles each element
+          DataMismatch++;
+        }
+      }
+      if (DataMismatch.load() != 0) {
+        printf("Data Mismatch observed after kernel lch device %d\n", Iloop);
+        TestPassed = false;
+        DataMismatch = 0;
+      }
+      HIPCHECK(hipFree(Ad));
+    }
+    HIPCHECK(hipFree(Hmm));
+  }
+  delete[] Ah1;  // both host buffers come from new[], so free() would be UB
+  delete[] Ah2;
+  return TestPassed;
+}
+
+int main(int argc, char* argv[]) {
+  HipTest::parseStandardArguments(argc, argv, true);
+
+  // Only scenarios 1..5 are accepted here.
+  if (!((p_tests >= 1) && (p_tests <= 5))) {
+    failed("Valid arguments are from 1 to 5");
+  }
+
+  int NumDevices = 0;
+  HIPCHECK(hipGetDeviceCount(&NumDevices));
+
+  switch (p_tests) {
+    case 1: {
+      // Run the same scenario for several element types; report each failing
+      // type before failing the test as a whole.
+      bool everyTypeOk = true;
+      if (!TestMallocManaged1<float>(NumDevices)) {
+        printf("Test Failed with float datatype.\n");
+        everyTypeOk = false;
+      }
+      if (!TestMallocManaged1<int>(NumDevices)) {
+        printf("Test Failed with int datatype.\n");
+        everyTypeOk = false;
+      }
+      if (!TestMallocManaged1<unsigned char>(NumDevices)) {
+        printf("Test Failed with unsigned char datatype.\n");
+        everyTypeOk = false;
+      }
+      if (!TestMallocManaged1<double>(NumDevices)) {
+        printf("Test Failed with double datatype.\n");
+        everyTypeOk = false;
+      }
+      if (!everyTypeOk) {
+        failed("");
+      }
+      break;
+    }
+    case 2:
+      if (!TestMallocManaged2<float>(NumDevices)) {
+        failed("Test Failed with float datatype.");
+      }
+      break;
+    case 3:
+      if (!NegativeTestsMallocManaged(NumDevices)) {
+        failed("Negative Tests with hipMallocManaged() failed!.");
+      }
+      break;
+    case 4:
+      if (!MultiChunkSingleDevice(NumDevices)) {
+        failed("hipMallocManaged: MultiChunkSingleDevice test failed!");
+      }
+      break;
+    case 5:
+      if (!MultiChunkMultiDevice(NumDevices)) {
+        failed("hipMallocManaged: MultiChunkMultiDevice test failed!");
+      }
+      break;
+    case 6:
+      // NOTE(review): presumably unreachable while the range check above
+      // rejects 6 (assuming failed() ends the process) - confirm.
+      if (!TestOversubscriptionMallocManaged(NumDevices)) {
+        failed("hipMallocManaged: TestOversubscriptionMallocManaged failed!");
+      }
+      break;
+  }
+  passed();
+}
@@ -75,6 +75,9 @@ int main() {
        HIPCHECK(hipFree(Z_d));
      } else {
        std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
+        if (hip_skip_tests_enabled()) {
+          return hip_skip_retcode();
+        }
      }
    }

@@ -81,6 +81,9 @@ int main() {
        HIPCHECK(hipFree(Z_d));
      } else {
        std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
+        if (hip_skip_tests_enabled()) {
+          return hip_skip_retcode();
+        }
      }
    }

@@ -77,6 +77,9 @@ int main() {
        HIPCHECK(hipFree(Z_d));
      } else {
        std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
+        if (hip_skip_tests_enabled()) {
+          return hip_skip_retcode();
+        }
      }
    }
    passed();
@@ -83,6 +83,9 @@ int main() {
        HIPCHECK(hipFree(Z_d));
      } else {
        std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
+        if (hip_skip_tests_enabled()) {
+          return hip_skip_retcode();
+        }
      }
    }

@@ -24,7 +24,7 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
+ * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
 * TEST: %t
 * HIT_END
 */
@@ -24,7 +24,7 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
 * TEST: %t
 * HIT_END
 */
@@ -20,7 +20,7 @@
 // Test for hipMemset2D functionality for different width and height values

 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
 * TEST_NAMED: %t hipMemset2D-basic
 * TEST_NAMED: %t hipMemset2D-dim1 --width2D 10 --height2D 10 --memsetWidth 4 --memsetHeight 4
 * TEST_NAMED: %t hipMemset2D-dim2 --width2D 100 --height2D 100 --memsetWidth 20 --memsetHeight 40
@@ -21,7 +21,7 @@
 // and also launch hipMemcpyAsync() api on the same stream. This test case is simulate the scenario
 // reported in SWDEV-181598.
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc
 * TEST: %t
 * HIT_END
 */
@@ -21,7 +21,7 @@
 // and also launch hipMemcpyAsync() api. This test case is simulate the scenario
 // reported in SWDEV-181598.
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc
 * TEST: %t
 * HIT_END
 */
@@ -0,0 +1,27 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip/hip_runtime_api.h>
+#include "test_common.h"
+
+int main() {  // Calls hipFuncSetSharedMemConfig twice via HIP_PRINT_STATUS.
+    hipSharedMemConfig_t config;  // left uninitialized; only its address is used
+    HIP_PRINT_STATUS(hipFuncSetSharedMemConfig(NULL));  // null argument case
+    HIP_PRINT_STATUS(hipFuncSetSharedMemConfig(&config));  // non-null case
+}
--- a/Показать больше
+++ b/Показать больше