From 374ead1d1934636fb0b0aecc322b5bf8c3b86c47 Mon Sep 17 00:00:00 2001
From: Vladislav Sytchenko <Vladislav.Sytchenko@amd.com>
Date: Mon, 5 Oct 2020 12:26:18 -0400
Subject: [PATCH] Revert "Merge branch 'amd-master-next' into amd-npi-next"

This reverts commit 73558e33632379e2e6eb157560aaff60e0405f31.

Reason for revert: <INSERT REASONING HERE>

Change-Id: I53322718dadde2c98f96140b8e260ec7ee9ef721
---
 CMakeLists.txt                                |  52 +-
 CONTRIBUTING.md                               | 104 +--
 bin/hip_embed_pch.sh                          |   9 +-
 bin/hip_gen_pch.sh                            |  36 +
 bin/hipcc                                     |   3 +-
 docs/markdown/hip_faq.md                      |   2 +-
 hip-config.cmake.in                           |  28 +-
 include/hip/hcc_detail/device_functions.h     | 138 +---
 include/hip/hcc_detail/hip_runtime.h          |  16 -
 include/hip/hcc_detail/hip_runtime_api.h      |  13 +-
 include/hip/hip_cooperative_groups.h          |   9 +-
 include/hip/hip_runtime_api.h                 |   1 -
 .../hip/nvcc_detail/hip_cooperative_groups.h  |  12 -
 include/hip/nvcc_detail/hip_runtime.h         |   4 +-
 include/hip/nvcc_detail/hip_runtime_api.h     |   1 -
 lpl_ca/CMakeLists.txt                         |   2 -
 packaging/hip-base.txt                        |  11 +-
 packaging/hip-doc.txt                         |  15 +-
 packaging/hip-hcc.txt                         |  13 +-
 packaging/hip-nvcc.txt                        |  13 +-
 packaging/hip-rocclr.txt                      |  13 +-
 packaging/hip-samples.txt                     |  13 +-
 rocclr/CMakeLists.txt                         |  30 +-
 rocclr/hip_code_object.cpp                    |  25 +
 rocclr/hip_code_object.hpp                    |   2 +
 rocclr/hip_device.cpp                         |   2 +-
 rocclr/hip_fatbin.cpp                         |   7 +-
 rocclr/hip_global.cpp                         |   4 +-
 rocclr/hip_global.hpp                         |   5 +
 rocclr/hip_internal.hpp                       |   2 +
 rocclr/hip_memory.cpp                         |  67 +-
 rocclr/hip_module.cpp                         |   2 +-
 rocclr/hip_peer.cpp                           |   4 -
 rocclr/hip_platform.cpp                       |  41 +-
 rocclr/hip_platform.hpp                       |   5 +
 samples/0_Intro/bit_extract/CMakeLists.txt    |  20 -
 samples/0_Intro/bit_extract/Makefile          |  10 +-
 samples/0_Intro/module_api/CMakeLists.txt     |  36 -
 .../0_Intro/module_api_global/CMakeLists.txt  |  30 -
 samples/0_Intro/square/CMakeLists.txt         |  21 -
 samples/0_Intro/square/Makefile               |   7 +-
 samples/0_Intro/square/README.md              |  42 +-
 .../1_Utils/hipBusBandwidth/CMakeLists.txt    |  20 -
 .../hipBusBandwidth/hipBusBandwidth.cpp       | 446 ++++++-----
 samples/1_Utils/hipCommander/CMakeLists.txt   |  31 -
 .../1_Utils/hipDispatchLatency/CMakeLists.txt |  35 -
 samples/1_Utils/hipInfo/CMakeLists.txt        |  20 -
 .../0_MatrixTranspose/CMakeLists.txt          |  20 -
 .../2_Cookbook/10_inline_asm/CMakeLists.txt   |  20 -
 .../11_texture_driver/CMakeLists.txt          |  30 -
 .../2_Cookbook/13_occupancy/CMakeLists.txt    |  20 -
 samples/2_Cookbook/1_hipEvent/CMakeLists.txt  |  20 -
 .../2_Cookbook/3_shared_memory/CMakeLists.txt |  20 -
 samples/2_Cookbook/4_shfl/CMakeLists.txt      |  20 -
 samples/2_Cookbook/5_2dshfl/CMakeLists.txt    |  19 -
 .../6_dynamic_shared/CMakeLists.txt           |  19 -
 samples/2_Cookbook/7_streams/CMakeLists.txt   |  19 -
 samples/2_Cookbook/8_peer2peer/CMakeLists.txt |  19 -
 samples/2_Cookbook/9_unroll/CMakeLists.txt    |  19 -
 samples/README.md                             |  27 -
 tests/hit/HIT.cmake                           |   1 -
 .../performance/compute/hipPerfMandelbrot.cpp | 747 ------------------
 .../stream/hipPerfDeviceConcurrency.cpp       | 289 -------
 .../hipCGGridGroupType.cpp                    |   8 +-
 .../hipCGGridGroupTypeViaBaseType.cpp         |   8 +-
 .../hipCGGridGroupTypeViaPublicApi.cpp        |   8 +-
 .../hipCGMultiGridGroupType.cpp               |  22 +-
 .../hipCGMultiGridGroupTypeViaBaseType.cpp    |  39 +-
 .../hipCGMultiGridGroupTypeViaPublicApi.cpp   |  39 +-
 .../hipCGThreadBlockType.cpp                  |  12 +-
 .../hipCGThreadBlockTypeViaBaseType.cpp       |  12 +-
 .../hipCGThreadBlockTypeViaPublicApi.cpp      |  12 +-
 tests/src/kernel/hipShflTests.cpp             |  17 +-
 tests/src/kernel/hipShflUpDownTest.cpp        |  64 +-
 tests/src/p2p/hipPeerToPeer_simple.cpp        |   3 -
 .../cooperativeGrps/api_failure_tests.cpp     | 280 -------
 .../cooperativeGrps/cooperative_streams.cpp   | 283 -------
 .../grid_group_data_sharing.cpp               | 303 -------
 .../multi_gpu_api_failure_tests.cpp           | 568 -------------
 .../cooperativeGrps/multi_gpu_streams.cpp     | 581 --------------
 .../multi_grid_group_all_gpus.cpp             | 374 ---------
 .../simple_grid_group_barrier.cpp             | 233 ------
 .../simple_multi_grid_group_barrier.cpp       | 374 ---------
 .../device/hipDeviceGetPCIBusId.cpp           | 346 ++++----
 .../src/runtimeApi/device/hipSetGetDevice.cpp |   2 +-
 .../runtimeApi/memory/hipIpcMemAccessTest.cpp | 227 ------
 .../memory/hipMallocConcurrency.cpp           | 487 ------------
 .../memory/hipMallocManaged_MultiScenario.cpp | 423 ----------
 tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp |   3 -
 .../runtimeApi/memory/hipMemcpyDtoDAsync.cpp  |   3 -
 tests/src/runtimeApi/memory/hipMemcpyPeer.cpp |   3 -
 .../runtimeApi/memory/hipMemcpyPeerAsync.cpp  |   3 -
 .../runtimeApi/memory/hipMemcpyWithStream.cpp |   2 +-
 .../memory/hipMemcpyWithStreamMultiThread.cpp |   2 +-
 tests/src/runtimeApi/memory/hipMemset2D.cpp   |   2 +-
 .../hipMultiMemcpyMultiThrdMultiStrm.cpp      |   2 +-
 .../memory/hipMultiMemcpyMultiThread.cpp      |   2 +-
 .../module/hipFuncSetSharedMemConfig.cpp      |  27 -
 .../hipLaunchCoopMultiKernel.cpp              |   2 +-
 .../hipLaunchCooperativeKernel.cpp            |   7 +-
 .../hipModuleLoadDataMultThreadOnMultGPU.cpp  |   2 +-
 .../module/hipModuleLoadDataMultThreaded.cpp  |   2 +-
 .../stream/hipStreamACb_AltEnqueue.cpp        |   2 +-
 .../stream/hipStreamACb_MStrm_Mgpu.cpp        |   2 +-
 .../stream/hipStreamACb_MultiCalls.cpp        |   2 +-
 .../stream/hipStreamACb_StrmSyncTiming.cpp    |   2 +-
 .../stream/hipStreamACb_ThrdBehaviour.cpp     |   2 +-
 .../runtimeApi/stream/hipStreamACb_order.cpp  |   2 +-
 .../stream/hipStreamGetPriority.cpp           |   2 +-
 tests/src/test_common.h                       |  14 +-
 tests/unit/test_common.h                      |   1 +
 111 files changed, 750 insertions(+), 6797 deletions(-)
 create mode 100755 bin/hip_gen_pch.sh
 delete mode 100644 include/hip/nvcc_detail/hip_cooperative_groups.h
 delete mode 100644 samples/0_Intro/bit_extract/CMakeLists.txt
 delete mode 100644 samples/0_Intro/module_api/CMakeLists.txt
 delete mode 100644 samples/0_Intro/module_api_global/CMakeLists.txt
 delete mode 100644 samples/0_Intro/square/CMakeLists.txt
 delete mode 100644 samples/1_Utils/hipBusBandwidth/CMakeLists.txt
 delete mode 100644 samples/1_Utils/hipCommander/CMakeLists.txt
 delete mode 100644 samples/1_Utils/hipDispatchLatency/CMakeLists.txt
 delete mode 100644 samples/1_Utils/hipInfo/CMakeLists.txt
 delete mode 100644 samples/2_Cookbook/0_MatrixTranspose/CMakeLists.txt
 delete mode 100644 samples/2_Cookbook/10_inline_asm/CMakeLists.txt
 delete mode 100644 samples/2_Cookbook/11_texture_driver/CMakeLists.txt
 delete mode 100644 samples/2_Cookbook/13_occupancy/CMakeLists.txt
 delete mode 100644 samples/2_Cookbook/1_hipEvent/CMakeLists.txt
 delete mode 100644 samples/2_Cookbook/3_shared_memory/CMakeLists.txt
 delete mode 100644 samples/2_Cookbook/4_shfl/CMakeLists.txt
 delete mode 100644 samples/2_Cookbook/5_2dshfl/CMakeLists.txt
 delete mode 100644 samples/2_Cookbook/6_dynamic_shared/CMakeLists.txt
 delete mode 100644 samples/2_Cookbook/7_streams/CMakeLists.txt
 delete mode 100644 samples/2_Cookbook/8_peer2peer/CMakeLists.txt
 delete mode 100644 samples/2_Cookbook/9_unroll/CMakeLists.txt
 delete mode 100644 samples/README.md
 mode change 100755 => 100644 tests/hit/HIT.cmake
 delete mode 100644 tests/performance/compute/hipPerfMandelbrot.cpp
 delete mode 100644 tests/performance/stream/hipPerfDeviceConcurrency.cpp
 rename tests/src/{runtimeApi/cooperativeGrps => cg}/hipCGGridGroupType.cpp (97%)
 mode change 100755 => 100644
 rename tests/src/{runtimeApi/cooperativeGrps => cg}/hipCGGridGroupTypeViaBaseType.cpp (97%)
 mode change 100755 => 100644
 rename tests/src/{runtimeApi/cooperativeGrps => cg}/hipCGGridGroupTypeViaPublicApi.cpp (97%)
 mode change 100755 => 100644
 rename tests/src/{runtimeApi/cooperativeGrps => cg}/hipCGMultiGridGroupType.cpp (92%)
 mode change 100755 => 100644
 rename tests/src/{runtimeApi/cooperativeGrps => cg}/hipCGMultiGridGroupTypeViaBaseType.cpp (83%)
 rename tests/src/{runtimeApi/cooperativeGrps => cg}/hipCGMultiGridGroupTypeViaPublicApi.cpp (83%)
 rename tests/src/{runtimeApi/cooperativeGrps => cg}/hipCGThreadBlockType.cpp (95%)
 mode change 100755 => 100644
 rename tests/src/{runtimeApi/cooperativeGrps => cg}/hipCGThreadBlockTypeViaBaseType.cpp (94%)
 mode change 100755 => 100644
 rename tests/src/{runtimeApi/cooperativeGrps => cg}/hipCGThreadBlockTypeViaPublicApi.cpp (94%)
 mode change 100755 => 100644
 mode change 100755 => 100644 tests/src/p2p/hipPeerToPeer_simple.cpp
 delete mode 100644 tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp
 delete mode 100644 tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp
 delete mode 100644 tests/src/runtimeApi/cooperativeGrps/grid_group_data_sharing.cpp
 delete mode 100644 tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp
 delete mode 100644 tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp
 delete mode 100644 tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp
 delete mode 100644 tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp
 delete mode 100644 tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp
 delete mode 100644 tests/src/runtimeApi/memory/hipIpcMemAccessTest.cpp
 delete mode 100644 tests/src/runtimeApi/memory/hipMallocConcurrency.cpp
 delete mode 100644 tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp
 delete mode 100644 tests/src/runtimeApi/module/hipFuncSetSharedMemConfig.cpp
 rename tests/src/runtimeApi/{cooperativeGrps => module}/hipLaunchCoopMultiKernel.cpp (98%)
 rename tests/src/runtimeApi/{cooperativeGrps => module}/hipLaunchCooperativeKernel.cpp (94%)
 mode change 100755 => 100644 tests/src/test_common.h
 mode change 100755 => 100644 tests/unit/test_common.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0c7156c478..c5a49feaa3 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,15 +8,10 @@ set(BUILD_SHARED_LIBS ON  CACHE BOOL "Build shared library (.so) or static lib (
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 
-if(NOT DEFINED __HIP_ENABLE_PCH)
-  set(__HIP_ENABLE_PCH ON CACHE BOOL "enable/disable pre-compiled hip headers")
+if(NOT ${BUILD_SHARED_LIBS} AND NOT DEFINED ENABLE_HIP_PCH)
+  set(ENABLE_HIP_PCH ON CACHE BOOL "enable/disable pre-compiled hip headers")
 endif()
 
-if(${__HIP_ENABLE_PCH})
-  set(_pchStatus 1)
-else()
-  set(_pchStatus 0)
-endif()
 #############################
 # Options
 #############################
@@ -85,8 +80,8 @@ if(GIT_FOUND)
 
   set(HIP_VERSION_PATCH ${HIP_VERSION_GITDATE}-${HIP_VERSION_GITHASH})
 
-  if(DEFINED ENV{ROCM_LIBPATCH_VERSION})
-    set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}.$ENV{ROCM_LIBPATCH_VERSION})
+  if(DEFINED ENV{ROCM_BUILD_ID})
+    set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}-$ENV{ROCM_BUILD_ID}-${HIP_VERSION_GITHASH})
   else()
     set(HIP_PACKAGING_VERSION_PATCH ${HIP_VERSION_GITDATE}.${HIP_VERSION_GITCOUNT}-${HIP_VERSION_GITHASH})
   endif()
@@ -95,36 +90,6 @@ else()
   set(HIP_PACKAGING_VERSION_PATCH "0")
 endif()
 
-## Debian package specific variables
-if ( DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE} )
-  set ( CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE} )
-else()
-  set ( CPACK_DEBIAN_PACKAGE_RELEASE "local" )
-endif()
-message ( "Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}" )
-
-## RPM package specific variables
-if ( DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE} )
-  set ( CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE} )
-else()
-  set ( CPACK_RPM_PACKAGE_RELEASE "local" )
-endif()
-
-## 'dist' breaks manual builds on debian systems due to empty Provides
-execute_process( COMMAND rpm --eval %{?dist}
-                 RESULT_VARIABLE PROC_RESULT
-                 OUTPUT_VARIABLE EVAL_RESULT
-                 OUTPUT_STRIP_TRAILING_WHITESPACE )
-
-if ( PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "" )
-  string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" )
-endif()
-message("CPACK_RPM_PACKAGE_RELEASE: ${CPACK_RPM_PACKAGE_RELEASE}")
-
-add_to_config(_versionInfo HIP_PACKAGING_VERSION_PATCH)
-add_to_config(_versionInfo CPACK_DEBIAN_PACKAGE_RELEASE)
-add_to_config(_versionInfo CPACK_RPM_PACKAGE_RELEASE)
-
 add_to_config(_versionInfo HIP_VERSION_MAJOR)
 add_to_config(_versionInfo HIP_VERSION_MINOR)
 add_to_config(_versionInfo HIP_VERSION_PATCH)
@@ -137,6 +102,7 @@ else ()
    set (HIP_LIB_VERSION_PATCH ${HIP_VERSION_PATCH})
 endif ()
 set (HIP_LIB_VERSION_STRING "${HIP_LIB_VERSION_MAJOR}.${HIP_LIB_VERSION_MINOR}.${HIP_LIB_VERSION_PATCH}")
+
 if (DEFINED ENV{ROCM_RPATH})
     set (CMAKE_INSTALL_RPATH "$ENV{ROCM_RPATH}")
     set (CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
@@ -490,7 +456,6 @@ set(_versionInfoHeader
 #define HIP_VERSION_MINOR ${HIP_VERSION_MINOR}
 #define HIP_VERSION_PATCH ${HIP_VERSION_GITDATE}
 #define HIP_VERSION       (HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR)\n
-#define __HIP_HAS_GET_PCH ${_pchStatus}\n
 #endif\n
 ")
 file(WRITE "${PROJECT_BINARY_DIR}/include/hip/hip_version.h" ${_versionInfoHeader})
@@ -704,11 +669,8 @@ endif()
 # Testing steps
 #############################
 # Target: test
-set(HIP_ROOT_DIR ${CMAKE_CURRENT_BINARY_DIR})
+set(HIP_ROOT_DIR ${CMAKE_INSTALL_PREFIX})
 set(HIP_SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR})
-if(HIP_PLATFORM STREQUAL "nvcc")
-	execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/include" "${CMAKE_CURRENT_BINARY_DIR}/include" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
-endif()
 execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/cmake" "${HIP_ROOT_DIR}/cmake" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
 if(${RUN_HIT} EQUAL 0)
     execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/bin" "${HIP_ROOT_DIR}/bin" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
@@ -751,7 +713,7 @@ endif()
 #############################
 # Target: clang
 if(HIP_HIPCC_EXECUTABLE)
-    add_custom_target(analyze
+    add_custom_target(analyze 
         COMMAND ${HIP_HIPCC_EXECUTABLE} -fvisibility=hidden -fvisibility-inlines-hidden --analyze --analyzer-outputtext  -isystem /opt/rocm/include ${HIP_HCC_BUILD_FLAGS} -Wno-unused-command-line-argument -I/opt/rocm/include -c  src/*.cpp -Iinclude/ -I./
     WORKING_DIRECTORY ${HIP_SRC_PATH})
     if(CPPCHECK_EXE)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 750e6759c2..d9d353681d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,15 +1,15 @@
-# Contributor Guidelines
+# Contributor Guidelines 
 
 ## Make Tips
-When building HIP, you will likely want to build and install to a local user-accessible directory (rather than /opt/rocm).
-This can be easily be done by setting the -DCMAKE_INSTALL_PREFIX variable when running cmake.  Typical use case is to
+When building HIP, you will likely want to build and install to a local user-accessible directory (rather than /opt/rocm).  
+This can easily be done by setting the -DCMAKE_INSTALL_PREFIX variable when running cmake.  Typical use case is to 
 set CMAKE_INSTALL_PREFIX to your HIP git root, and then ensure HIP_PATH points to this directory.   For example
 
 ```
 cmake .. -DCMAKE_INSTALL_PREFIX=..
 make install
 
-export HIP_PATH=
+export HIP_PATH= 
 ```
 
 After making HIP, don't forget the "make install" step !
@@ -21,110 +21,118 @@ After making HIP, don't forget the "make install" step !
     - Add a translation to the hipify-clang tool ; many examples abound.
        - For stat tracking purposes, place the API into an appropriate stat category ("dev", "mem", "stream", etc).
     - Add a inlined NVCC implementation for the function in include/hip/nvcc_detail/hip_runtime_api.h.
-       - These are typically headers
-    - Add an HIP_ROCclr definition and Doxygen comments for the function in include/hcc_detail/hip_runtime_api.h
-       - Source implementation typically go in hip/rocclr/hip_*.cpp. The implementation involve calls to HIP runtime (ie for hipStream_t).
+       - These are typically headers 
+    - Add an HCC definition and Doxygen comments for the function in include/hcc_detail/hip_runtime_api.h
+       - Source implementations typically go in src/hcc_detail/hip_hcc.cpp. The implementation may involve 
+         calls to HCC runtime or HSA runtime, or interact with other pieces of the HIP runtime (ie for 
+         hipStream_t).
 
-## Check HIP-Clang version
-In some cases new HIP-Clang features are tied to specified releases, and it can be useful to check the current version is sufficiently new enough to support the desired feature.
-
-HIP runtime version
+#### Testing HCC version
+In some cases new HIP features are tied to specified releases of HCC, and it can be useful to determine at compile-time
+if the current HCC compiler is sufficiently new enough to support the desired feature.  The `__hcc_workweek__` compiler
+define is a monotonically increasing integer value that combines the year + workweek + day-of-week (0-6, Sunday is 0) 
+(ie 15403, 16014, etc).   
+The granularity is one day, so `__hcc_workweek__` can only be used to distinguish compiler builds that are at least one day apart.
 
 ```
-> cat /opt/rocm/hip/bin/.hipVersion
-# Auto-generated by cmake
-HIP_VERSION_MAJOR=3
-HIP_VERSION_MINOR=9
-HIP_VERSION_PATCH=20345-519ef3f2
+#if __hcc_workweek__ > 16014
+// use cool new HCC feature here
+#endif
 ```
 
-HIP-Clang compiler version
-
+Additionally, the hcc binary can print the work-week to stdout ("16014" in the version info below):
 ```
-$ /opt/rocm/llvm/bin/clang -v
-clang version 11.0.0 (/src/external/llvm-project/clang 075fedd3fd2f4d9d8cca79d0cd51f64c5ef21432)
+> /opt/rocm/hcc/bin/hcc -v
+HCC clang version 3.5.0  (based on HCC 0.8.16014-81f8a3f-f155163-5a1009a LLVM 3.5.0svn)
 Target: x86_64-unknown-linux-gnu
 Thread model: posix
-InstalledDir: /opt/rocm/llvm/bin
-Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7
-Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7.5.0
-Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/8
-Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/9
-Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/9
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8.4
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.9
+Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.9.1
+Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/4.8
 Candidate multilib: .;@m64
 Candidate multilib: 32;@m32
 Candidate multilib: x32;@mx32
 Selected multilib: .;@m64
 ```
 
+The unix `date` command can print the HCC-format work-week for a specific date, e.g.:
+```
+> date --utc +%y%U%w -d 2015-11-09
+15451
+```
+
 ## Unit Testing Environment
 
-HIP includes unit tests in the tests/src directory.
+HIP includes unit tests in the tests/src directory.  
 When adding a new HIP feature, add a new unit test as well.
 See [tests/README.md](README.md) for more information.
 
 ## Development Flow
-
-Directed tests provide a great place to develop new features alongside the associated test.
+It is recommended that developers set the flag HIP_BUILD_LOCAL=1 so that the unit testing environment automatically rebuilds libhip_hcc.a and the tests when a change is made to the HIP source. 
+Directed tests provide a great place to develop new features alongside the associated test.  
 
 For applications and benchmarks outside the directed test environment, developments should use a two-step development flow:
-- #1. Compile, link, and install HIP/ROCclr.  See [Installation](README.md#Installation) notes.
-- #2. Relink the target application to include changes in HIP runtime file.
+- #1. Compile, link, and install HCC.  See [Installation](README.md#Installation) notes.
+- #2. Relink the target application to include changes in the libhip_hcc.a file.
 
 ## Environment Variables
-- **HIP_PATH** : Location of HIP include, src, bin, lib directories.
-- **HCC_ROCCLR_HOME** : Path to HIP/ROCclr directory, used on AMD platforms.  Default /opt/rocm/rocclr.
+- **HIP_PATH** : Location of HIP include, src, bin, lib directories.  
+- **HCC_HOME** : Path to HCC compiler.  Default /opt/rocm/hcc.
 - **HSA_PATH** : Path to HSA include, lib.  Default /opt/rocm/hsa.
 - **CUDA_PATH* : On nvcc system, this points to root of CUDA installation.
 
-## Contribution guidelines ##
+### Contribution guidelines ###
 
 Features (ie functions, classes, types) defined in hip*.h should resemble CUDA APIs.
 The HIP interface is designed to be very familiar for CUDA programmers.
 
-Differences or limitations of HIP APIs as compared to CUDA APIs should be clearly documented and described.
+Differences or limitations of HIP APIs as compared to CUDA APIs should be clearly documented and described. 
 
-### Coding Guidelines (in brief)
+## Coding Guidelines (in brief)
 - Code Indentation:
     - Tabs should be expanded to spaces.
     - Use 4 spaces indentation.
 - Capitalization and Naming
-    - Prefer camelCase for HIP interfaces and internal symbols.  Note HCC uses _ for separator.
+    - Prefer camelCase for HIP interfaces and internal symbols.  Note HCC uses _ for separator.  
       This guideline is not yet consistently followed in HIP code - eventual compliance is aspirational.
     - Member variables should begin with a leading "_".  This allows them to be easily distinguished from other variables or functions.
+    
 
 - {} placement
     - For functions, the opening { should be placed on a new line.
     - For if/else blocks, the opening { is placed on same line as the if/else. Use a space to separate {/" from if/else.  Example
 '''
     if (foo) {
-        doFoo()
-    } else {
+        doFoo() 
+    } else { 
         doFooElse();
     }
 '''
     - namespace should be on same line as { and separated by a space.
     - Single-line if statement should still use {/} pair (even though C++ does not require).
 - Miscellaneous
-    - All references in function parameter lists should be const.
+    - All references in function parameter lists should be const.  
     - "ihip" = internal hip structures.  These should not be exposed through the HIP API.
     - Keyword TODO refers to a note that should be addressed in long-term.  Could be style issue, software architecture, or known bugs.
     - FIXME refers to a short-term bug that needs to be addressed.
 
 - HIP_INIT_API() should be placed at the start of each top-level HIP API.  This function will make sure the HIP runtime is initialized,
   and also constructs an appropriate API string for tracing and CodeXL marker tracing.  The arguments to HIP_INIT_API should match
-  those of the parent function.
-- ihipLogStatus should only be called from top-level HIP APIs,and should be called to log and return the error code.  The error code
+  those of the parent function.  
+- ihipLogStatus should only be called from top-level HIP APIs,and should be called to log and return the error code.  The error code 
   is used by the GetLastError and PeekLastError functions - if a HIP API simply returns, then the error will not be logged correctly.
 
 - All HIP environment variables should begin with the keyword HIP_
     Environment variables should be long enough to describe their purpose but short enough so they can be remembered - perhaps 10-20 characters, with 3-4 parts separated by underscores.
     To see the list of current environment variables, along with their values, set HIP_PRINT_ENV and run any hip applications on ROCm platform .
-    HIPCC or other tools may support additional environment variables which should follow the above convention.
+    HIPCC or other tools may support additional environment variables which should follow the above convention.  
 
 
-### Presubmit Testing:
-Before checking in or submitting a pull request, run all directed tests (see tests/README.md) and all Rodinia tests.
+
+#### Presubmit Testing:
+Before checking in or submitting a pull request, run all directed tests (see tests/README.md) and all Rodinia tests.  
 Ensure pass results match starting point:
 
 ```shell
@@ -133,13 +141,13 @@ Ensure pass results match starting point:
 ```
 
 
-### Checkin messages
+#### Checkin messages
 Follow existing best practice for writing a good Git commit message.    Some tips:
     http://chris.beams.io/posts/git-commit/
     https://robots.thoughtbot.com/5-useful-tips-for-a-better-commit-message
 
-In particular :
-   - Use imperative voice, ie "Fix this bug", "Refactor the XYZ routine", "Update the doc".
+In particular : 
+   - Use imperative voice, ie "Fix this bug", "Refactor the XYZ routine", "Update the doc".  
      Not : "Fixing the bug", "Fixed the bug", "Bug fix", etc.
    - Subject should summarize the commit.  Do not end subject with a period.  Use a blank line
      after the subject.
diff --git a/bin/hip_embed_pch.sh b/bin/hip_embed_pch.sh
index 0f2cbabd84..8fe3c20f98 100755
--- a/bin/hip_embed_pch.sh
+++ b/bin/hip_embed_pch.sh
@@ -1,7 +1,8 @@
 #!/bin/bash
 
 #set -x
-LLVM_DIR="$1/../../../"
+
+ROCM_PATH=${ROCM_PATH:-/opt/rocm}
 tmp=/tmp/hip_pch.$$
 mkdir -p $tmp
 
@@ -46,12 +47,12 @@ __hip_pch_size:
   .long __hip_pch_size - __hip_pch
 EOF
 
-$LLVM_DIR/bin/clang -O3 -c -std=c++17 -isystem $LLVM_DIR/lib/clang/11.0.0/include/.. -isystem /opt/rocm/include -nogpulib --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch.cui
+$ROCM_PATH/llvm/bin/clang -O3 -c -std=c++17 -isystem /opt/rocm/llvm/lib/clang/11.0.0/include/.. -isystem /opt/rocm/include -nogpulib --cuda-device-only -x hip $tmp/hip_pch.h -E >$tmp/pch.cui
 
 cat $tmp/hip_macros.h >> $tmp/pch.cui
 
-$LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip.pch -x hip-cpp-output - <$tmp/pch.cui
+$ROCM_PATH/llvm/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip.pch -x hip-cpp-output - <$tmp/pch.cui
 
-$LLVM_DIR/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj
+$ROCM_PATH/llvm/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj
 
 rm -rf $tmp
diff --git a/bin/hip_gen_pch.sh b/bin/hip_gen_pch.sh
new file mode 100755
index 0000000000..b212177119
--- /dev/null
+++ b/bin/hip_gen_pch.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+#set -x
+
+cat >/tmp/hip_macros.h <<EOF
+#define __device__ __attribute__((device))
+#define __host__ __attribute__((host))
+#define __global__ __attribute__((global))
+#define __constant__ __attribute__((constant))
+#define __shared__ __attribute__((shared))
+
+#define launch_bounds_impl0(requiredMaxThreadsPerBlock)                                            \
+    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
+#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)                \
+    __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),                     \
+                   amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
+#define select_impl_(_1, _2, impl_, ...) impl_
+#define __launch_bounds__(...)                                                                     \
+    select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
+
+// Macro to replace extern __shared__ declarations
+// to local variable definitions
+#define HIP_DYNAMIC_SHARED(type, var) \
+    type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();
+EOF
+
+cat >/tmp/hip_pch.h <<EOF
+#include "hip/hip_runtime.h"
+#include "hip/hip_fp16.h"
+EOF
+
+/opt/rocm/llvm/bin/clang -O3 -c -std=c++17 -isystem /opt/rocm/llvm/lib/clang/11.0.0/include/.. -isystem /opt/rocm/include -nogpulib --cuda-device-only -x hip /tmp/hip_pch.h -E >/tmp/pch.cui
+
+cat /tmp/hip_macros.h >> /tmp/pch.cui
+
+/opt/rocm/llvm/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o /tmp/hip.pch -x hip-cpp-output - </tmp/pch.cui
diff --git a/bin/hipcc b/bin/hipcc
index 51cf35fff3..07e7b78d7c 100755
--- a/bin/hipcc
+++ b/bin/hipcc
@@ -803,8 +803,7 @@ if ($needHipHcc) {
     if ($linkType eq 0) {
         substr($HIPLDFLAGS,0,0) = "  $HIP_LIB_PATH/libamdhip64.a " ;
     } else {
-        #Currently in ROCm some of libraries are in lib64 and rest are in lib folder in centos.
-        substr($HIPLDFLAGS,0,0) = "  -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib:$ROCM_PATH/lib64 $HIP_LIB_PATH/libamdhip64.so ";
+        substr($HIPLDFLAGS,0,0) = "  -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib $HIP_LIB_PATH/libamdhip64.so ";
     }
 }
 
diff --git a/docs/markdown/hip_faq.md b/docs/markdown/hip_faq.md
index 90f0bc3d54..8aa3daa239 100644
--- a/docs/markdown/hip_faq.md
+++ b/docs/markdown/hip_faq.md
@@ -247,4 +247,4 @@ The workaround is to explicitly add the keyword of "static" before any functions
 Product of block.x, block.y, and block.z should be less than 1024.
 
 ### Are __shfl_*_sync functions supported on HIP platform?
-__shfl_*_sync is not supported on HIP but for nvcc path CUDA 9.0 and above all shuffle calls get redirected to it's sync version.
\ No newline at end of file
+__shfl_*_sync is not supported on HIP but for nvcc path CUDA 9.0 and above all shuffle calls get redirected to its sync version.
diff --git a/hip-config.cmake.in b/hip-config.cmake.in
index d5af845cd6..ba340134f7 100644
--- a/hip-config.cmake.in
+++ b/hip-config.cmake.in
@@ -54,18 +54,7 @@ set_and_check( hip_BIN_INSTALL_DIR "@PACKAGE_BIN_INSTALL_DIR@" )
 set_and_check(hip_HIPCC_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipcc")
 set_and_check(hip_HIPCONFIG_EXECUTABLE "${hip_BIN_INSTALL_DIR}/hipconfig")
 
-# set a default path for ROCM_PATH
-if(NOT DEFINED ROCM_PATH)
-  set(ROCM_PATH /opt/rocm)
-endif()
-
-#If HIP isnot installed under ROCm, need this to find HSA assuming HSA is under ROCm
-if(DEFINED ENV{ROCM_PATH})
-  set(ROCM_PATH "$ENV{ROCM_PATH}")
-endif()
-
 if(HIP_COMPILER STREQUAL "clang")
-  set(HIP_CLANG_ROOT "${ROCM_PATH}/llvm")
   if(NOT HIP_CXX_COMPILER)
     set(HIP_CXX_COMPILER ${CMAKE_CXX_COMPILER})
   endif()
@@ -73,12 +62,16 @@ if(HIP_COMPILER STREQUAL "clang")
     execute_process(COMMAND ${HIP_CXX_COMPILER} --version
                     OUTPUT_STRIP_TRAILING_WHITESPACE
                     OUTPUT_VARIABLE HIP_CLANG_CXX_COMPILER_VERSION_OUTPUT)
-    if(HIP_CLANG_CXX_COMPILER_VERSION_OUTPUT MATCHES "InstalledDir:[ \t]*([^\n]*)")
-      get_filename_component(HIP_CLANG_ROOT "${CMAKE_MATCH_1}" DIRECTORY)
+    if(HIP_CLANG_CXX_COMPILER_VERSION_OUTPUT MATCHES "InstalledDir:[\t\r\n][\t\r\n]*([^\t\r\n])")
+      set(HIP_CLANG_ROOT ${CMAKE_MATCH_1})
+    else()
+      set(HIP_CLANG_ROOT /opt/rocm/llvm)
     endif()
   elseif (HIP_CXX_COMPILER MATCHES ".*clang\\+\\+")
-    get_filename_component(HIP_CLANG_ROOT "${HIP_CXX_COMPILER}" DIRECTORY)
-    get_filename_component(HIP_CLANG_ROOT "${HIP_CLANG_ROOT}" DIRECTORY)
+    get_filename_component(HIP_CLANG_ROOT "${HIP_CXX_COMPILER}" PATH)
+    get_filename_component(HIP_CLANG_ROOT "${HIP_CLANG_ROOT}" PATH)
+  else()
+    set(HIP_CLANG_ROOT /opt/rocm/llvm)
   endif()
   file(GLOB HIP_CLANG_INCLUDE_SEARCH_PATHS ${HIP_CLANG_ROOT}/lib/clang/*/include)
   find_path(HIP_CLANG_INCLUDE_PATH stddef.h
@@ -96,6 +89,11 @@ find_dependency(amd_comgr)
 
 include( "${CMAKE_CURRENT_LIST_DIR}/hip-targets.cmake" )
 
+# If HIP is not installed under ROCm, need this to find HSA assuming HSA is under ROCm
+if( DEFINED ENV{ROCM_PATH} )
+     set(ROCM_PATH "$ENV{ROCM_PATH}")
+endif()
+
 #Using find_dependecy to locate the dependency for the packagaes
 #This makes the cmake generated file xxxx-targets to supply the linker libraries
 # without worrying other transitive dependencies
diff --git a/include/hip/hcc_detail/device_functions.h b/include/hip/hcc_detail/device_functions.h
index a5fe9425f4..fb06e88abd 100644
--- a/include/hip/hcc_detail/device_functions.h
+++ b/include/hip/hcc_detail/device_functions.h
@@ -365,25 +365,6 @@ long __shfl(long var, int src_lane, int width = warpSize)
 }
 __device__
 inline
-unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
-    #ifndef _MSC_VER
-    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
-    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
-
-    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
-    tmp[0] = __shfl(tmp[0], src_lane, width);
-    tmp[1] = __shfl(tmp[1], src_lane, width);
-
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
-    unsigned long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
-    return tmp1;
-    #else
-    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
-    return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
-    #endif
-}
-__device__
-inline
 long long __shfl(long long var, int src_lane, int width = warpSize)
 {
     static_assert(sizeof(long long) == 2 * sizeof(int), "");
@@ -397,22 +378,8 @@ long long __shfl(long long var, int src_lane, int width = warpSize)
     long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
 }
-__device__
-inline
-unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
-    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
-    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
 
-    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
-    tmp[0] = __shfl(tmp[0], src_lane, width);
-    tmp[1] = __shfl(tmp[1], src_lane, width);
-
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
-    unsigned long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
-    return tmp1;
-}
-
-__device__
+ __device__
 inline
 int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
     int self = __lane_id();
@@ -468,28 +435,6 @@ long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
     return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
     #endif
 }
-
-__device__
-inline
-unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
-{
-    #ifndef _MSC_VER
-    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
-    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
-
-    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
-    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
-    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
-
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
-    unsigned long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
-    return tmp1;
-    #else
-    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
-    return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
-    #endif
-}
-
 __device__
 inline
 long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
@@ -504,20 +449,6 @@ long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize
     return tmp1;
 }
 
-__device__
-inline
-unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
-{
-    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
-    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
-    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
-    tmp[0] = __shfl_up(tmp[0], lane_delta, width);
-    tmp[1] = __shfl_up(tmp[1], lane_delta, width);
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
-    unsigned long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
-    return tmp1;
-}
-
 __device__
 inline
 int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
@@ -576,26 +507,6 @@ long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
 }
 __device__
 inline
-unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
-{
-    #ifndef _MSC_VER
-    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
-    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
-
-    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
-    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
-    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
-
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
-    unsigned long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
-    return tmp1;
-    #else
-    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
-    return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
-    #endif
-}
-__device__
-inline
 long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
 {
     static_assert(sizeof(long long) == 2 * sizeof(int), "");
@@ -607,19 +518,6 @@ long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSi
     long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
 }
-__device__
-inline
-unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
-{
-    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
-    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
-    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
-    tmp[0] = __shfl_down(tmp[0], lane_delta, width);
-    tmp[1] = __shfl_down(tmp[1], lane_delta, width);
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
-    unsigned long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
-    return tmp1;
-}
 
 __device__
 inline
@@ -679,26 +577,6 @@ long __shfl_xor(long var, int lane_mask, int width = warpSize)
 }
 __device__
 inline
-unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
-{
-    #ifndef _MSC_VER
-    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
-    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
-
-    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
-    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
-    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
-
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
-    unsigned long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
-    return tmp1;
-    #else
-    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
-    return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
-    #endif
-}
-__device__
-inline
 long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
 {
     static_assert(sizeof(long long) == 2 * sizeof(int), "");
@@ -710,19 +588,7 @@ long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
     long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
 }
-__device__
-inline
-unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
-{
-    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
-    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
-    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
-    tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
-    tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
-    unsigned long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
-    return tmp1;
-}
+
 #define MASK1 0x00ff00ff
 #define MASK2 0xff00ff00
 
diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h
index 0a173bb466..0e5820a016 100644
--- a/include/hip/hcc_detail/hip_runtime.h
+++ b/include/hip/hcc_detail/hip_runtime.h
@@ -487,22 +487,6 @@ struct __HIP_Coordinates {
 #endif
 
 };
-template <typename F>
-#if !defined(_MSC_VER)
-__attribute__((weak))
-#endif
-constexpr typename __HIP_Coordinates<F>::X __HIP_Coordinates<F>::x;
-template <typename F>
-#if !defined(_MSC_VER)
-__attribute__((weak))
-#endif
-constexpr typename __HIP_Coordinates<F>::Y __HIP_Coordinates<F>::y;
-template <typename F>
-#if !defined(_MSC_VER)
-__attribute__((weak))
-#endif
-constexpr typename __HIP_Coordinates<F>::Z __HIP_Coordinates<F>::z;
-
 extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_size(uint);
 inline
 __device__
diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h
index 74c0fb5f69..37fcccf192 100755
--- a/include/hip/hcc_detail/hip_runtime_api.h
+++ b/include/hip/hcc_detail/hip_runtime_api.h
@@ -345,16 +345,13 @@ typedef struct hipLaunchParams_t {
     hipStream_t stream;     ///< Stream identifier
 } hipLaunchParams;
 
-#if __HIP_HAS_GET_PCH
-/**
- * Internal use only. This API may change in the future
- * Pre-Compiled header for online compilation
- *
- */
-    void __hipGetPCH(const char** pch, unsigned int*size);
+// Pre-Compiled header for online compilation
+#ifdef ENABLE_HIP_PCH
+extern const char* __hip_pch;
+extern unsigned __hip_pch_size;
+void __hipGetPCH(const char** pch, unsigned int*size);
 #endif
 
-
 // Doxygen end group GlobalDefs
 /**  @} */
 
diff --git a/include/hip/hip_cooperative_groups.h b/include/hip/hip_cooperative_groups.h
index 41f36378bb..d919e83c7f 100644
--- a/include/hip/hip_cooperative_groups.h
+++ b/include/hip/hip_cooperative_groups.h
@@ -28,17 +28,14 @@ THE SOFTWARE.
  */
 
 #ifndef  HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H
-#define  HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H
-
-#include <hip/hip_version.h>
-#include <hip/hip_common.h>
+#define HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H
 
 #if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
-#if __cplusplus && defined(__clang__) && defined(__HIP__)
+#if __cplusplus
 #include <hip/hcc_detail/hip_cooperative_groups.h>
 #endif
 #elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
-#include <hip/nvcc_detail/hip_cooperative_groups.h>
+#include <cooperative_groups.h>
 #else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
 #endif
diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h
index 3a26fb74f4..4412bbd7da 100644
--- a/include/hip/hip_runtime_api.h
+++ b/include/hip/hip_runtime_api.h
@@ -32,7 +32,6 @@ THE SOFTWARE.
 
 
 #include <string.h>  // for getDeviceProp
-#include <hip/hip_version.h>
 #include <hip/hip_common.h>
 
 enum {
diff --git a/include/hip/nvcc_detail/hip_cooperative_groups.h b/include/hip/nvcc_detail/hip_cooperative_groups.h
deleted file mode 100644
index 113e600eec..0000000000
--- a/include/hip/nvcc_detail/hip_cooperative_groups.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
-#define HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
-
-// Include CUDA headers
-#include <cuda_runtime.h>
-#include <cooperative_groups.h>
-
-// Include HIP wrapper headers around CUDA
-#include <hip/hip_runtime.h>
-#include <hip/hip_runtime_api.h>
-
-#endif // HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
diff --git a/include/hip/nvcc_detail/hip_runtime.h b/include/hip/nvcc_detail/hip_runtime.h
index e7c3eaf32a..c13540df54 100644
--- a/include/hip/nvcc_detail/hip_runtime.h
+++ b/include/hip/nvcc_detail/hip_runtime.h
@@ -104,13 +104,13 @@ typedef int hipLaunchParm;
 #define HIP_DYNAMIC_SHARED_ATTRIBUTE
 
 #ifdef __HIP_DEVICE_COMPILE__
-#define abort_()                                                                                    \
+#define abort()                                                                                    \
     { asm("trap;"); }
 #undef assert
 #define assert(COND)                                                                               \
     {                                                                                              \
         if (!COND) {                                                                               \
-            abort_();                                                                               \
+            abort();                                                                               \
         }                                                                                          \
     }
 #endif
diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h
index ce1469804e..faa0bf7d7b 100755
--- a/include/hip/nvcc_detail/hip_runtime_api.h
+++ b/include/hip/nvcc_detail/hip_runtime_api.h
@@ -26,7 +26,6 @@ THE SOFTWARE.
 #include <cuda_runtime_api.h>
 #include <cuda.h>
 #include <cuda_profiler_api.h>
-#include <cuda_fp16.h>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/lpl_ca/CMakeLists.txt b/lpl_ca/CMakeLists.txt
index 2473fbc254..c272273c09 100644
--- a/lpl_ca/CMakeLists.txt
+++ b/lpl_ca/CMakeLists.txt
@@ -20,7 +20,6 @@ target_include_directories(lpl
 
 target_compile_options(lpl PUBLIC -Wall)
 target_link_libraries(lpl PUBLIC pthread)
-add_custom_command(TARGET lpl POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/lpl ${PROJECT_BINARY_DIR}/bin/lpl)
 
 install(TARGETS lpl RUNTIME DESTINATION bin)
 #-------------------------------------LPL--------------------------------------#
@@ -44,7 +43,6 @@ find_package(hsa-runtime64 REQUIRED CONFIG
 
 target_link_libraries(ca PUBLIC hsa-runtime64::hsa-runtime64 )
 target_compile_options(ca PUBLIC -DDISABLE_REDUCED_GPU_BLOB_COPY -Wall)
-add_custom_command(TARGET ca POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/ca ${PROJECT_BINARY_DIR}/bin/ca)
 
 install(TARGETS ca RUNTIME DESTINATION bin)
 #-------------------------------------CA---------------------------------------#
diff --git a/packaging/hip-base.txt b/packaging/hip-base.txt
index 9b10ec2c3f..7ba7d3b93a 100644
--- a/packaging/hip-base.txt
+++ b/packaging/hip-base.txt
@@ -21,23 +21,22 @@ set(CPACK_PACKAGE_NAME "hip-base")
 set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [BASE]")
 set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
 set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
+set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
 set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
 set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
-set(CPACK_PACKAGE_VERSION_PATCH @HIP_PACKAGING_VERSION_PATCH@)
-set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
+set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
+set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
 
 set(CPACK_BINARY_DEB "ON")
-set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
-set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
+set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
 set(CPACK_DEBIAN_PACKAGE_DEPENDS "perl (>= 5.0),libfile-which-perl")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-base")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_base")
 
 set(CPACK_BINARY_RPM "ON")
-set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
-set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
+set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
 set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
diff --git a/packaging/hip-doc.txt b/packaging/hip-doc.txt
index 30f05cb6e6..911f2486fd 100644
--- a/packaging/hip-doc.txt
+++ b/packaging/hip-doc.txt
@@ -24,26 +24,25 @@ set(CPACK_PACKAGE_NAME "hip-doc")
 set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [DOCUMENTATION]")
 set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
 set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
+set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
 set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
 set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
-set(CPACK_PACKAGE_VERSION_PATCH @HIP_PACKAGING_VERSION_PATCH@)
-set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
+set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
+set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
 
 set(CPACK_BINARY_DEB "ON")
-set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
-set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE})")
+set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-doc")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_doc")
 
 set(CPACK_BINARY_RPM "ON")
-set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
-set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
+set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
 string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
-set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}")
+set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}")
 set(CPACK_RPM_PACKAGE_OBSOLETES "hip_doc")
 set(CPACK_RPM_PACKAGE_CONFLICTS "hip_doc")
 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
diff --git a/packaging/hip-hcc.txt b/packaging/hip-hcc.txt
index a17bd8ca86..d084e8d966 100644
--- a/packaging/hip-hcc.txt
+++ b/packaging/hip-hcc.txt
@@ -28,29 +28,24 @@ endif()
 set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [HCC]")
 set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
 set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
+set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
 set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
 set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
 set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
-set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
+set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
-
 set(CPACK_BINARY_DEB "ON")
-set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
-set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@), comgr (>= 1.1)")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_hcc")
-
 set(CPACK_BINARY_RPM "ON")
-set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
-set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
 set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
 string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
-set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, comgr >= 1.1")
+set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, comgr >= 1.1")
 set(CPACK_RPM_PACKAGE_OBSOLETES "hip_hcc")
 set(CPACK_RPM_PACKAGE_CONFLICTS "hip_hcc")
 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
diff --git a/packaging/hip-nvcc.txt b/packaging/hip-nvcc.txt
index f5d43533dc..5d3d91ffb6 100644
--- a/packaging/hip-nvcc.txt
+++ b/packaging/hip-nvcc.txt
@@ -10,29 +10,28 @@ set(CPACK_PACKAGE_NAME "hip-nvcc")
 set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [NVCC]")
 set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
 set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
+set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
 set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
 set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
 set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
-set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
+set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
 
 set(CPACK_BINARY_DEB "ON")
-set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
-set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
+set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}), cuda (>= 7.5)")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}), cuda (>= 7.5)")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-nvcc")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_nvcc")
 
 set(CPACK_BINARY_RPM "ON")
-set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
-set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
+set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
 set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
 string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
-set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}, cuda >= 7.5")
+set(CPACK_RPM_PACKAGE_REQUIRES "hip-base = ${HIP_BASE_VERSION}, cuda >= 7.5")
 set(CPACK_RPM_PACKAGE_OBSOLETES "hip_nvcc")
 set(CPACK_RPM_PACKAGE_CONFLICTS "hip_nvcc")
 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
diff --git a/packaging/hip-rocclr.txt b/packaging/hip-rocclr.txt
index ee5ec0c3db..6f5c16bb96 100644
--- a/packaging/hip-rocclr.txt
+++ b/packaging/hip-rocclr.txt
@@ -33,28 +33,27 @@ set(HCC_PACKAGE_NAME "rocclr")
 set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [ROCClr]")
 set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
 set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
+set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
 set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
 set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
 set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
-set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
+set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
 
 set(CPACK_BINARY_DEB "ON")
-set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
-set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
+set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo, hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE}),  comgr (>= 1.1), llvm-amdgpu")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo, hip-base (= ${CPACK_PACKAGE_VERSION}),  comgr (>= 1.1), llvm-amdgpu")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc (= ${CPACK_PACKAGE_VERSION})")
 
 set(CPACK_BINARY_RPM "ON")
-set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
-set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
+set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
 set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
 string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
-set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocminfo, hip-base = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE},  comgr >= 1.1, llvm-amdgpu")
+set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocminfo, hip-base = ${HIP_BASE_VERSION},  comgr >= 1.1, llvm-amdgpu")
 set(CPACK_RPM_PACKAGE_PROVIDES "hip-hcc = ${HIP_BASE_VERSION}")
 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
 set(CPACK_SOURCE_GENERATOR "TGZ")
diff --git a/packaging/hip-samples.txt b/packaging/hip-samples.txt
index 34f0dddd2e..6481cf7bde 100644
--- a/packaging/hip-samples.txt
+++ b/packaging/hip-samples.txt
@@ -12,26 +12,25 @@ set(CPACK_PACKAGE_NAME "hip-samples")
 set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [SAMPLES]")
 set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")
 set(CPACK_PACKAGE_CONTACT "Maneesh Gupta <maneesh.gupta@amd.com>")
+set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_VERSION_PATCH@)
 set(CPACK_PACKAGE_VERSION_MAJOR @HIP_VERSION_MAJOR@)
 set(CPACK_PACKAGE_VERSION_MINOR @HIP_VERSION_MINOR@)
 set(CPACK_PACKAGE_VERSION_PATCH @HIP_VERSION_PATCH@)
-set(CPACK_PACKAGE_VERSION @HIP_VERSION_MAJOR@.@HIP_VERSION_MINOR@.@HIP_PACKAGING_VERSION_PATCH@)
+set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH})
 set(CPACK_GENERATOR "TGZ;DEB;RPM")
 
 set(CPACK_BINARY_DEB "ON")
-set(CPACK_DEBIAN_PACKAGE_RELEASE @CPACK_DEBIAN_PACKAGE_RELEASE@)
-set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT")
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION}-${CPACK_DEBIAN_PACKAGE_RELEASE})")
+set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-base (= ${CPACK_PACKAGE_VERSION})")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-samples")
 set(CPACK_DEBIAN_PACKAGE_REPLACES "hip_samples")
 
 set(CPACK_BINARY_RPM "ON")
-set(CPACK_RPM_PACKAGE_RELEASE @CPACK_RPM_PACKAGE_RELEASE@)
-set(CPACK_RPM_FILE_NAME "RPM-DEFAULT")
+set(CPACK_RPM_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}.x86_64.rpm)
 set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
 string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
-set(CPACK_RPM_PACKAGE_REQUIRES "hip-rocclr = ${HIP_BASE_VERSION}-${CPACK_RPM_PACKAGE_RELEASE}")
+set(CPACK_RPM_PACKAGE_REQUIRES "hip-rocclr = ${HIP_BASE_VERSION}")
 set(CPACK_RPM_PACKAGE_OBSOLETES "hip_samples")
 set(CPACK_RPM_PACKAGE_CONFLICTS "hip_samples")
 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
diff --git a/rocclr/CMakeLists.txt b/rocclr/CMakeLists.txt
index ec1dc50407..187edab746 100755
--- a/rocclr/CMakeLists.txt
+++ b/rocclr/CMakeLists.txt
@@ -96,14 +96,6 @@ find_package(amd_comgr REQUIRED CONFIG
 
 message(STATUS "Code Object Manager found at ${amd_comgr_DIR}.")
 
-find_package(LLVM REQUIRED CONFIG
-   PATHS
-     /opt/rocm/llvm
-   PATH_SUFFIXES
-     lib/cmake/llvm)
-
-message(STATUS "llvm found at ${LLVM_DIR}.")
-
 add_library(hip64 OBJECT
  hip_context.cpp
  hip_code_object.cpp
@@ -156,9 +148,10 @@ endif()
 
 # Short-Term solution for pre-compiled headers for online compilation
 # Enable pre compiled header
-if(${__HIP_ENABLE_PCH})
-    execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_embed_pch.sh ${LLVM_DIR}")
-    add_definitions(-D__HIP_ENABLE_PCH)
+if(${ENABLE_HIP_PCH})
+    execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_gen_pch.sh")
+    execute_process(COMMAND sh -c "${CMAKE_CURRENT_SOURCE_DIR}/../bin/hip_embed_pch.sh")
+    add_definitions(-DENABLE_HIP_PCH)
 endif()
 
 # Enable profiling API
@@ -223,7 +216,7 @@ add_library(device INTERFACE)
 target_link_libraries(device INTERFACE host)
 
 # Short-Term solution for pre-compiled headers for online compilation
-if(${__HIP_ENABLE_PCH})
+if(${ENABLE_HIP_PCH})
   target_link_libraries(amdhip64 PRIVATE ${CMAKE_BINARY_DIR}/hip_pch.o)
 endif()
 
@@ -234,18 +227,6 @@ endif()
 # filename.
 if(${BUILD_SHARED_LIBS})
     target_link_libraries(amdhip64 PRIVATE amdrocclr_static Threads::Threads dl hsa-runtime64::hsa-runtime64)
-
-    add_custom_command(TARGET amdhip64 POST_BUILD COMMAND
-        ${CMAKE_COMMAND} -E create_symlink ${PROJECT_BINARY_DIR}/lib/libamdhip64.so.${HIP_LIB_VERSION_STRING}
-        ${PROJECT_BINARY_DIR}/lib/libhip_hcc.so.${HIP_LIB_VERSION_MAJOR})
-    add_custom_command(TARGET amdhip64 POST_BUILD COMMAND
-        ${CMAKE_COMMAND} -E create_symlink ${PROJECT_BINARY_DIR}/lib/libhip_hcc.so.${HIP_LIB_VERSION_MAJOR}
-        ${PROJECT_BINARY_DIR}/lib/libhip_hcc.so)
-    add_custom_command(TARGET amdhip64 POST_BUILD COMMAND
-        ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/.hipInfo ${PROJECT_BINARY_DIR}/lib/.hipInfo)
-    add_custom_command(TARGET amdhip64 POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_directory
-        ${PROJECT_SOURCE_DIR}/include ${PROJECT_BINARY_DIR}/include)
-
     INSTALL(PROGRAMS $<TARGET_FILE:amdhip64> DESTINATION lib COMPONENT MAIN)
 else()
     target_link_libraries(amdhip64 PRIVATE Threads::Threads dl hsa-runtime64::hsa-runtime64 amd_comgr)
@@ -263,7 +244,6 @@ else()
     INSTALL(PROGRAMS $<TARGET_FILE:amdhip64> DESTINATION lib COMPONENT MAIN)
 endif()
 
-
 INSTALL(TARGETS amdhip64 host device EXPORT hip-targets DESTINATION ${LIB_INSTALL_DIR})
 INSTALL(EXPORT hip-targets DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} NAMESPACE hip::)
 
diff --git a/rocclr/hip_code_object.cpp b/rocclr/hip_code_object.cpp
index b0979c5246..c6a866c9c4 100755
--- a/rocclr/hip_code_object.cpp
+++ b/rocclr/hip_code_object.cpp
@@ -202,10 +202,19 @@ hipError_t DynCO::populateDynGlobalVars() {
     return hipErrorSharedObjectSymbolNotFound;
   }
 
+  if (!dev_program->getUndefinedVarFromCodeObj(&undef_var_names)) {
+    DevLogPrintfError("Could not get undefined Variables for Module: 0x%x \n", module());
+    return hipErrorSharedObjectSymbolNotFound;
+  }
+
   for (auto& elem : var_names) {
     vars_.insert(std::make_pair(elem, new Var(elem, Var::DeviceVarKind::DVK_Variable, 0, 0, 0, nullptr)));
   }
 
+  for (auto& elem : undef_var_names) {
+    vars_.insert(std::make_pair(elem, new Var(elem, Var::DeviceVarKind::DVK_Texture, 0, 0, 0, nullptr)));
+  }
+
   return hipSuccess;
 }
 
@@ -368,4 +377,20 @@ hipError_t StatCO::getStatGlobalVar(const void* hostVar, int deviceId, hipDevice
   *size_ptr = dvar->size();
   return hipSuccess;
 }
+
+// Walks vars_ for a variable named hostVar whose module on deviceId equals hmod.
+hipError_t StatCO::getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod,
+                                          hipDeviceptr_t* dev_ptr, size_t* size_ptr) {
+  amd::ScopedLock lock(sclock_);
+  for (auto& entry : vars_) {
+    auto& var = entry.second;
+    // NOTE(review): module() (hip_global.hpp) always returns nullptr, so only a null hmod matches.
+    if ((var->name() == hostVar) && (var->module(deviceId) == hmod)) {
+      *dev_ptr = var->device_ptr(deviceId);
+      *size_ptr = var->device_size(deviceId);
+      return hipSuccess;
+    }
+  }
+  return hipErrorNotFound;
+}
 }; //namespace: hip
diff --git a/rocclr/hip_code_object.hpp b/rocclr/hip_code_object.hpp
index 0cc2a7051a..f5f179570b 100755
--- a/rocclr/hip_code_object.hpp
+++ b/rocclr/hip_code_object.hpp
@@ -118,6 +118,8 @@ public:
   hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId);
   hipError_t getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr,
                               size_t* size_ptr);
+  hipError_t getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod,
+                                    hipDeviceptr_t* dev_ptr, size_t* size_ptr);
 
 private:
   friend class ::PlatformState;
diff --git a/rocclr/hip_device.cpp b/rocclr/hip_device.cpp
index c0dbc89970..70548d5328 100644
--- a/rocclr/hip_device.cpp
+++ b/rocclr/hip_device.cpp
@@ -155,7 +155,7 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device )
   ::strncpy(deviceProps.name, info.boardName_, 128);
   deviceProps.totalGlobalMem = info.globalMemSize_;
   deviceProps.sharedMemPerBlock = info.localMemSizePerCU_;
-  deviceProps.regsPerBlock = info.availableRegistersPerCU_;
+  deviceProps.regsPerBlock = info.availableSGPRs_;
   deviceProps.warpSize = info.wavefrontWidth_;
   deviceProps.maxThreadsPerBlock = info.maxWorkGroupSize_;
   deviceProps.maxThreadsDim[0] = info.maxWorkItemSizes_[0];
diff --git a/rocclr/hip_fatbin.cpp b/rocclr/hip_fatbin.cpp
index 8072c18b36..95a91063a2 100755
--- a/rocclr/hip_fatbin.cpp
+++ b/rocclr/hip_fatbin.cpp
@@ -12,7 +12,7 @@ FatBinaryDeviceInfo::~FatBinaryDeviceInfo() {
 }
 
 FatBinaryInfo::FatBinaryInfo(const char* fname, const void* image)
-               : fdesc_(amd::Os::FDescInit()), fsize_(0), image_(image), uri_(std::string()) {
+               : fdesc_(-1), fsize_(0), image_(image), uri_(std::string()) {
   guarantee(fname || image);
 
   if (fname != nullptr) {
@@ -41,7 +41,7 @@ FatBinaryInfo::~FatBinaryInfo() {
   }
 
   fname_ = std::string();
-  fdesc_ = amd::Os::FDescInit();
+  fdesc_ = -1;
   fsize_ = 0;
   image_ = nullptr;
   uri_ = std::string();
@@ -64,9 +64,6 @@ hipError_t FatBinaryInfo::ExtractFatBinary(const std::vector<hip::Device*>& devi
     if (!amd::Os::GetFileHandle(fname_.c_str(), &fdesc_, &fsize_)) {
       return hipErrorFileNotFound;
     }
-    if (fsize_ == 0) {
-      return hipErrorInvalidKernelFile;
-    }
 
     // Extract the code object from file
     hip_error = CodeObject::ExtractCodeObjectFromFile(fdesc_, fsize_,
diff --git a/rocclr/hip_global.cpp b/rocclr/hip_global.cpp
index 46e6efcf52..bed2dcd850 100755
--- a/rocclr/hip_global.cpp
+++ b/rocclr/hip_global.cpp
@@ -5,9 +5,7 @@
 #include "hip_code_object.hpp"
 #include "platform/program.hpp"
 
-#ifdef __HIP_ENABLE_PCH
-extern const char __hip_pch[];
-extern unsigned __hip_pch_size;
+#ifdef ENABLE_HIP_PCH
 void __hipGetPCH(const char** pch, unsigned int *size) {
   *pch = __hip_pch;
   *size = __hip_pch_size;
diff --git a/rocclr/hip_global.hpp b/rocclr/hip_global.hpp
index fd57ecfb50..3888daf30b 100755
--- a/rocclr/hip_global.hpp
+++ b/rocclr/hip_global.hpp
@@ -95,6 +95,11 @@ public:
   hipError_t getStatDeviceVar(DeviceVar** dvar, int deviceId);
   void resize_dVar(size_t size) { dVar_.resize(size); }
 
+  // Accessors for name-based lookup. NOTE(review): module() is a stub that always returns nullptr.
+  std::string name() const { return name_; }
+  hipModule_t module(int deviceId) const { return nullptr; }
+  hipDeviceptr_t device_ptr(int deviceId) const { return dVar_[deviceId]->device_ptr(); }
+  size_t device_size(int deviceId) const { return dVar_[deviceId]->size(); }
   FatBinaryInfo** moduleInfo() { return modules_; };
 
 private:
diff --git a/rocclr/hip_internal.hpp b/rocclr/hip_internal.hpp
index 7e0cc8b9a2..a950961ea7 100755
--- a/rocclr/hip_internal.hpp
+++ b/rocclr/hip_internal.hpp
@@ -252,6 +252,8 @@ extern int ihipGetDevice();
 extern hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags);
 extern amd::Memory* getMemoryObject(const void* ptr, size_t& offset);
 extern amd::Memory* getMemoryObjectWithOffset(const void* ptr, const size_t size);
+extern bool CL_CALLBACK getSvarInfo(cl_program program, std::string var_name, void** var_addr,
+                                    size_t* var_size);
 
 constexpr bool kOptionChangeable = true;
 constexpr bool kNewDevProg = false;
diff --git a/rocclr/hip_memory.cpp b/rocclr/hip_memory.cpp
index 8fd9b05cdb..b0e1d6abdd 100755
--- a/rocclr/hip_memory.cpp
+++ b/rocclr/hip_memory.cpp
@@ -124,7 +124,7 @@ hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
   if (*ptr == nullptr) {
     size_t free = 0, total =0;
     hipMemGetInfo(&free, &total);
-    LogPrintfError("Allocation failed : Device memory : required :%zu | free :%zu | total :%zu \n", sizeBytes, free, total);
+    LogPrintfError("Allocation failed : Device memory : required :%zu | free :%zu | total :%zu \n", sizeBytes, free, total);
     return hipErrorOutOfMemory;
   }
 
@@ -202,14 +202,14 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
       }
     } else {
       amd::HostQueue* pQueue = &queue;
-      if ((srcMemory->getContext().devices()[0] == dstMemory->getContext().devices()[0]) &&
-          (queueDevice != srcMemory->getContext().devices()[0])) {
+      if (queueDevice != srcMemory->getContext().devices()[0]) {
         pQueue = hip::getNullStream(srcMemory->getContext());
         amd::Command* cmd = queue.getLastQueuedCommand(true);
         if (cmd != nullptr) {
           waitList.push_back(cmd);
         }
       }
+
       command = new amd::CopyMemoryCommand(*pQueue, CL_COMMAND_COPY_BUFFER, waitList,
           *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes);
     }
@@ -1850,27 +1850,18 @@ hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr,
                         hipExtent extent,
                         hipStream_t stream,
                         bool isAsync = false) {
-  size_t offset = 0;
-  amd::Memory* memory = getMemoryObject(pitchedDevPtr.ptr, offset);
-
-  auto sizeBytes = extent.width * extent.height * extent.depth;
-
-  if (memory == nullptr) {
-    return hipErrorInvalidValue;
-  }
-  if (sizeBytes > memory->getSize()) {
-    return hipErrorInvalidValue;
-  }
-
   if (pitchedDevPtr.pitch == extent.width) {
-    return ihipMemset(pitchedDevPtr.ptr, value, sizeof(int8_t), static_cast<size_t>(sizeBytes), stream, isAsync);
+    return ihipMemset(pitchedDevPtr.ptr, value, sizeof(int8_t), extent.width * extent.height * extent.depth, stream, isAsync);
   }
 
-  // Workaround for cases when pitch > row until fill kernel will be updated to support pitch.
-  // Fall back to filling one row at a time.
+  // Workaround for cases when pitch > row, until the fill kernel is updated to support pitch.
+  // Fall back to filling one row at a time.
 
   amd::HostQueue* queue = hip::getQueue(stream);
 
+  size_t offset = 0;
+  amd::Memory* memory = getMemoryObject(pitchedDevPtr.ptr, offset);
+
   amd::Coord3D origin(offset);
   amd::Coord3D region(pitchedDevPtr.xsize, pitchedDevPtr.ysize, extent.depth);
   amd::BufferRect rect;
@@ -1879,26 +1870,34 @@ hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr,
     return hipErrorInvalidValue;
   }
 
-  std::vector<amd::FillMemoryCommand*> commands;
+  if (memory != nullptr) {
+    std::vector<amd::FillMemoryCommand*> commands;
 
-  for (size_t slice = 0; slice < extent.depth; slice++) {
-    for (size_t row = 0; row < extent.height; row++) {
-      const size_t rowOffset = rect.offset(0, row, slice);
-      amd::FillMemoryCommand *command = new amd::FillMemoryCommand(*queue,
-          CL_COMMAND_FILL_BUFFER, amd::Command::EventWaitList { },
-          *memory->asBuffer(), &value, sizeof(int8_t), amd::Coord3D { rowOffset,
-              0, 0 }, amd::Coord3D { extent.width, 1, 1 });
+    for (size_t slice = 0; slice < extent.depth; slice++) {
+      for (size_t row = 0; row < extent.height; row++) {
+        const size_t rowOffset = rect.offset(0, row, slice);
+        amd::FillMemoryCommand* command = new amd::FillMemoryCommand(*queue,
+                                                                     CL_COMMAND_FILL_BUFFER,
+                                                                     amd::Command::EventWaitList{},
+                                                                     *memory->asBuffer(),
+                                                                     &value,
+                                                                     sizeof(int8_t),
+                                                                     amd::Coord3D{rowOffset, 0, 0},
+                                                                     amd::Coord3D{extent.width, 1, 1});
 
-      command->enqueue();
-      commands.push_back(command);
+        command->enqueue();
+        commands.push_back(command);
+      }
     }
-  }
 
-  for (auto &command : commands) {
-    if (!isAsync) {
-      command->awaitCompletion();
+    for (auto &command: commands) {
+      if (!isAsync) {
+        command->awaitCompletion();
+      }
+      command->release();
     }
-    command->release();
+  } else {
+	return hipErrorInvalidValue;
   }
 
   return hipSuccess;
@@ -2039,7 +2038,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void
   memset(attributes, 0, sizeof(hipPointerAttribute_t));
 
   if (memObj != nullptr) {
-    attributes->memoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & memObj->getMemFlags())? hipMemoryTypeHost : hipMemoryTypeDevice;
+    attributes->memoryType = (CL_MEM_SVM_FINE_GRAIN_BUFFER & memObj->getMemFlags())? hipMemoryTypeHost : hipMemoryTypeDevice;
     if (attributes->memoryType == hipMemoryTypeHost) {
       attributes->hostPointer = static_cast<char*>(memObj->getSvmPtr()) + offset;
     }
diff --git a/rocclr/hip_module.cpp b/rocclr/hip_module.cpp
index b72ee1a5a2..4a09cc6ed0 100755
--- a/rocclr/hip_module.cpp
+++ b/rocclr/hip_module.cpp
@@ -537,7 +537,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
     if (result != hipSuccess) {
       break;
     }
-    prevGridSize += globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ;
+    prevGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z;
   }
 
   // Sync the execution streams on all devices
diff --git a/rocclr/hip_peer.cpp b/rocclr/hip_peer.cpp
index fe22803c33..ded6843957 100755
--- a/rocclr/hip_peer.cpp
+++ b/rocclr/hip_peer.cpp
@@ -97,10 +97,6 @@ hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2,
                                         uint32_t* linktype, uint32_t* hopcount) {
   HIP_INIT_API(hipExtGetLinkTypeAndHopCount, device1, device2, linktype, hopcount);
 
-  if (linktype == nullptr || hopcount == nullptr ||
-      device1 == device2  || device1 < 0 || device2 < 0) {
-    HIP_RETURN(hipErrorInvalidValue);
-  }
   // Fill out the list of LinkAttributes
   std::vector<amd::Device::LinkAttrType> link_attrs;
   link_attrs.push_back(std::make_pair(amd::Device::LinkAttribute::kLinkLinkType, 0));
diff --git a/rocclr/hip_platform.cpp b/rocclr/hip_platform.cpp
index 6abea0df4e..6e6f08bf44 100755
--- a/rocclr/hip_platform.cpp
+++ b/rocclr/hip_platform.cpp
@@ -80,6 +80,27 @@ extern "C" hip::FatBinaryInfo** __hipRegisterFatBinary(const void* data)
   return PlatformState::instance().addFatBinary(fbwrapper->binary);
 }
 
+// Resolves var_name within module hmod for the device reported by ihipGetDevice().
+// getDynGlobalVar is consulted first; getStatGlobalVarByName is the fallback.
+// Returns true as soon as either lookup yields hipSuccess, false when both fail.
+bool PlatformState::getShadowVarInfo(std::string var_name, hipModule_t hmod,
+                                     void** var_addr, size_t* var_size) {
+  amd::ScopedLock lock(lock_);
+
+  const char* name = var_name.c_str();
+  const bool hit = getDynGlobalVar(name, ihipGetDevice(), hmod, var_addr, var_size) == hipSuccess;
+
+  // Operator || short-circuits, so the fallback only runs after the first lookup fails.
+  return hit ||
+         getStatGlobalVarByName(var_name, ihipGetDevice(), hmod, var_addr, var_size) == hipSuccess;
+}
+
+bool CL_CALLBACK getSvarInfo(cl_program program, std::string var_name, void** var_addr,
+                             size_t* var_size) {
+  const auto hmod = reinterpret_cast<hipModule_t>(program);  // program handle is passed on as the module
+  return PlatformState::instance().getShadowVarInfo(var_name, hmod, var_addr, var_size);
+}
+
 extern "C" void __hipRegisterFunction(
   hip::FatBinaryInfo** modules,
   const void*  hostFunction,
@@ -665,19 +686,11 @@ static inline std::uint32_t __convert_float_to_half(float a) noexcept {
   return s | v;
 }
 
-extern "C"
-#if !defined(_MSC_VER)
-__attribute__((weak))
-#endif
-float  __gnu_h2f_ieee(unsigned short h){
+extern "C" __attribute__((weak)) float  __gnu_h2f_ieee(unsigned short h){
   return __convert_half_to_float((std::uint32_t) h);
 }
 
-extern "C"
-#if !defined(_MSC_VER)
-__attribute__((weak))
-#endif
-unsigned short  __gnu_f2h_ieee(float f){
+extern "C" __attribute__((weak)) unsigned short  __gnu_f2h_ieee(float f){
   return (unsigned short)__convert_float_to_half(f);
 }
 
@@ -752,9 +765,6 @@ hipError_t PlatformState::getDynFunc(hipFunction_t* hfunc, hipModule_t hmod,
     DevLogPrintfError("Cannot find the module: 0x%x", hmod);
     return hipErrorNotFound;
   }
-  if (0 == strlen(func_name)) {
-    return hipErrorNotFound;
-  }
 
   return it->second->getDynFunc(hfunc, func_name);
 }
@@ -858,6 +868,11 @@ hipError_t PlatformState::getStatGlobalVar(const void* hostVar, int deviceId, hi
   return statCO_.getStatGlobalVar(hostVar, deviceId, dev_ptr, size_ptr);
 }
 
+hipError_t PlatformState::getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod,
+                                                 hipDeviceptr_t* dev_ptr, size_t* size_ptr) {
+  return statCO_.getStatGlobalVarByName(hostVar, deviceId, hmod, dev_ptr, size_ptr);  // pure delegation to statCO_
+}
+
 void PlatformState::setupArgument(const void *arg, size_t size, size_t offset) {
   auto& arguments = execStack_.top().arguments_;
 
diff --git a/rocclr/hip_platform.hpp b/rocclr/hip_platform.hpp
index 51fea0841e..2bcf620f6d 100755
--- a/rocclr/hip_platform.hpp
+++ b/rocclr/hip_platform.hpp
@@ -77,6 +77,11 @@ public:
   hipError_t getStatFuncAttr(hipFuncAttributes* func_attr, const void* hostFunction, int deviceId);
   hipError_t getStatGlobalVar(const void* hostVar, int deviceId, hipDeviceptr_t* dev_ptr,
                               size_t* size_ptr);
+  hipError_t getStatGlobalVarByName(std::string hostVar, int deviceId, hipModule_t hmod,
+                                    hipDeviceptr_t* dev_ptr, size_t* size_ptr);
+
+  bool getShadowVarInfo(std::string var_name, hipModule_t hmod,
+                            void** var_addr, size_t* var_size);
 
   //Exec Functions
   void setupArgument(const void *arg, size_t size, size_t offset);
diff --git a/samples/0_Intro/bit_extract/CMakeLists.txt b/samples/0_Intro/bit_extract/CMakeLists.txt
deleted file mode 100644
index c9b13be812..0000000000
--- a/samples/0_Intro/bit_extract/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-project(bit_extract)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-
-# Create the excutable
-add_executable(bit_extract bit_extract.cpp)
-
-# Link with HIP
-target_link_libraries(bit_extract hip::host)
\ No newline at end of file
diff --git a/samples/0_Intro/bit_extract/Makefile b/samples/0_Intro/bit_extract/Makefile
index 3427815ffc..4a3a0bb4fe 100644
--- a/samples/0_Intro/bit_extract/Makefile
+++ b/samples/0_Intro/bit_extract/Makefile
@@ -9,15 +9,19 @@ HIPCC=$(HIP_PATH)/bin/hipcc
 
 # Show how to use PLATFORM to specify different options for each compiler:
 ifeq (${HIP_PLATFORM}, nvcc)
-	HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20
+	HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 
 endif
 
 EXE=bit_extract
+EXE_STATIC=bit_extract_static
 
 $(EXE): bit_extract.cpp
 	$(HIPCC) $(HIPCC_FLAGS) $< -o $@
 
-all: $(EXE)
+$(EXE_STATIC): bit_extract.cpp
+	$(HIPCC) -use-staticlib  $(HIPCC_FLAGS) $< -o $@
+
+all: $(EXE) $(EXE_STATIC)
 
 clean:
-	rm -f *.o $(EXE)
+	rm -f *.o $(EXE) $(EXE_STATIC)
diff --git a/samples/0_Intro/module_api/CMakeLists.txt b/samples/0_Intro/module_api/CMakeLists.txt
deleted file mode 100644
index 0f5cc32f91..0000000000
--- a/samples/0_Intro/module_api/CMakeLists.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-project(module_api)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-
-# Create the excutable
-add_executable(runKernel.hip.out runKernel.cpp)
-add_executable(launchKernelHcc.hip.out launchKernelHcc.cpp)
-add_executable(defaultDriver.hip.out defaultDriver.cpp)
-
-# Generate code object
-add_custom_target(
-  codeobj
-  ALL
-  COMMAND  ${HIP_HIPCC_EXECUTABLE} --genco  ../vcpy_kernel.cpp -o vcpy_kernel.code
-  COMMENT "codeobj generated"
-)
-
-add_dependencies(runKernel.hip.out codeobj)
-add_dependencies(launchKernelHcc.hip.out codeobj)
-add_dependencies(defaultDriver.hip.out codeobj)
-
-# Link with HIP
-target_link_libraries(runKernel.hip.out hip::host)
-target_link_libraries(launchKernelHcc.hip.out hip::host)
-target_link_libraries(defaultDriver.hip.out hip::host)
diff --git a/samples/0_Intro/module_api_global/CMakeLists.txt b/samples/0_Intro/module_api_global/CMakeLists.txt
deleted file mode 100644
index 00caa79cfa..0000000000
--- a/samples/0_Intro/module_api_global/CMakeLists.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-project(modile_api_global)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-
-# Create the excutable
-add_executable(runKernel.hip.out runKernel.cpp)
-
-# Generate code object
-add_custom_target(
-  codeobj
-  ALL
-  COMMAND  ${HIP_HIPCC_EXECUTABLE} --genco  ../vcpy_kernel.cpp -o vcpy_kernel.code
-  COMMENT "codeobj generated"
-)
-
-add_dependencies(runKernel.hip.out codeobj)
-
-# Link with HIP
-target_link_libraries(runKernel.hip.out hip::host)
\ No newline at end of file
diff --git a/samples/0_Intro/square/CMakeLists.txt b/samples/0_Intro/square/CMakeLists.txt
deleted file mode 100644
index 845c43fd1f..0000000000
--- a/samples/0_Intro/square/CMakeLists.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-#Follow "README.md" to generate square.cpp if it's missing
-
-project(square)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-
-# Create the excutable
-add_executable(square square.cpp)
-
-# Link with HIP
-target_link_libraries(square hip::host)
\ No newline at end of file
diff --git a/samples/0_Intro/square/Makefile b/samples/0_Intro/square/Makefile
index 9bb0dd8205..aa046eeaaa 100644
--- a/samples/0_Intro/square/Makefile
+++ b/samples/0_Intro/square/Makefile
@@ -11,7 +11,7 @@ else
 	SOURCES=square.cpp
 endif
 
-all: square.out
+all: square.out square.out.static
 
 # Step
 square.cpp: square.cu
@@ -20,5 +20,8 @@ square.cpp: square.cu
 square.out: $(SOURCES)
 	$(HIPCC) $(CXXFLAGS) $(SOURCES) -o $@
 
+square.out.static: $(SOURCES)
+	$(HIPCC) -use-staticlib $(CXXFLAGS) $(SOURCES) -o $@
+
 clean:
-	rm -f *.o *.out square.cpp
+	rm -f *.o *.out *.out.static square.cpp
diff --git a/samples/0_Intro/square/README.md b/samples/0_Intro/square/README.md
index 0bbb2f7e39..c185903993 100644
--- a/samples/0_Intro/square/README.md
+++ b/samples/0_Intro/square/README.md
@@ -1,39 +1,13 @@
 # Square.md
 
-Simple test which shows how to use hipify-perl to port CUDA code to HIP.
-See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example.
+Simple test which shows how to use hipify-perl to port CUDA code to HIP.  
+See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example. 
 Now it is even simpler and requires no manual modification to the hipified source code - just hipify and compile:
 
-- Add hip/bin path to the PATH
+1. Add the hip/bin directory to your PATH:
+    <code>export PATH=$PATH:[MYHIP]/bin</code>
 
-```
-$ export PATH=$PATH:[MYHIP]/bin
-```
-
-- Define environment variable
-
-```
-$ export HIP_PATH=[MYHIP]
-```
-
-- Build executible file
-
-```
-$ cd ~/hip/samples/0_Intro/square
-$ make
-/home/user/hip/bin/hipify-perl square.cu > square.cpp
-/home/user/hip/bin/hipcc  square.cpp -o square.out
-/home/user/hip/bin/hipcc -use-staticlib  square.cpp -o square.out.static
-```
-- Execute file
-```
-$ ./square.out
-info: running on device Navi 14 [Radeon Pro W5500]
-info: allocate host mem (  7.63 MB)
-info: allocate device mem (  7.63 MB)
-info: copy Host2Device
-info: launch 'vector_square' kernel
-info: copy Device2Host
-info: check result
-PASSED!
-```
+2. <code>$ make </code>
+   `make` runs the steps below; they work on either the CUDA or the AMD platform:
+   <code>hipify-perl square.cu > square.cpp </code>    # convert cuda code to hip code
+   <code>hipcc square.cpp</code>                       # compile into executable
diff --git a/samples/1_Utils/hipBusBandwidth/CMakeLists.txt b/samples/1_Utils/hipBusBandwidth/CMakeLists.txt
deleted file mode 100644
index df01c31d97..0000000000
--- a/samples/1_Utils/hipBusBandwidth/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-project(hipBusBandwidth)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(hipBusBandwidth hipBusBandwidth.cpp ResultDatabase.cpp)
-
-# Link with HIP
-target_link_libraries(hipBusBandwidth hip::host)
\ No newline at end of file
diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp
index 8032bd0a20..6181c49afe 100644
--- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp
+++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp
@@ -12,7 +12,7 @@ enum MallocMode { MallocPinned, MallocUnpinned, MallocRegistered };
 bool p_verbose = false;
 MallocMode p_malloc_mode = MallocPinned;
 int p_numa_ctl = -1;
-int p_iterations = 0;
+int p_iterations = 10;
 int p_beatsperiteration = 1;
 int p_device = 0;
 int p_detailed = 0;
@@ -89,9 +89,7 @@ hipError_t memcopy(void* dst, const void* src, size_t sizeBytes, enum hipMemcpyK
 int sizes[] = {-64, -256, -512, 1,    2,    4,     8,     16,    32,     64,     128,   256,
                512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288};
 int nSizes = sizeof(sizes) / sizeof(int);
-// iterations to be run for the corresponding sizes, less number as the size increases
-int iterations[] = {1000, 1000, 1000, 1000, 500, 500, 500, 500, 500, 200, 200, 200,
-               200, 200, 100, 100, 100, 100, 50, 50, 50, 20, 20};
+
 
 // ****************************************************************************
 // Function: RunBenchmark_H2D
@@ -176,48 +174,53 @@ void RunBenchmark_H2D(ResultDatabase& resultDB) {
     hipEventCreate(&stop);
     CHECK_HIP_ERROR();
 
-    // store the times temporarily to estimate latency
-    // float times[nSizes];
-    for (int i = 0; i < nSizes; i++) {
-      int sizeIndex, iterIndex;
-      sizeIndex = i;
-      iterIndex = i;
+    // Run p_iterations passes over the size table, alternating direction each pass
+    for (int pass = 0; pass < p_iterations; pass++) {
+        // store the times temporarily to estimate latency
+        // float times[nSizes];
+        // Step through sizes forward on even passes and backward on odd
+        for (int i = 0; i < nSizes; i++) {
+            int sizeIndex;
+            if ((pass % 2) == 0)
+                sizeIndex = i;
+            else
+                sizeIndex = (nSizes - 1) - i;
 
-      const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-      const int nbytes = sizeToBytes(thisSize);
-      const int niter = p_iterations ? p_iterations : iterations[iterIndex];
-      for (int pass = 0; pass < niter; pass++) {
+            const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+            const int nbytes = sizeToBytes(thisSize);
 
-        hipEventRecord(start, 0);
-        for (int j = 0; j < p_beatsperiteration; j++) {
-          memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice);
+            hipEventRecord(start, 0);
+            for (int j = 0; j < p_beatsperiteration; j++) {
+                memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice);
+            }
+            hipEventRecord(stop, 0);
+            hipEventSynchronize(stop);
+            float t = 0;
+            hipEventElapsedTime(&t, start, stop);
+            // times[sizeIndex] = t;
+
+            // Convert to GB/sec
+            if (p_verbose) {
+                std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+            }
+
+            double speed =
+                (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) /  1000) / t;
+            char sizeStr[256];
+            if (p_beatsperiteration > 1) {
+                sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
+            } else {
+                sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+            }
+            resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
+                               sizeStr, "GB/sec", speed);
+            resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr,
+                               "ms", t);
+
+            if (p_onesize) {
+                break;
+            }
         }
-        hipEventRecord(stop, 0);
-        hipEventSynchronize(stop);
-        float t = 0;
-        hipEventElapsedTime(&t, start, stop);
-        // times[sizeIndex] = t;
-        // Convert to GB/sec
-        if (p_verbose) {
-          std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
-        }
-
-        double speed =
-            (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) /  1000) / t;
-        char sizeStr[256];
-        if (p_beatsperiteration > 1) {
-          sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
-        } else {
-          sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-        }
-        resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
-                           sizeStr, "GB/sec", speed);
-        resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr,                                            "ms", t);
-
-      }
-      if (p_onesize) {
-        break;
-      }
     }
 
     if (p_onesize) {
@@ -344,50 +347,53 @@ void RunBenchmark_D2H(ResultDatabase& resultDB) {
     hipEventCreate(&stop);
     CHECK_HIP_ERROR();
 
-    // store the times temporarily to estimate latency
-    // float times[nSizes];
-    for (int i = 0; i < nSizes; i++) {
-      int sizeIndex, iterIndex;
-      sizeIndex = i;
-      iterIndex = i;
+    // Run p_iterations passes over the size table, alternating direction each pass
+    for (int pass = 0; pass < p_iterations; pass++) {
+        // store the times temporarily to estimate latency
+        // float times[nSizes];
+        // Step through sizes forward on even passes and backward on odd
+        for (int i = 0; i < nSizes; i++) {
+            int sizeIndex;
+            if ((pass % 2) == 0)
+                sizeIndex = i;
+            else
+                sizeIndex = (nSizes - 1) - i;
 
-      const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-      const int nbytes = sizeToBytes(thisSize);
-      const int niter = p_iterations ? p_iterations : iterations[iterIndex];
-      for (int pass = 0; pass < niter; pass++) {
+            const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+            const int nbytes = sizeToBytes(thisSize);
 
-        hipEventRecord(start, 0);
-        for (int j = 0; j < p_beatsperiteration; j++) {
-          memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost);
+            hipEventRecord(start, 0);
+            for (int j = 0; j < p_beatsperiteration; j++) {
+                memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost);
+            }
+            hipEventRecord(stop, 0);
+            hipEventSynchronize(stop);
+            float t = 0;
+            hipEventElapsedTime(&t, start, stop);
+            // times[sizeIndex] = t;
+
+            // Convert to GB/sec
+            if (p_verbose) {
+                std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+            }
+
+            double speed =
+                (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
+            char sizeStr[256];
+            sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+            if (p_beatsperiteration > 1) {
+                sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
+            } else {
+                sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+            }
+            resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
+                               sizeStr, "GB/sec", speed);
+            resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode),
+                               sizeStr, "ms", t);
+            if (p_onesize) {
+                break;
+            }
         }
-        hipEventRecord(stop, 0);
-        hipEventSynchronize(stop);
-        float t = 0;
-        hipEventElapsedTime(&t, start, stop);
-        // times[sizeIndex] = t;
-        // Convert to GB/sec
-        if (p_verbose) {
-          std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
-        }
-
-        double speed =
-            (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
-        char sizeStr[256];
-        sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-        if (p_beatsperiteration > 1) {
-          sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
-        } else {
-          sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-        }
-        resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
-                           sizeStr, "GB/sec", speed);
-        resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode),
-                           sizeStr, "ms", t);
-
-      }
-      if (p_onesize) {
-        break;
-      }
     }
 
     if (p_onesize) {
@@ -516,43 +522,43 @@ void RunBenchmark_Bidir(ResultDatabase& resultDB) {
     hipStreamCreate(&stream[0]);
     hipStreamCreate(&stream[1]);
 
-    // store the times temporarily to estimate latency
-    // float times[nSizes];
-    for (int i = 0; i < nSizes; i++) {
-      int sizeIndex, iterIndex;
-      sizeIndex = i;
-      iterIndex = i;
+    // Run p_iterations passes, alternating forward and backward through the sizes
+    for (int pass = 0; pass < p_iterations; pass++) {
+        // store the times temporarily to estimate latency
+        // float times[nSizes];
+        // Step through sizes forward on even passes and backward on odd
+        for (int i = 0; i < nSizes; i++) {
+            int sizeIndex;
+            if ((pass % 2) == 0)
+                sizeIndex = i;
+            else
+                sizeIndex = (nSizes - 1) - i;
 
-      const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-      const int nbytes = sizeToBytes(thisSize);
-      const int niter = p_iterations ? p_iterations : iterations[iterIndex];
-      for (int pass = 0; pass < niter; pass++) {
+            const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+            const int nbytes = sizeToBytes(thisSize);
 
-        hipEventRecord(start, 0);
-        hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]);
-        hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]);
-        hipEventRecord(stop, 0);
-        hipEventSynchronize(stop);
-        float t = 0;
-        hipEventElapsedTime(&t, start, stop);
+            hipEventRecord(start, 0);
+            hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]);
+            hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]);
+            hipEventRecord(stop, 0);
+            hipEventSynchronize(stop);
+            float t = 0;
+            hipEventElapsedTime(&t, start, stop);
 
-        // Convert to GB/sec
-        if (p_verbose) {
-          std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+            // Convert to GB/sec
+            if (p_verbose) {
+                std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+            }
+
+            double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t;
+            char sizeStr[256];
+            sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+            resultDB.AddResult(
+                std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr,
+                "GB/sec", speed);
+            resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode),
+                               sizeStr, "ms", t);
         }
-
-        double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t;
-        char sizeStr[256];
-        sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-        resultDB.AddResult(
-            std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr,
-            "GB/sec", speed);
-        resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode),
-                           sizeStr, "ms", t);
-      }
-      if (p_onesize) {
-        break;
-      }
     }
 
     // Cleanup
@@ -702,63 +708,66 @@ void RunBenchmark_P2P_Unidir(ResultDatabase& resultDB) {
             hipEventCreate(&stop);
             CHECK_HIP_ERROR();
 
-            // store the times temporarily to estimate latency
-            // float times[nSizes];
-            for (int i = 0; i < nSizes; i++) {
-              int sizeIndex, iterIndex;
-              sizeIndex = i;
-              iterIndex = i;
+            // Run p_iterations passes, alternating forward and backward through the sizes
+            for (int pass = 0; pass < p_iterations; pass++) {
+                // store the times temporarily to estimate latency
+                // float times[nSizes];
+                // Step through sizes forward on even passes and backward on odd
+                for (int i = 0; i < nSizes; i++) {
+                    int sizeIndex;
+                    if ((pass % 2) == 0)
+                        sizeIndex = i;
+                    else
+                        sizeIndex = (nSizes - 1) - i;
 
-              const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-              const int nbytes = sizeToBytes(thisSize);
-              const int niter = p_iterations ? p_iterations : iterations[iterIndex];
-              for (int pass = 0; pass < niter; pass++) {
+                    const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+                    const int nbytes = sizeToBytes(thisSize);
 
-                hipDeviceSynchronize();
+                    hipDeviceSynchronize();
 
-                hipEventRecord(start, 0);
+                    hipEventRecord(start, 0);
 
-                for (int j = 0; j < p_beatsperiteration; j++) {
-                  hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
-                }
+                    for (int j = 0; j < p_beatsperiteration; j++) {
+                        hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
+                    }
 
-                hipEventRecord(stop, 0);
+                    hipEventRecord(stop, 0);
 
-                hipEventSynchronize(stop);
+                    hipEventSynchronize(stop);
 
-                float t = 0;
-                hipEventElapsedTime(&t, start, stop);
-                // times[sizeIndex] = t;
+                    float t = 0;
+                    hipEventElapsedTime(&t, start, stop);
+                    // times[sizeIndex] = t;
 
-                // Convert to GB/sec
-                if (p_verbose) {
-                  std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
-                }
+                    // Convert to GB/sec
+                    if (p_verbose) {
+                        std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+                    }
 
-                double speed =
-                    (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
-                char sizeStr[256];
-                if (p_beatsperiteration > 1) {
-                  sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
-                          p_beatsperiteration);
-                } else {
-                  sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-                }
+                    double speed =
+                        (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
+                    char sizeStr[256];
+                    if (p_beatsperiteration > 1) {
+                        sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
+                                p_beatsperiteration);
+                    } else {
+                        sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+                    }
 
-                string cGpu, pGpu;
-                cGpu = gpuIDToString(currentGpu);
-                pGpu = gpuIDToString(peerGpu);
+                    string cGpu, pGpu;
+                    cGpu = gpuIDToString(currentGpu);
+                    pGpu = gpuIDToString(peerGpu);
 
-                resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) +
-                                       "_gpu" + std::string(pGpu),
+                    resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) +
+                                           "_gpu" + std::string(pGpu),
                                        sizeStr, "GB/sec", speed);
-                resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) +
-                                       "_gpu" + std::string(pGpu),
+                    resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) +
+                                           "_gpu" + std::string(pGpu),
                                        sizeStr, "ms", t);
 
-              }
-                if (p_onesize) {
-                  break;
+                    if (p_onesize) {
+                        break;
+                    }
                 }
             }
 
@@ -820,68 +829,71 @@ void RunBenchmark_P2P_Bidir(ResultDatabase& resultDB) {
             hipStreamCreate(&stream[0]);
             hipStreamCreate(&stream[1]);
 
-            // store the times temporarily to estimate latency
-            // float times[nSizes];
-            for (int i = 0; i < nSizes; i++) {
-              int sizeIndex, iterIndex;
-              sizeIndex = i;
-              iterIndex = i;
+            // Run p_iterations passes, alternating forward and backward through the sizes
+            for (int pass = 0; pass < p_iterations; pass++) {
+                // store the times temporarily to estimate latency
+                // float times[nSizes];
+                // Step through sizes forward on even passes and backward on odd
+                for (int i = 0; i < nSizes; i++) {
+                    int sizeIndex;
+                    if ((pass % 2) == 0)
+                        sizeIndex = i;
+                    else
+                        sizeIndex = (nSizes - 1) - i;
 
-              const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
-              const int nbytes = sizeToBytes(thisSize);
-              const int niter = p_iterations ? p_iterations : iterations[iterIndex];
-              for (int pass = 0; pass < niter; pass++) {
+                    const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+                    const int nbytes = sizeToBytes(thisSize);
 
-                hipDeviceSynchronize();
+                    hipDeviceSynchronize();
 
-                hipEventRecord(start, 0);
+                    hipEventRecord(start, 0);
 
-                for (int j = 0; j < p_beatsperiteration; j++) {
-                  hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes,
-                                 hipMemcpyDeviceToDevice, stream[0]);
-                  hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes,
-                                 hipMemcpyDeviceToDevice, stream[1]);
+                    for (int j = 0; j < p_beatsperiteration; j++) {
+                        hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes,
+                                       hipMemcpyDeviceToDevice, stream[0]);
+                        hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes,
+                                       hipMemcpyDeviceToDevice, stream[1]);
+                    }
+
+                    hipEventRecord(stop, 0);
+
+                    hipEventSynchronize(stop);
+
+                    float t = 0;
+                    hipEventElapsedTime(&t, start, stop);
+                    // times[sizeIndex] = t;
+
+                    // Convert to GB/sec
+                    if (p_verbose) {
+                        std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+                    }
+
+                    double speed =
+                        (double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) /
+                        t;
+                    char sizeStr[256];
+                    if (p_beatsperiteration > 1) {
+                        sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
+                                p_beatsperiteration);
+                    } else {
+                        sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+                    }
+
+                    string cGpu, pGpu;
+                    cGpu = gpuIDToString(currentGpu);
+                    pGpu = gpuIDToString(peerGpu);
+
+                    resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
+                                           std::string(pGpu),
+                                       sizeStr, "GB/sec", speed);
+                    resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
+                                           std::string(pGpu),
+                                       sizeStr, "ms", t);
+
+                    if (p_onesize) {
+                        break;
+                    }
                 }
-
-                hipEventRecord(stop, 0);
-
-                hipEventSynchronize(stop);
-
-                float t = 0;
-                hipEventElapsedTime(&t, start, stop);
-                // times[sizeIndex] = t;
-
-                // Convert to GB/sec
-                if (p_verbose) {
-                  std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
-                }
-
-                double speed =
-                    (double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) /
-                    t;
-                char sizeStr[256];
-                if (p_beatsperiteration > 1) {
-                  sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
-                          p_beatsperiteration);
-                } else {
-                  sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-                }
-
-                string cGpu, pGpu;
-                cGpu = gpuIDToString(currentGpu);
-                pGpu = gpuIDToString(peerGpu);
-
-                resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
-                                       std::string(pGpu),
-                                   sizeStr, "GB/sec", speed);
-                resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
-                                       std::string(pGpu),
-                                   sizeStr, "ms", t);
-
-              }
-              if (p_onesize) {
-                break;
-              }
             }
 
             if (p_onesize) {
diff --git a/samples/1_Utils/hipCommander/CMakeLists.txt b/samples/1_Utils/hipCommander/CMakeLists.txt
deleted file mode 100644
index 2592020c66..0000000000
--- a/samples/1_Utils/hipCommander/CMakeLists.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-project(hipCommander)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(hipCommander hipCommander.cpp)
-
-# Generate code object
-add_custom_target(
-  codeobj
-  ALL
-  COMMAND  ${HIP_HIPCC_EXECUTABLE} --genco  ../nullkernel.hip.cpp -o nullkernel.hsaco
-  COMMENT "codeobj generated"
-)
-
-add_dependencies(hipCommander codeobj)
-
-# Link with HIP
-target_link_libraries(hipCommander hip::host)
-set_property(TARGET hipCommander PROPERTY CXX_STANDARD 11)
diff --git a/samples/1_Utils/hipDispatchLatency/CMakeLists.txt b/samples/1_Utils/hipDispatchLatency/CMakeLists.txt
deleted file mode 100644
index b267f91905..0000000000
--- a/samples/1_Utils/hipDispatchLatency/CMakeLists.txt
+++ /dev/null
@@ -1,35 +0,0 @@
-project(hipDispatchLatency)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(hipDispatchLatency hipDispatchLatency.cpp)
-add_executable(hipDispatchEnqueueRateMT hipDispatchEnqueueRateMT.cpp)
-
-# Generate code object
-add_custom_target(
-  codeobj
-  ALL
-  COMMAND  ${HIP_HIPCC_EXECUTABLE} --genco  ../test_kernel.cpp -o test_kernel.code
-  COMMENT "codeobj generated"
-)
-
-add_dependencies(hipDispatchLatency codeobj)
-add_dependencies(hipDispatchEnqueueRateMT codeobj)
-
-# Link with HIP
-target_link_libraries(hipDispatchLatency hip::host)
-target_link_libraries(hipDispatchEnqueueRateMT hip::host)
-set_property(TARGET hipDispatchLatency PROPERTY CXX_STANDARD 11)
-set_property(TARGET hipDispatchEnqueueRateMT PROPERTY CXX_STANDARD 11)
diff --git a/samples/1_Utils/hipInfo/CMakeLists.txt b/samples/1_Utils/hipInfo/CMakeLists.txt
deleted file mode 100644
index f3678d3160..0000000000
--- a/samples/1_Utils/hipInfo/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-project(hipInfo)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(hipInfo hipInfo.cpp)
-
-# Link with HIP
-target_link_libraries(hipInfo hip::host)
diff --git a/samples/2_Cookbook/0_MatrixTranspose/CMakeLists.txt b/samples/2_Cookbook/0_MatrixTranspose/CMakeLists.txt
deleted file mode 100644
index de5bb0b5ea..0000000000
--- a/samples/2_Cookbook/0_MatrixTranspose/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-project(MatrixTranspose)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(MatrixTranspose MatrixTranspose.cpp)
-
-# Link with HIP
-target_link_libraries(MatrixTranspose hip::host)
diff --git a/samples/2_Cookbook/10_inline_asm/CMakeLists.txt b/samples/2_Cookbook/10_inline_asm/CMakeLists.txt
deleted file mode 100644
index 7adb51f5de..0000000000
--- a/samples/2_Cookbook/10_inline_asm/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-project(inline_asm)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(inline_asm inline_asm.cpp)
-
-# Link with HIP
-target_link_libraries(inline_asm hip::host)
diff --git a/samples/2_Cookbook/11_texture_driver/CMakeLists.txt b/samples/2_Cookbook/11_texture_driver/CMakeLists.txt
deleted file mode 100644
index 8ff242c993..0000000000
--- a/samples/2_Cookbook/11_texture_driver/CMakeLists.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-project(texture2dDrv)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(texture2dDrv texture2dDrv.cpp)
-
-# Generate code object
-add_custom_target(
-  codeobj
-  ALL
-  COMMAND  ${HIP_HIPCC_EXECUTABLE} --genco  ../tex2dKernel.cpp -o tex2dKernel.code
-  COMMENT "codeobj generated"
-)
-
-add_dependencies(texture2dDrv codeobj)
-
-# Link with HIP
-target_link_libraries(texture2dDrv hip::host)
diff --git a/samples/2_Cookbook/13_occupancy/CMakeLists.txt b/samples/2_Cookbook/13_occupancy/CMakeLists.txt
deleted file mode 100644
index 6cad76a395..0000000000
--- a/samples/2_Cookbook/13_occupancy/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-project(occupancy)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(occupancy occupancy.cpp)
-
-# Link with HIP
-target_link_libraries(occupancy hip::host)
diff --git a/samples/2_Cookbook/1_hipEvent/CMakeLists.txt b/samples/2_Cookbook/1_hipEvent/CMakeLists.txt
deleted file mode 100644
index 6f6ee4e050..0000000000
--- a/samples/2_Cookbook/1_hipEvent/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-project(hipEvent)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(hipEvent hipEvent.cpp)
-
-# Link with HIP
-target_link_libraries(hipEvent hip::host)
diff --git a/samples/2_Cookbook/3_shared_memory/CMakeLists.txt b/samples/2_Cookbook/3_shared_memory/CMakeLists.txt
deleted file mode 100644
index 6401488628..0000000000
--- a/samples/2_Cookbook/3_shared_memory/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-project(sharedMemory)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(sharedMemory sharedMemory.cpp)
-
-# Link with HIP
-target_link_libraries(sharedMemory hip::host)
diff --git a/samples/2_Cookbook/4_shfl/CMakeLists.txt b/samples/2_Cookbook/4_shfl/CMakeLists.txt
deleted file mode 100644
index 9d142eeb02..0000000000
--- a/samples/2_Cookbook/4_shfl/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-project(shfl)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_BUILD_TYPE Release)
-
-# Create the excutable
-add_executable(shfl shfl.cpp)
-
-# Link with HIP
-target_link_libraries(shfl hip::host)
diff --git a/samples/2_Cookbook/5_2dshfl/CMakeLists.txt b/samples/2_Cookbook/5_2dshfl/CMakeLists.txt
deleted file mode 100644
index adc0e3595d..0000000000
--- a/samples/2_Cookbook/5_2dshfl/CMakeLists.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-project(2dshfl)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-
-# Create the excutable
-add_executable(2dshfl 2dshfl.cpp)
-
-# Link with HIP
-target_link_libraries(2dshfl hip::host)
diff --git a/samples/2_Cookbook/6_dynamic_shared/CMakeLists.txt b/samples/2_Cookbook/6_dynamic_shared/CMakeLists.txt
deleted file mode 100644
index f177952d5a..0000000000
--- a/samples/2_Cookbook/6_dynamic_shared/CMakeLists.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-project(dynamic_shared)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-
-# Create the excutable
-add_executable(dynamic_shared dynamic_shared.cpp)
-
-# Link with HIP
-target_link_libraries(dynamic_shared hip::host)
diff --git a/samples/2_Cookbook/7_streams/CMakeLists.txt b/samples/2_Cookbook/7_streams/CMakeLists.txt
deleted file mode 100644
index fac4187b47..0000000000
--- a/samples/2_Cookbook/7_streams/CMakeLists.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-project(stream)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-
-# Create the excutable
-add_executable(stream stream.cpp)
-
-# Link with HIP
-target_link_libraries(stream hip::host)
diff --git a/samples/2_Cookbook/8_peer2peer/CMakeLists.txt b/samples/2_Cookbook/8_peer2peer/CMakeLists.txt
deleted file mode 100644
index 7c38373911..0000000000
--- a/samples/2_Cookbook/8_peer2peer/CMakeLists.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-project(peer2peer)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-
-# Create the excutable
-add_executable(peer2peer peer2peer.cpp)
-
-# Link with HIP
-target_link_libraries(peer2peer hip::host)
diff --git a/samples/2_Cookbook/9_unroll/CMakeLists.txt b/samples/2_Cookbook/9_unroll/CMakeLists.txt
deleted file mode 100644
index fc1b740e33..0000000000
--- a/samples/2_Cookbook/9_unroll/CMakeLists.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-project(unroll)
-
-cmake_minimum_required(VERSION 3.10)
-
-# Search for rocm in common locations
-list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
-
-# Find hip
-find_package(hip)
-
-# Set compiler and linker
-set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
-set(CMAKE_CXX_LINKER   ${HIP_HIPCC_EXECUTABLE})
-
-# Create the excutable
-add_executable(unroll unroll.cpp)
-
-# Link with HIP
-target_link_libraries(unroll hip::host)
diff --git a/samples/README.md b/samples/README.md
deleted file mode 100644
index 739045382e..0000000000
--- a/samples/README.md
+++ /dev/null
@@ -1,27 +0,0 @@
-Build procedure
-
-We provide Makefile and CMakeLists.txt to build the samples seperately.
-
-1.Makefile supports shared lib of hip-rocclr runtime and nvcc.
-
-To build a sample, just type in sample folder,
-
-make
-
-
-
-2.CMakeLists.txt can support shared and static libs of hip-rocclr runtime.
-
-To build a sample, type in sample folder,
-
-mkdir build (if build folder is missing)
-
-cd build
-
-cmake ..
-
-make
-
-If you want debug version, follow,
-
-cmake -DCMAKE_BUILD_TYPE=Debug ..
\ No newline at end of file
diff --git a/tests/hit/HIT.cmake b/tests/hit/HIT.cmake
old mode 100755
new mode 100644
index 839b90befb..1677d93a20
--- a/tests/hit/HIT.cmake
+++ b/tests/hit/HIT.cmake
@@ -303,7 +303,6 @@ macro(MAKE_TEST _config exe)
         add_test(NAME ${testname} CONFIGURATIONS ${_config} COMMAND ${PROJECT_BINARY_DIR}/${exe} ${ARGN})
     endif()
     set_tests_properties(${testname} PROPERTIES PASS_REGULAR_EXPRESSION "PASSED" ENVIRONMENT HIP_PATH=${HIP_ROOT_DIR})
-    set_tests_properties(${testname} PROPERTIES SKIP_RETURN_CODE 127 ENVIRONMENT HIP_PATH=${HIP_ROOT_DIR})
 endmacro()
 
 macro(MAKE_NAMED_TEST _config exe testname)
diff --git a/tests/performance/compute/hipPerfMandelbrot.cpp b/tests/performance/compute/hipPerfMandelbrot.cpp
deleted file mode 100644
index c4234d8c37..0000000000
--- a/tests/performance/compute/hipPerfMandelbrot.cpp
+++ /dev/null
@@ -1,747 +0,0 @@
-/*
- Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-#include <hip/hip_vector_types.h>
-#include <hip/math_functions.h>
-#include <vector>
-#include <string>
-#include <map>
-
-typedef struct {
-  double x;
-  double y;
-  double width;
-} coordRec;
-
-coordRec coords[] = {
-    {0.0, 0.0, 4.0},                                     // Whole set
-    {0.0, 0.0, 0.00001},                                 // All black
-    {-0.0180789661868, 0.6424294066162, 0.00003824140},  // Hit detail
-};
-
-static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
-
-template <typename T>
-__global__ void float_mad_kernel(uint *out, uint width, T xPos,  T yPos, T xStep, T yStep,
-                                  uint maxIter) {
-
-#pragma FP_CONTRACT ON
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % width;
-  int j = tid / width;
-  float x0 = (float)(xPos + xStep*i);
-  float y0 = (float)(yPos + yStep*j);
-
-  float x = x0;
-  float y = y0;
-
-  uint iter = 0;
-  float tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
-    tmp = x;
-    x = fma(-y,y,fma(x,x,x0));
-    y = fma(2.0f*tmp,y,y0);
-  }
-
-  out[tid] = iter;
-};
-
-template <typename T>
-__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
-    T yPos, T xStep, T yStep, uint maxIter) {
-
-#pragma FP_CONTRACT ON
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % width;
-  int j = tid / width;
-  float x0 = (float)(xPos + xStep*(float)i);
-  float y0 = (float)(yPos + yStep*(float)j);
-
-  float x = x0;
-  float y = y0;
-
-#define FAST
-  uint iter = 0;
-  float tmp;
-  int stay;
-  int ccount = 0;
-  stay = (x*x+y*y) <= 4.0;
-  float savx = x;
-  float savy = y;
-#ifdef FAST
-  for (iter = 0; (iter < maxIter); iter+=16) {
-#else
-  for (iter = 0; stay && (iter < maxIter); iter+=16) {
-#endif
-    x = savx;
-    y = savy;
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =  fma(2.0f*x,y,y0);
-    x =  fma(-y,y, fma(tmp,tmp,x0));
-    y =  fma(2.0f*tmp,y,y0);
-
-    stay = (x*x+y*y) <= 4.0;
-    savx = (stay ? x : savx);
-    savy = (stay ? y : savy);
-    ccount += stay*16;
-#ifdef FAST
-    if (!stay)
-      break;
-#endif
-  }
-  // Handle remainder
-  if (!stay) {
-    iter = 16;
-    do {
-      x = savx;
-      y = savy;
-      stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
-      tmp = x;
-      x =  fma(-y,y, fma(x,x,x0));
-      y =  fma(2.0f*tmp,y,y0);
-      ccount += stay;
-      iter--;
-      savx = (stay ? x : savx);
-      savy = (stay ? y : savy);
-    } while (stay && iter);
-  }
-
-
-  out[tid] = (uint)ccount;
-
-};
-
-
-template <typename T>
-__global__ void double_mad_kernel(uint *out, uint width, T xPos,  T yPos, T xStep, T yStep,
-                                   uint maxIter) {
-
-#pragma FP_CONTRACT ON
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % width;
-  int j = tid / width;
-  double x0 = (double)(xPos + xStep*i);
-  double y0 = (double)(yPos + yStep*j);
-
-  double x = x0;
-  double y = y0;
-
-  uint iter = 0;
-  double tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
-    tmp = x;
-    x = fma(-y,y,fma(x,x,x0));
-    y = fma(2.0f*tmp,y,y0);
-  }
-  out[tid] = iter;
-};
-
-
-template <typename T>
-__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
-                  T yPos, T xStep, T yStep, uint maxIter) {
-
-#pragma FP_CONTRACT ON
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-
-  int i = tid % width;
-  int j = tid / width;
-  double x0 = (double)(xPos + xStep*(double)i);
-  double y0 = (double)(yPos + yStep*(double)j);
-
-  double x = x0;
-  double y = y0;
-
-#define FAST
-  uint iter = 0;
-  double tmp;
-  int stay;
-  int ccount = 0;
-  stay = (x*x+y*y) <= 4.0;
-  double savx = x;
-  double savy = y;
-#ifdef FAST
-  for (iter = 0; (iter < maxIter); iter+=16)
-#else
-  for (iter = 0; stay && (iter < maxIter); iter+=16)
-#endif
-  {
-    x = savx;
-    y = savy;
-
-    // Two iterations
-    tmp = fma(-y,y, fma(x,x,x0));
-    y =   fma(2.0f*x,y,y0);
-    x =   fma(-y,y, fma(tmp,tmp,x0));
-    y =   fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp = fma(-y,y, fma(x,x,x0));
-    y =   fma(2.0f*x,y,y0);
-    x =   fma(-y,y, fma(tmp,tmp,x0));
-    y =   fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp = fma(-y,y, fma(x,x,x0));
-    y =   fma(2.0f*x,y,y0);
-    x =   fma(-y,y, fma(tmp,tmp,x0));
-    y =   fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    // Two iterations
-    tmp =  fma(-y,y, fma(x,x,x0));
-    y =    fma(2.0f*x,y,y0);
-    x =    fma(-y,y, fma(tmp,tmp,x0));
-    y =    fma(2.0f*tmp,y,y0);
-
-    stay = (x*x+y*y) <= 4.0;
-    savx = (stay ? x : savx);
-    savy = (stay ? y : savy);
-    ccount += stay*16;
-#ifdef FAST
-    if (!stay)
-      break;
-#endif
-    }
-  // Handle remainder
-    if (!stay) {
-      iter = 16;
-      do {
-        x = savx;
-        y = savy;
-        stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
-        tmp = x;
-        x =  fma(-y,y, fma(x,x,x0));
-        y =  fma(2.0f*tmp,y,y0);
-        ccount += stay;
-        iter--;
-        savx = (stay ? x : savx);
-        savy = (stay ? y : savy);
-      }
-      while (stay && iter);
-
-    }
-    out[tid] = (uint)ccount;
-};
-
-static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15;
-
-// Expected results for each kernel run at each coord
-unsigned long long expectedIters[] = {
-    203277748ull,  2147483648ull, 120254651ull,  203277748ull,  2147483648ull,
-    120254651ull,  203277748ull,  2147483648ull, 120254651ull,  203315114ull,
-    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull,
-    203280620ull,  2147483648ull, 120485704ull,  203280620ull,  2147483648ull,
-    120485704ull,  203280620ull,  2147483648ull, 120485704ull,  203315114ull,
-    2147483648ull, 120042599ull,  203315114ull,  2147483648ull, 120042599ull};
-
-class hipPerfMandelBrot {
-  public:
-  hipPerfMandelBrot();
-  ~hipPerfMandelBrot();
-
-  void setNumKernels(unsigned int num) {
-    numKernels = num;
-  }
-
-  unsigned int getNumKernels() {
-    return numKernels;
-  }
-
-  void setNumStreams(unsigned int num) {
-    numStreams = num;
-  }
-  unsigned int getNumStreams() {
-    return numStreams;
-  }
-
-  void open(int deviceID);
-  void run(unsigned int testCase, unsigned int deviceId);
-  void printResults(void);
-
-  // array of funtion pointers
-  typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos,  float yPos,
-                 float xStep, float yStep, uint maxIter,  hipStream_t* streams, int blocks,
-                 int threads_per_block, int kernelCnt);
-
-  // Wrappers
-  void float_mad(uint *out, uint width, float xPos,  float yPos,
-                  float xStep, float yStep, uint maxIter, hipStream_t* streams,
-                  int blocks, int threads_per_block, int kernelCnt);
-
-  void float_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
-                            float xStep, float yStep, uint maxIter, hipStream_t* streams,
-                            int blocks, int threads_per_block, int kernelCnt);
-
-  void double_mad(uint *out, uint width, float xPos,  float yPos, float xStep,
-                   float yStep, uint maxIter, hipStream_t* streams, int blocks,
-                   int threads_per_block, int kernelCnt);
-
-  void double_mandel_unroll(uint *out, uint width, float xPos,  float yPos, float xStep,
-                             float yStep, uint maxIter, hipStream_t* streams, int blocks,
-                             int threads_per_block, int kernelCnt);
-
-  hipStream_t streams[2];
-
-  private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
-
-  unsigned int numKernels;
-  unsigned int numStreams;
-
-  std::map<std::string, std::vector<double>> results;
-  unsigned int width_;
-  unsigned int bufSize;
-  unsigned int maxIter;
-  unsigned int coordIdx;
-  volatile unsigned long long totalIters = 0;
-  int numCUs;
-  static const unsigned int numLoops = 10;
-};
-
-
-hipPerfMandelBrot::hipPerfMandelBrot() {}
-
-hipPerfMandelBrot::~hipPerfMandelBrot() {}
-
-void hipPerfMandelBrot::open(int deviceId) {
-
-
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  if (nGpu < 1) {
-  std::cout << "info: didn't find any GPU! skipping the test!\n";
-  passed();
-  return;
-  }
-
-
-  HIPCHECK(hipSetDevice(deviceId));
-  hipDeviceProp_t props = {0};
-  HIPCHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-    << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
-    << std::endl;
-
-  numCUs = props.multiProcessorCount;
-}
-
-
-void hipPerfMandelBrot::printResults() {
-
-  int numkernels = getNumKernels();
-  int numStreams = getNumStreams();
-
-  std::cout << "\n" <<"Measured perf for kernels in GFLOPS on "
-            << numStreams << " streams (s)" <<  std::endl;
-
-  std::map<std::string, std::vector<double>>:: iterator itr;
-  for (itr = results.begin(); itr != results.end(); itr++) {
-          std::cout << "\n" << std::setw(20) << itr->first << " ";
-          for(auto i : results[itr->first]) {
-            std::cout << std::setw(10) << i << " ";
-            }
-     }
-  results.clear();
-
-  std::cout << std::endl;
-}
-
-
-// Wrappers for the kernel launches
-void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos,  float yPos, float xStep,
-                                   float yStep, uint maxIter, hipStream_t* streams,
-                                   int blocks, int threads_per_block, int kernelCnt) {
-
-  int streamCnt = getNumStreams();
-  hipLaunchKernelGGL(float_mad_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
-                      streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep,
-                      maxIter);
-
-
-}
-
-
-void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
-                             float xStep, float yStep, uint maxIter, hipStream_t * streams,
-                             int blocks, int threads_per_block, int kernelCnt) {
-
-  int streamCnt = getNumStreams();
-  hipLaunchKernelGGL(float_mandel_unroll_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
-                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
-
-}
-
-
-void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos,  float yPos,
-                               float xStep, float yStep, uint maxIter, hipStream_t * streams,
-                               int blocks, int threads_per_block, int kernelCnt) {
-
-  int streamCnt = getNumStreams();
-  hipLaunchKernelGGL(double_mad_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
-                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
-
-}
-
-
-void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos,  float yPos,
-                              float xStep, float yStep, uint maxIter, hipStream_t * streams,
-                              int blocks, int threads_per_block, int kernelCnt) {
-
-  int streamCnt = getNumStreams();
-  hipLaunchKernelGGL(float_mandel_unroll_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
-                  streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
-
-}
-
-
-void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
-
-  unsigned int numStreams = getNumStreams();
-
-  funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
-               &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
-
-  // Maximum iteration count
-  maxIter = 32768;
-
-  uint * hPtr[numKernels];
-  uint * dPtr[numKernels];
-
-  // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
-  width_ = 256;
-
-  bufSize = width_  * width_ * sizeof(uint);
-
-  // Create streams for concurrency
-  for (uint i = 0; i < numStreams; i++) {
-    HIPCHECK(hipStreamCreate(&streams[i]));
-  }
-
-
-  // Allocate memory on the host and device
-  for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
-    setData(hPtr[i], 0xdeadbeef);
-    HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
-  }
-
-
-  // Prepare kernel launch parameters
-  int threads = (bufSize/sizeof(uint));
-  int threads_per_block  = 64;
-  int blocks = (threads/threads_per_block) + (threads % threads_per_block);
-
-  float xStep = (float)(coords[coordIdx].width / (double)width_);
-  float yStep = (float)(-coords[coordIdx].width / (double)width_);
-  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
-  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
-
-  // Copy memory asynchronously and concurrently from host to device
-  for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
-  }
-
-  // Synchronize to make sure all the copies are completed
-  HIPCHECK(hipStreamSynchronize(0));
-
-  int kernelIdx;
-  if(testCase == 0 || testCase == 5 || testCase == 10) {
-    kernelIdx = 0;
-  }
-
-  else if(testCase == 1 || testCase == 6 || testCase == 11) {
-    kernelIdx = 1;
-  }
-  else if(testCase == 2 || testCase == 7 || testCase == 12) {
-    kernelIdx = 2;
-  }
-  else if(testCase == 3 || testCase == 8 || testCase == 13){
-    kernelIdx = 3;
-  }
-
-
-  double totalTime = 0.0;
-
-  for (unsigned int k = 0; k < numLoops; k++) {
-
-  coordIdx = testCase % numCoords;
-
-  if ((testCase == 0 || testCase == 1 || testCase == 2 ||
-                  testCase == 5 || testCase == 6 || testCase == 7 ||
-                  testCase == 10 || testCase == 11 || testCase == 12)) {
-  float xStep = (float)(coords[coordIdx].width / (double)width_);
-  float yStep = (float)(-coords[coordIdx].width / (double)width_);
-  float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
-  float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-
-  for (uint i = 0; i < numKernels; i++) {
-    (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
-                           threads_per_block, i);
-  }
-
-
-  // Synchronize all the concurrent streams to have completed execution
-  HIPCHECK(hipStreamSynchronize(0));
-
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-  totalTime += all_kernel_time.count();
-
-  }
-
-
-  else {
-  double xStep = coords[coordIdx].width / (double)width_;
-  double yStep = -coords[coordIdx].width / (double)width_;
-  double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
-  double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-
-  for (uint i = 0; i < numKernels; i++) {
-  (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
-                         threads_per_block, i);
-  }
-
-
-  // Synchronize all the concurrent streams to have completed execution
-  HIPCHECK(hipStreamSynchronize(0));
-
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-  totalTime += all_kernel_time.count();
-  }
-
-
-  }
-
-  // Copy data back from device to the host
-  for(uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost));
-  }
-
-
-  for(uint i = 0; i < numKernels; i++) {
-  checkData(hPtr[i]);
-
-  int j =0;
-  while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
-          j++;
-  }
-
-  if(j==30) {
-    std::cout << "Incorrect iteration count detected. ";
-  }
-
-  }
-
-
-  // Compute GFLOPS.  There are 7 FLOPs per iteration
-  double perf = ((double)(totalIters*numKernels) * 7 * (double)(1e-09)) /
-                (totalTime / (double)numLoops);
-
-
-  std::vector<std::string> kernelName = {"float", "float_unroll",
-                      "double", "double_unroll"};
-
-  // Print results except for Warm-up kernel
-  if(testCase!=100) {
-  results[kernelName[testCase % 4]].push_back(perf);
- }
-
-
-  for(uint i = 0 ; i < numStreams; i++) {
-    HIPCHECK(hipStreamDestroy(streams[i]));
-  }
-
-
-  // Free host and device memory
-  for (uint i = 0; i < numKernels; i++) {
-    HIPCHECK(hipFree(hPtr[i]));
-    HIPCHECK(hipFree(dPtr[i]));
-  }
-
-
-}
-
-
-void hipPerfMandelBrot::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
-  for (unsigned int i = 0; i < width_ * width_; i++) {
-      ptr2[i] = value;
-  }
-}
-
-
-void hipPerfMandelBrot::checkData(uint *ptr) {
-  totalIters = 0;
-  for (unsigned int i = 0; i < width_ * width_; i++) {
-    totalIters += ptr[i];
-  }
-}
-
-
-int main(int argc, char* argv[]) {
-  hipPerfMandelBrot mandelbrotCompute;
-  int deviceId = 0;
-
-  mandelbrotCompute.open(deviceId);
-
-  for (unsigned int testCase = 0; testCase < 3; testCase++) {
-
-
-  switch (testCase) {
-
-
-  case 0: {
-    // Warmup-kernel - default stream executes serially
-    mandelbrotCompute.setNumStreams(1);
-    mandelbrotCompute.setNumKernels(1);
-    mandelbrotCompute.run(100/*Random number*/, deviceId);
-    break;
-    }
-
-
-  case 1: {
-    // run all - sync
-    int i = 0;
-    do {
-    mandelbrotCompute.setNumStreams(1);
-    mandelbrotCompute.setNumKernels(1);
-    mandelbrotCompute.run(i, deviceId);
-    i++;
-    }while(i < 12);
-    mandelbrotCompute.printResults();
-
-    break;
-  }
-
-
-  case 2: {
-    // run all - async
-    int i = 0;
-    do {
-    mandelbrotCompute.setNumStreams(2);
-    mandelbrotCompute.setNumKernels(2);
-    mandelbrotCompute.run(i, deviceId);
-    i++;
-    }while(i < 12);
-    mandelbrotCompute.printResults();
-
-    break;
-
-  }
-
-
-  default: {
-    break;
-  }
-
-
-  }
-
-
-
-  }
-
-
-  passed();
-}
diff --git a/tests/performance/stream/hipPerfDeviceConcurrency.cpp b/tests/performance/stream/hipPerfDeviceConcurrency.cpp
deleted file mode 100644
index 7d6699a9a2..0000000000
--- a/tests/performance/stream/hipPerfDeviceConcurrency.cpp
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
- Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
- * TEST: %t
- * HIT_END
- */
-
-#include <iostream>
-#include <chrono>
-#include "test_common.h"
-
-typedef struct {
-  double x;
-  double y;
-  double width;
-} coordRec;
-
-static coordRec coords[] = {
-    {0.0, 0.0, 0.00001},         // All black
-};
-
-static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
-
-__global__ void mandelbrot(uint *out, uint width, float xPos,  float yPos, float xStep,
-                            float yStep, uint maxIter) {
-
-  int tid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int i = tid % width;
-  int j = tid / width;
-  float x0 = (float)(xPos + xStep*i);
-  float y0 = (float)(yPos + yStep*j);
-
-  float x = x0;
-  float y = y0;
-
-  uint iter = 0;
-  float tmp;
-  for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
-    tmp = x;
-    x = fma(-y,y,fma(x,x,x0));
-    y = fma(2.0f*tmp,y,y0);
-  }
-
-  out[tid] = iter;
-};
-
-class hipPerfDeviceConcurrency {
-  public:
-  hipPerfDeviceConcurrency();
-  ~hipPerfDeviceConcurrency();
-
-  void setNumGpus(unsigned int num) {
-    numDevices = num;
-  }
-  unsigned int getNumGpus() {
-    return numDevices;
-  }
-
-  void open(void);
-  void close(void);
-  void run(unsigned int testCase, int numGpus);
-
-  private:
-  void setData(void *ptr, unsigned int value);
-  void checkData(uint *ptr);
-
-  unsigned int numDevices;
-  unsigned int width_;
-  unsigned int bufSize;
-  unsigned int coordIdx;
-  unsigned long long totalIters = 0;
-};
-
-
-hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() {}
-
-hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() {}
-
-void hipPerfDeviceConcurrency::open(void) {
-
-
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  setNumGpus(nGpu);
-  if (nGpu < 1) {
-  std::cout << "info: didn't find any GPU! skipping the test!\n";
-  passed();
-  }
-
-
-}
-
-
-void hipPerfDeviceConcurrency::close() {
-}
-
-void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
-
-
-  static int deviceId;
-  uint * hPtr[numGpus];
-  uint * dPtr[numGpus];
-  hipStream_t streams[numGpus];
-  int numCUs[numGpus];
-  unsigned int maxIter[numGpus];
-  unsigned long long expectedIters[numGpus];
-
-  int threads, threads_per_block, blocks;
-  float xStep, yStep, xPos, yPos;
-
-  for(int i = 0; i < numGpus; i++) {
-
-  if(testCase != 0) {
-    deviceId = i;
-  }
-
-  HIPCHECK(hipSetDevice(deviceId));
-
-  hipDeviceProp_t props = {0};
-  HIPCHECK(hipGetDeviceProperties(&props, i));
-
-  if (testCase != 0) {
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-            << " with " << props.multiProcessorCount << " CUs" << " and device ID: "
-            << i << std::endl;
-  }
-
-  numCUs[i] = props.multiProcessorCount;
-  int clkFrequency = 0;
-  HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i));
-
-  clkFrequency =(unsigned int)clkFrequency/1000;
-
-  // Maximum iteration count
-  // maxIter = 8388608 * (engine_clock / 1000).serial execution
-  maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
-  maxIter[i] = (maxIter[i] + 15) & ~15;
-
-  // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
-  width_ = 256;
-
-  bufSize = width_ * width_ * sizeof(uint);
-
-  // Create streams for concurrency
-  HIPCHECK(hipStreamCreate(&streams[i]));
-
-  // Allocate memory on the host and device
-  HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
-  setData(hPtr[i], 0xdeadbeef);
-  HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
-
-  // Prepare kernel launch parameters
-  threads = (bufSize/sizeof(uint));
-  threads_per_block  = 64;
-  blocks = (threads/threads_per_block) + (threads % threads_per_block);
-
-  coordIdx = testCase % numCoords;
-  xStep = (float)(coords[coordIdx].width / (double)width_);
-  yStep = (float)(-coords[coordIdx].width / (double)width_);
-  xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
-  yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
-
-  // Copy memory from host to device
-  HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
-
-  }
-
-  // Time the kernel execution
-  auto all_start = std::chrono::steady_clock::now();
-
-  for(int i = 0; i < numGpus; i++) {
-
-  if(testCase != 0) {
-    deviceId = i;
-  }
-
-  HIPCHECK(hipSetDevice(deviceId));
-
-  hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i],
-                      dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]);
-
-  }
-
-  for(int i = 0; i < numGpus; i++) {
-    HIPCHECK(hipStreamSynchronize(0));
-  }
-
-
-  auto all_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> all_kernel_time = all_end - all_start;
-
-  for(int i = 0; i < numGpus; i++) {
-
-  if(testCase != 0) {
-    deviceId = i;
-  }
-  HIPCHECK(hipSetDevice(deviceId));
-
-  // Copy data back from device to the host
-  HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
-
-  checkData(hPtr[i]);
-  expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
-
-  if (testCase != 0) {
-    checkData(hPtr[i]);
-    if(totalIters != expectedIters[i]) {
-      std::cout << "Incorrect iteration count detected" << std::endl;
-    }
-  }
-
-
-  HIPCHECK(hipStreamDestroy(streams[i]));
-
-  // Free host and device memory
-  HIPCHECK(hipFree(hPtr[i]));
-  HIPCHECK(hipFree(dPtr[i]));
-  }
-
-  if (testCase != 0) {
-  std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): "
-            << all_kernel_time.count() << " (s) " << '\n' << std::endl;
-  }
-
-  if(testCase == 0) {
-    deviceId++;
-  }
-
-
-}
-
-
-void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
-  unsigned int *ptr2 = (unsigned int *)ptr;
-  for (unsigned int i = 0; i < width_ * width_ ; i++) {
-      ptr2[i] = value;
-  }
-}
-
-
-void hipPerfDeviceConcurrency::checkData(uint *ptr) {
-  totalIters = 0;
-  for (unsigned int i = 0; i < width_ * width_; i++) {
-    totalIters += ptr[i];
-  }
-}
-
-
-int main(int argc, char* argv[]) {
-  hipPerfDeviceConcurrency deviceConcurrency;
-
-  deviceConcurrency.open();
-
-  int nGpu = deviceConcurrency.getNumGpus();
-
-  // testCase = 0 refers to warmup kernel run
-  int testCase = 0;
-
-  for (int i = 0; i < nGpu; i++) {
-    // Warm-up kernel on all devices
-    deviceConcurrency.run(testCase, 1);
-  }
-
-  // Time for kernel on 1 device
-  deviceConcurrency.run(++testCase, 1);
-
-  // Time for kernel on all available devices
-  deviceConcurrency.run(++testCase, nGpu);
-
-  passed();
-}
diff --git a/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupType.cpp b/tests/src/cg/hipCGGridGroupType.cpp
old mode 100755
new mode 100644
similarity index 97%
rename from tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupType.cpp
rename to tests/src/cg/hipCGGridGroupType.cpp
index 79f1cb1c38..db45c10512
--- a/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupType.cpp
+++ b/tests/src/cg/hipCGGridGroupType.cpp
@@ -22,7 +22,7 @@ THE SOFTWARE.
 
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
+ * BUILD: %t %s ../test_common.cpp
  * TEST: %t
  * HIT_END
  */
@@ -139,11 +139,7 @@ int main()
 
   if (!deviceProperties.cooperativeLaunch) {
     std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
-    if (hip_skip_tests_enabled()) {
-      return hip_skip_retcode();
-    } else {
-      passed();
-    }
+    passed();
     return 0;
   }
 
diff --git a/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupTypeViaBaseType.cpp b/tests/src/cg/hipCGGridGroupTypeViaBaseType.cpp
old mode 100755
new mode 100644
similarity index 97%
rename from tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupTypeViaBaseType.cpp
rename to tests/src/cg/hipCGGridGroupTypeViaBaseType.cpp
index 7407f266dd..11562dfff6
--- a/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupTypeViaBaseType.cpp
+++ b/tests/src/cg/hipCGGridGroupTypeViaBaseType.cpp
@@ -22,7 +22,7 @@ THE SOFTWARE.
 
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
+ * BUILD: %t %s ../test_common.cpp
  * TEST: %t
  * HIT_END
  */
@@ -139,11 +139,7 @@ int main()
 
   if (!deviceProperties.cooperativeLaunch) {
     std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
-    if (hip_skip_tests_enabled()) {
-      return hip_skip_retcode();
-    } else {
-      passed();
-    }
+    passed();
     return 0;
   }
 
diff --git a/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupTypeViaPublicApi.cpp b/tests/src/cg/hipCGGridGroupTypeViaPublicApi.cpp
old mode 100755
new mode 100644
similarity index 97%
rename from tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupTypeViaPublicApi.cpp
rename to tests/src/cg/hipCGGridGroupTypeViaPublicApi.cpp
index cb9d8d7c53..21f0348aec
--- a/tests/src/runtimeApi/cooperativeGrps/hipCGGridGroupTypeViaPublicApi.cpp
+++ b/tests/src/cg/hipCGGridGroupTypeViaPublicApi.cpp
@@ -22,7 +22,7 @@ THE SOFTWARE.
 
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
+ * BUILD: %t %s ../test_common.cpp
  * TEST: %t
  * HIT_END
  */
@@ -139,11 +139,7 @@ int main()
 
   if (!deviceProperties.cooperativeLaunch) {
     std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
-    if (hip_skip_tests_enabled()) {
-      return hip_skip_retcode();
-    } else {
-      passed();
-    }
+    passed();
     return 0;
   }
 
diff --git a/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupType.cpp b/tests/src/cg/hipCGMultiGridGroupType.cpp
old mode 100755
new mode 100644
similarity index 92%
rename from tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupType.cpp
rename to tests/src/cg/hipCGMultiGridGroupType.cpp
index 02be0a521b..5a0529867a
--- a/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupType.cpp
+++ b/tests/src/cg/hipCGMultiGridGroupType.cpp
@@ -22,7 +22,7 @@ THE SOFTWARE.
 
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
+ * BUILD: %t %s ../test_common.cpp
  * TEST: %t
  * HIT_END
  */
@@ -34,8 +34,6 @@ THE SOFTWARE.
 #include <climits>
 
 #define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
-#define ASSERT_LE(lhs, rhs) assert(lhs <= rhs)
-#define ASSERT_GE(lhs, rhs) assert(lhs >= rhs)
 
 using namespace cooperative_groups;
 
@@ -195,27 +193,15 @@ static void test_cg_multi_grid_group_type(int blockSize)
   }
 
   // Validate results
-  int gridsSeen[MaxGPUs];
   for (int i = 0; i < nGpu; ++i) {
     for (int j = 0; j < 2 * blockSize; ++j) {
-      ASSERT_EQUAL(numGridsTestH[i][j], nGpu);
-      ASSERT_GE(gridRankTestH[i][j], 0);
-      ASSERT_LE(gridRankTestH[i][j], nGpu-1);
-      ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
+      //ASSERT_EQUAL(numGridsTestH[i][j], nGpu);
+      //ASSERT_EQUAL(gridRankTestH[i][j], i);
       ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
-      int gridRank = gridRankTestH[i][j];
-      ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
+      ASSERT_EQUAL(thdRankTestH[i][j], (i * 2 * blockSize) + j);
       ASSERT_EQUAL(isValidTestH[i][j], 1);
     }
     ASSERT_EQUAL(syncResultD[i+1],  2 * blockSize);
-
-    // Validate uniqueness property of grid rank
-    gridsSeen[i] = gridRankTestH[i][0];
-    for (int k = 0; k < i; ++k) {
-      if (gridsSeen[k] == gridsSeen[i]) {
-        assert (false && "Grid rank in multi-gpu setup should be unique");
-      }
-    }
   }
   ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
 
diff --git a/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupTypeViaBaseType.cpp b/tests/src/cg/hipCGMultiGridGroupTypeViaBaseType.cpp
similarity index 83%
rename from tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupTypeViaBaseType.cpp
rename to tests/src/cg/hipCGMultiGridGroupTypeViaBaseType.cpp
index 0830e807c3..dae72f4cf8 100644
--- a/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupTypeViaBaseType.cpp
+++ b/tests/src/cg/hipCGMultiGridGroupTypeViaBaseType.cpp
@@ -22,7 +22,7 @@ THE SOFTWARE.
 
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
+ * BUILD: %t %s ../test_common.cpp
  * TEST: %t
  * HIT_END
  */
@@ -34,14 +34,11 @@ THE SOFTWARE.
 #include <climits>
 
 #define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
-#define ASSERT_LE(lhs, rhs) assert(lhs <= rhs)
-#define ASSERT_GE(lhs, rhs) assert(lhs >= rhs)
 
 using namespace cooperative_groups;
 
 static __global__
 void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD,
-                                                   int* gridRankTestD,
                                                    int *thdRankTestD,
                                                    int *isValidTestD,
                                                    int *syncTestD,
@@ -54,7 +51,6 @@ void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD,
   sizeTestD[gIdx] = tg.size();
 
   // Test thread_rank
-  gridRankTestD[gIdx] = this_multi_grid().grid_rank();
   thdRankTestD[gIdx] = tg.thread_rank();
 
   // Test is_valid
@@ -114,7 +110,6 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
   // Allocate host and device memory
   int nBytes = sizeof(int) * 2 * blockSize;
   int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
-  int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
   int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
   int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
   int *syncTestD[MaxGPUs], *syncResultD;
@@ -122,13 +117,11 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
     ASSERT_EQUAL(hipSetDevice(i), hipSuccess);
 
     ASSERT_EQUAL(hipMalloc(&sizeTestD[i], nBytes), hipSuccess);
-    ASSERT_EQUAL(hipMalloc(&gridRankTestD[i], nBytes), hipSuccess);
     ASSERT_EQUAL(hipMalloc(&thdRankTestD[i], nBytes), hipSuccess);
     ASSERT_EQUAL(hipMalloc(&isValidTestD[i], nBytes), hipSuccess);
     ASSERT_EQUAL(hipMalloc(&syncTestD[i], nBytes), hipSuccess);
 
     ASSERT_EQUAL(hipHostMalloc(&sizeTestH[i], nBytes), hipSuccess);
-    ASSERT_EQUAL(hipHostMalloc(&gridRankTestH[i], nBytes), hipSuccess);
     ASSERT_EQUAL(hipHostMalloc(&thdRankTestH[i], nBytes), hipSuccess);
     ASSERT_EQUAL(hipHostMalloc(&isValidTestH[i], nBytes), hipSuccess);
 
@@ -142,18 +135,17 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
   }
 
   // Launch Kernel
-  constexpr int NumKernelArgs = 6;
+  constexpr int NumKernelArgs = 5;
   hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
   void* args[MaxGPUs * NumKernelArgs];
   for (int i = 0; i < nGpu; i++) {
     ASSERT_EQUAL(hipSetDevice(i), hipSuccess);
 
     args[i * NumKernelArgs    ] = &sizeTestD[i];
-    args[i * NumKernelArgs + 1] = &gridRankTestD[i];
-    args[i * NumKernelArgs + 2] = &thdRankTestD[i];
-    args[i * NumKernelArgs + 3] = &isValidTestD[i];
-    args[i * NumKernelArgs + 4] = &syncTestD[i];
-    args[i * NumKernelArgs + 5] = &syncResultD;
+    args[i * NumKernelArgs + 1] = &thdRankTestD[i];
+    args[i * NumKernelArgs + 2] = &isValidTestD[i];
+    args[i * NumKernelArgs + 3] = &syncTestD[i];
+    args[i * NumKernelArgs + 4] = &syncResultD;
 
     launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_base_type);
     launchParamsList[i].gridDim = 2;
@@ -172,8 +164,6 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
 
     ASSERT_EQUAL(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost),
                  hipSuccess);
-    ASSERT_EQUAL(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost),
-                 hipSuccess);
     ASSERT_EQUAL(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost),
                  hipSuccess);
     ASSERT_EQUAL(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost),
@@ -183,26 +173,13 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
   }
 
   // Validate results
-  int gridsSeen[MaxGPUs];
   for (int i = 0; i < nGpu; ++i) {
     for (int j = 0; j < 2 * blockSize; ++j) {
       ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
-      ASSERT_GE(gridRankTestH[i][j], 0);
-      ASSERT_LE(gridRankTestH[i][j], nGpu-1);
-      ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
-      int gridRank = gridRankTestH[i][j];
-      ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
+      ASSERT_EQUAL(thdRankTestH[i][j], (i * 2 * blockSize) + j);
       ASSERT_EQUAL(isValidTestH[i][j], 1);
     }
     ASSERT_EQUAL(syncResultD[i+1],  2 * blockSize);
-
-    // Validate uniqueness property of grid rank
-    gridsSeen[i] = gridRankTestH[i][0];
-    for (int k = 0; k < i; ++k) {
-      if (gridsSeen[k] == gridsSeen[i]) {
-        assert (false && "Grid rank in multi-gpu setup should be unique");
-      }
-    }
   }
   ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
 
@@ -212,7 +189,6 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
     ASSERT_EQUAL(hipSetDevice(i), hipSuccess);
 
     ASSERT_EQUAL(hipFree(sizeTestD[i]), hipSuccess);
-    ASSERT_EQUAL(hipFree(gridRankTestD[i]), hipSuccess);
     ASSERT_EQUAL(hipFree(thdRankTestD[i]), hipSuccess);
     ASSERT_EQUAL(hipFree(isValidTestD[i]), hipSuccess);
     ASSERT_EQUAL(hipFree(syncTestD[i]), hipSuccess);
@@ -221,7 +197,6 @@ static void test_cg_multi_grid_group_type_via_base_type(int blockSize)
       ASSERT_EQUAL(hipFree(syncResultD), hipSuccess);
 
     ASSERT_EQUAL(hipHostFree(sizeTestH[i]), hipSuccess);
-    ASSERT_EQUAL(hipHostFree(gridRankTestH[i]), hipSuccess);
     ASSERT_EQUAL(hipHostFree(thdRankTestH[i]), hipSuccess);
     ASSERT_EQUAL(hipHostFree(isValidTestH[i]), hipSuccess);
 
diff --git a/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupTypeViaPublicApi.cpp b/tests/src/cg/hipCGMultiGridGroupTypeViaPublicApi.cpp
similarity index 83%
rename from tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupTypeViaPublicApi.cpp
rename to tests/src/cg/hipCGMultiGridGroupTypeViaPublicApi.cpp
index 5975ffa068..2f2f378931 100644
--- a/tests/src/runtimeApi/cooperativeGrps/hipCGMultiGridGroupTypeViaPublicApi.cpp
+++ b/tests/src/cg/hipCGMultiGridGroupTypeViaPublicApi.cpp
@@ -22,7 +22,7 @@ THE SOFTWARE.
 
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
+ * BUILD: %t %s ../test_common.cpp
  * TEST: %t
  * HIT_END
  */
@@ -34,14 +34,11 @@ THE SOFTWARE.
 #include <climits>
 
 #define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
-#define ASSERT_LE(lhs, rhs) assert(lhs <= rhs)
-#define ASSERT_GE(lhs, rhs) assert(lhs >= rhs)
 
 using namespace cooperative_groups;
 
 static __global__
 void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD,
-                                                    int* gridRankTestD,
                                                     int *thdRankTestD,
                                                     int *isValidTestD,
                                                     int *syncTestD,
@@ -54,7 +51,6 @@ void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD,
   sizeTestD[gIdx] = group_size(mg);
 
   // Test thread_rank api
-  gridRankTestD[gIdx] = this_multi_grid().grid_rank();
   thdRankTestD[gIdx] = thread_rank(mg);
 
   // Test is_valid api
@@ -114,7 +110,6 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
   // Allocate host and device memory
   int nBytes = sizeof(int) * 2 * blockSize;
   int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
-  int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
   int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
   int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
   int *syncTestD[MaxGPUs], *syncResultD;
@@ -122,13 +117,11 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
     ASSERT_EQUAL(hipSetDevice(i), hipSuccess);
 
     ASSERT_EQUAL(hipMalloc(&sizeTestD[i], nBytes), hipSuccess);
-    ASSERT_EQUAL(hipMalloc(&gridRankTestD[i], nBytes), hipSuccess);
     ASSERT_EQUAL(hipMalloc(&thdRankTestD[i], nBytes), hipSuccess);
     ASSERT_EQUAL(hipMalloc(&isValidTestD[i], nBytes), hipSuccess);
     ASSERT_EQUAL(hipMalloc(&syncTestD[i], nBytes), hipSuccess);
 
     ASSERT_EQUAL(hipHostMalloc(&sizeTestH[i], nBytes), hipSuccess);
-    ASSERT_EQUAL(hipHostMalloc(&gridRankTestH[i], nBytes), hipSuccess);
     ASSERT_EQUAL(hipHostMalloc(&thdRankTestH[i], nBytes), hipSuccess);
     ASSERT_EQUAL(hipHostMalloc(&isValidTestH[i], nBytes), hipSuccess);
 
@@ -142,18 +135,17 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
   }
 
   // Launch Kernel
-  constexpr int NumKernelArgs = 6;
+  constexpr int NumKernelArgs = 5;
   hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
   void* args[MaxGPUs * NumKernelArgs];
   for (int i = 0; i < nGpu; i++) {
     ASSERT_EQUAL(hipSetDevice(i), hipSuccess);
 
     args[i * NumKernelArgs    ] = &sizeTestD[i];
-    args[i * NumKernelArgs + 1] = &gridRankTestD[i];
-    args[i * NumKernelArgs + 2] = &thdRankTestD[i];
-    args[i * NumKernelArgs + 3] = &isValidTestD[i];
-    args[i * NumKernelArgs + 4] = &syncTestD[i];
-    args[i * NumKernelArgs + 5] = &syncResultD;
+    args[i * NumKernelArgs + 1] = &thdRankTestD[i];
+    args[i * NumKernelArgs + 2] = &isValidTestD[i];
+    args[i * NumKernelArgs + 3] = &syncTestD[i];
+    args[i * NumKernelArgs + 4] = &syncResultD;
 
     launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_public_api);
     launchParamsList[i].gridDim = 2;
@@ -172,8 +164,6 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
 
     ASSERT_EQUAL(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost),
                  hipSuccess);
-    ASSERT_EQUAL(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost),
-                 hipSuccess);
     ASSERT_EQUAL(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost),
                  hipSuccess);
     ASSERT_EQUAL(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost),
@@ -183,26 +173,13 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
   }
 
   // Validate results
-  int gridsSeen[MaxGPUs];
   for (int i = 0; i < nGpu; ++i) {
     for (int j = 0; j < 2 * blockSize; ++j) {
       ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
-      ASSERT_GE(gridRankTestH[i][j], 0);
-      ASSERT_LE(gridRankTestH[i][j], nGpu-1);
-      ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
-      int gridRank = gridRankTestH[i][j];
-      ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
+      ASSERT_EQUAL(thdRankTestH[i][j], (i * 2 * blockSize) + j);
       ASSERT_EQUAL(isValidTestH[i][j], 1);
     }
     ASSERT_EQUAL(syncResultD[i+1],  2 * blockSize);
-
-    // Validate uniqueness property of grid rank
-    gridsSeen[i] = gridRankTestH[i][0];
-    for (int k = 0; k < i; ++k) {
-      if (gridsSeen[k] == gridsSeen[i]) {
-        assert (false && "Grid rank in multi-gpu setup should be unique");
-      }
-    }
   }
   ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
 
@@ -212,7 +189,6 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
     ASSERT_EQUAL(hipSetDevice(i), hipSuccess);
 
     ASSERT_EQUAL(hipFree(sizeTestD[i]), hipSuccess);
-    ASSERT_EQUAL(hipFree(gridRankTestD[i]), hipSuccess);
     ASSERT_EQUAL(hipFree(thdRankTestD[i]), hipSuccess);
     ASSERT_EQUAL(hipFree(isValidTestD[i]), hipSuccess);
     ASSERT_EQUAL(hipFree(syncTestD[i]), hipSuccess);
@@ -221,7 +197,6 @@ static void test_cg_multi_grid_group_type_via_public_api(int blockSize)
       ASSERT_EQUAL(hipFree(syncResultD), hipSuccess);
 
     ASSERT_EQUAL(hipHostFree(sizeTestH[i]), hipSuccess);
-    ASSERT_EQUAL(hipHostFree(gridRankTestH[i]), hipSuccess);
     ASSERT_EQUAL(hipHostFree(thdRankTestH[i]), hipSuccess);
     ASSERT_EQUAL(hipHostFree(isValidTestH[i]), hipSuccess);
 
diff --git a/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockType.cpp b/tests/src/cg/hipCGThreadBlockType.cpp
old mode 100755
new mode 100644
similarity index 95%
rename from tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockType.cpp
rename to tests/src/cg/hipCGThreadBlockType.cpp
index dccac38bf3..4e1de9e44a
--- a/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockType.cpp
+++ b/tests/src/cg/hipCGThreadBlockType.cpp
@@ -22,7 +22,7 @@ THE SOFTWARE.
 
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
+ * BUILD: %t %s ../test_common.cpp
  * TEST: %t
  * HIT_END
  */
@@ -166,16 +166,6 @@ int main()
   ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
   int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
 
-  if (!deviceProperties.cooperativeLaunch) {
-    std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
-    if (hip_skip_tests_enabled()) {
-      return hip_skip_retcode();
-    } else {
-      passed();
-    }
-    return 0;
-  }
-
   // Test block sizes which are powers of 2
   int i = 0;
   while (true) {
diff --git a/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockTypeViaBaseType.cpp b/tests/src/cg/hipCGThreadBlockTypeViaBaseType.cpp
old mode 100755
new mode 100644
similarity index 94%
rename from tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockTypeViaBaseType.cpp
rename to tests/src/cg/hipCGThreadBlockTypeViaBaseType.cpp
index b0a42782c0..d4c9402268
--- a/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockTypeViaBaseType.cpp
+++ b/tests/src/cg/hipCGThreadBlockTypeViaBaseType.cpp
@@ -22,7 +22,7 @@ THE SOFTWARE.
 
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
+ * BUILD: %t %s ../test_common.cpp
  * TEST: %t
  * HIT_END
  */
@@ -135,16 +135,6 @@ int main()
   ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
   int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
 
-  if (!deviceProperties.cooperativeLaunch) {
-    std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
-    if (hip_skip_tests_enabled()) {
-      return hip_skip_retcode();
-    } else {
-      passed();
-    }
-    return 0;
-  }
-
   // Test block sizes which are powers of 2
   int i = 0;
   while (true) {
diff --git a/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockTypeViaPublicApi.cpp b/tests/src/cg/hipCGThreadBlockTypeViaPublicApi.cpp
old mode 100755
new mode 100644
similarity index 94%
rename from tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockTypeViaPublicApi.cpp
rename to tests/src/cg/hipCGThreadBlockTypeViaPublicApi.cpp
index e4a6a6e330..d13e58b059
--- a/tests/src/runtimeApi/cooperativeGrps/hipCGThreadBlockTypeViaPublicApi.cpp
+++ b/tests/src/cg/hipCGThreadBlockTypeViaPublicApi.cpp
@@ -22,7 +22,7 @@ THE SOFTWARE.
 
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
+ * BUILD: %t %s ../test_common.cpp
  * TEST: %t
  * HIT_END
  */
@@ -135,16 +135,6 @@ int main()
   ASSERT_EQUAL(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);
   int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
 
-  if (!deviceProperties.cooperativeLaunch) {
-    std::cout << "info: Device doesn't support cooperative launch! skipping the test!\n";
-    if (hip_skip_tests_enabled()) {
-      return hip_skip_retcode();
-    } else {
-      passed();
-    }
-    return 0;
-  }
-
   // Test block sizes which are powers of 2
   int i = 0;
   while (true) {
diff --git a/tests/src/kernel/hipShflTests.cpp b/tests/src/kernel/hipShflTests.cpp
index 06b6a90b83..9b1cc73248 100644
--- a/tests/src/kernel/hipShflTests.cpp
+++ b/tests/src/kernel/hipShflTests.cpp
@@ -57,15 +57,6 @@ void matrixTransposeCPUReference(T* output, T* input, const unsigned int width)
     }
 }
 
-void getFactor(int& fact) { fact = 101; }
-void getFactor(unsigned int& fact) { fact = static_cast<unsigned int>(INT32_MAX)+1; }
-void getFactor(float& fact) { fact = 2.5; }
-void getFactor(double& fact) { fact = 2.5; }
-void getFactor(long& fact) { fact = 202; }
-void getFactor(unsigned long& fact) { fact = static_cast<unsigned long>(__LONG_MAX__)+1; }
-void getFactor(long long& fact) { fact = 303; }
-void getFactor(unsigned long long& fact) { fact = static_cast<unsigned long long>(__LONG_LONG_MAX__)+1; }
-
 template<typename T>
 void runTest() {
     T* Matrix;
@@ -86,10 +77,8 @@ void runTest() {
     cpuTransposeMatrix = (T*)malloc(NUM * sizeof(T));
 
     // initialize the input data
-    T factor;
-    getFactor(factor);
     for (i = 0; i < NUM; i++) {
-        Matrix[i] = (T)i + factor;
+        Matrix[i] = (T)i * 10l;
     }
 
     // allocate the memory on the device side
@@ -135,11 +124,7 @@ void runTest() {
 int main() {
     runTest<int>();
     runTest<float>();
-    runTest<double>();
     runTest<long>();
     runTest<long long>();
-    runTest<unsigned int>();
-    runTest<unsigned long>();
-    runTest<unsigned long long>();
     passed();
 }
diff --git a/tests/src/kernel/hipShflUpDownTest.cpp b/tests/src/kernel/hipShflUpDownTest.cpp
index cd3900aee5..553087ce45 100644
--- a/tests/src/kernel/hipShflUpDownTest.cpp
+++ b/tests/src/kernel/hipShflUpDownTest.cpp
@@ -47,31 +47,13 @@ __global__ void shflUpSum(T* a, int size) {
     a[threadIdx.x] = val;
 }
 
-template <typename T>
-__global__ void shflXorSum(T* a, int size) {
-  T val = a[threadIdx.x];
-  for (int i = size/2; i > 0; i /= 2)
-    val += __shfl_xor(val, i, size);
-  a[threadIdx.x] = val;
-}
-
-void getFactor(int& fact) { fact = 101; }
-void getFactor(unsigned int& fact) { fact = static_cast<unsigned int>(INT32_MAX)+1; }
-void getFactor(float& fact) { fact = 2.5; }
-void getFactor(double& fact) { fact = 2.5; }
-void getFactor(long& fact) { fact = 202; }
-void getFactor(unsigned long& fact) { fact = static_cast<unsigned long>(__LONG_MAX__)+1; }
-void getFactor(long long& fact) { fact = 303; }
-void getFactor(unsigned long long& fact) { fact = static_cast<unsigned long long>(__LONG_LONG_MAX__)+1; }
-
 template <typename T>
 void runTestShflUp() {
     const int size = 32;
     T a[size];
     T cpuSum = 0;
-    T factor; getFactor(factor);
     for (int i = 0; i < size; i++) {
-        a[i] = i + factor;
+        a[i] = i;
         cpuSum += a[i];
     }
     T* d_a;
@@ -91,9 +73,8 @@ void runTestShflDown() {
     const int size = 32;
     T a[size];
     T cpuSum = 0;
-    T factor; getFactor(factor);
     for (int i = 0; i < size; i++) {
-        a[i] = i + factor;
+        a[i] = i;
         cpuSum += a[i];
     }
     T* d_a;
@@ -103,58 +84,19 @@ void runTestShflDown() {
     hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault);
     if (a[0] != cpuSum) {
         hipFree(d_a);
-        failed("Shfl Down Sum did not match.");
-    }
-    hipFree(d_a);
-}
-
-template <typename T>
-void runTestShflXor() {
-    const int size = 32;
-    T a[size];
-    T cpuSum = 0;
-    T factor; getFactor(factor);
-    for (int i = 0; i < size; i++) {
-        a[i] = i + factor;
-        cpuSum += a[i];
-    }
-    T* d_a;
-    hipMalloc(&d_a, sizeof(T) * size);
-    hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault);
-    hipLaunchKernelGGL(shflXorSum<T>, 1, size, 0, 0, d_a, size);
-    hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault);
-    if (a[0] != cpuSum) {
-        hipFree(d_a);
-        failed("Shfl Xor Sum did not match.");
+        failed("Shfl Down Sum did not match.");
     }
     hipFree(d_a);
 }
 int main() {
     runTestShflUp<int>();
     runTestShflUp<float>();
-    runTestShflUp<double>();
     runTestShflUp<long>();
     runTestShflUp<long long>();
-    runTestShflUp<unsigned int>();
-    runTestShflUp<unsigned long>();
-    runTestShflUp<unsigned long long>();
 
     runTestShflDown<int>();
     runTestShflDown<float>();
-    runTestShflDown<double>();
     runTestShflDown<long>();
     runTestShflDown<long long>();
-    runTestShflDown<unsigned int>();
-    runTestShflDown<unsigned long>();
-    runTestShflDown<unsigned long long>();
-
-    runTestShflXor<int>();
-    runTestShflXor<float>();
-    runTestShflXor<double>();
-    runTestShflXor<long>();
-    runTestShflXor<long long>();
-    runTestShflXor<unsigned int>();
-    runTestShflXor<unsigned long>();
-    runTestShflXor<unsigned long long>();
     passed();
 }
diff --git a/tests/src/p2p/hipPeerToPeer_simple.cpp b/tests/src/p2p/hipPeerToPeer_simple.cpp
old mode 100755
new mode 100644
index 13779694e2..9f0982f353
--- a/tests/src/p2p/hipPeerToPeer_simple.cpp
+++ b/tests/src/p2p/hipPeerToPeer_simple.cpp
@@ -395,9 +395,6 @@ int main(int argc, char* argv[]) {
 
     if (gpuCount < 2) {
         printf("P2P application requires atleast 2 gpu devices\n");
-        if (hip_skip_tests_enabled()) {
-          return hip_skip_retcode();
-        }
     } else {
         if (p_tests & 0x100) {
             testPeerHostToDevice(false /*useAsyncCopy*/);
diff --git a/tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp b/tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp
deleted file mode 100644
index f073d7f72e..0000000000
--- a/tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp
+++ /dev/null
@@ -1,280 +0,0 @@
-/*
-  Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-*/
-// Test Description:
-/*The general idea of the application is to test how Cooperative Groups kernel
-launches work when launching too many warps to the target device. This test
-first queries the nominal warp size of the target device. It then walks through
-block sizes from 1 thread, 1 warp, 2 warps, ... `maximum_warps_in_a_block`. For
-each of these, it queries the maximum number of blocks that can fit in each SM.
-It then queries the number of SMs on the target device. This will yield a
-calculation for the maximum number of blocks that can be co-scheduled on this
-device.
-
-The Cooperative Groups API says that users should not launch more than this
-many warps (or blocks, etc.) to the target device. This test first tires to
-launch 2x as many blcoks, to confirm that the runtime prevents such a launch
-by returning a proper error value (`hipErrorCooperativeLaunchTooLarge`).
-
-It then ensures that trying to launch too large of a kernel invocation does
-not break the GPU by launching a kernel with exactly the maximum number of
-blocks.
-
-Finally, we run the same test for a block size that is larger than the maximum
-allowed by the device, to ensure that this case is properly detected by the
-runtime and that nothing breaks.*/
-
-
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-
-#include <hip/hip_runtime.h>
-#include <hip/hip_cooperative_groups.h>
-#include "test_common.h"
-
-
-static inline void hipCheckAndFail(hipError_t errval,
-        const char *file, int line) {
-  hipError_t last_err = hipGetLastError();
-  if (errval != hipSuccess) {
-    std::cerr << "hip error: " << hipGetErrorString(errval);
-    std::cerr << std::endl;
-    std::cerr << "    Location: " << file << ":" << line << std::endl;
-    failed("");
-  }
-  if (last_err != errval) {
-    std::cerr << "Error: the return value of a function was not the same ";
-    std::cerr << "as the value returned by hipGetLastError()" << std::endl;
-    std::cerr << "    Location: " << file << ":" << line << std::endl;
-    std::cerr << "    Function returned: " << hipGetErrorString(errval);
-    std::cerr << " (" << errval << ")" << std::endl;
-    std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err);
-    std::cerr << " (" << last_err << ")" << std::endl;
-    failed("");
-  }
-}
-#define hipCheckErr(errval) \
-        do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)
-
-static inline bool hipCheckExpected(hipError_t errval,
-        hipError_t expected_err, const char *file, int line) {
-  hipError_t last_err = hipGetLastError();
-  if (errval != expected_err) {
-    std::cerr << "hip error: " << hipGetErrorString(errval);
-    std::cerr << std::endl;
-    std::cerr << "    Location: " << file << ":" << line << std::endl;
-    return false;
-  }
-  if (last_err != errval) {
-    std::cerr << "Error: the return value of a function was not the same ";
-    std::cerr << "as the value returned by hipGetLastError()" << std::endl;
-    std::cerr << "    Location: " << file << ":" << line << std::endl;
-    std::cerr << "    Function returned: " << hipGetErrorString(errval);
-    std::cerr << " (" << errval << ")" << std::endl;
-    std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err);
-    std::cerr << " (" << last_err << ")" << std::endl;
-    return false;
-  }
-  return true;
-}
-
-static bool cooperative_groups_support(int device_id) {
-  hipError_t err;
-  int cooperative_attribute;
-  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
-           hipDeviceAttributeCooperativeLaunch, device_id));
-  if (!cooperative_attribute) {
-    std::cerr << "Cooperative launch support not available in ";
-    std::cerr << "the device attribute for device " << device_id;
-    std::cerr << std::endl;
-    return false;
-  }
-  hipDeviceProp_t device_properties;
-  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
-  if (device_properties.cooperativeLaunch == 0) {
-    std::cerr << "Cooperative group support not available in ";
-    std::cerr << "device properties." << std::endl;
-    return false;
-  }
-  return true;
-}
-
-__global__ void test_kernel(long long *array) {
-  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
-  array[rank] += clock64();
-}
-
-int main(int argc, char** argv) {
-  hipError_t err;
-  int device_num, FailFlag = 0;
-  // Alocate the host input buffer, and two device-focused buffers that we
-  // will use for our test.
-  unsigned int *dev_array[2];
-  HIPCHECK(hipGetDeviceCount(&device_num));
-  for (int dev = 0; dev < device_num; ++dev) {
-    /*************************************************************************/
-    /* Test whether target device supports cooperative groups ****************/
-    HIPCHECK(hipSetDevice(dev));
-    if (!cooperative_groups_support(dev)) {
-      std::cout << "Skipping the test with Pass result.\n";
-      passed();
-    }
-
-    /*************************************************************************/
-    /* Create the streams we will use in this test. **************************/
-    hipStream_t streams[2];
-    for (int i = 0; i < 2; i++) {
-      HIPCHECK(hipStreamCreate(&streams[i]));
-    }
-
-    /*************************************************************************/
-    /* We will try to launch more waves than the GPU can fit. ***************/
-    hipDeviceProp_t device_properties;
-    HIPCHECK(hipGetDeviceProperties(&device_properties, dev));
-    int warp_size = device_properties.warpSize;
-    int num_sms = device_properties.multiProcessorCount;
-    int max_num_threads = device_properties.maxThreadsPerBlock;
-
-    // Check single-thread block, all numbers of warps, then too-large block
-    for (int block_size = 0; block_size <= (max_num_threads + warp_size);
-         block_size += warp_size) {
-      if (block_size == 0) {
-        block_size = 1;
-      }
-      int max_blocks_per_sm;
-      // Calculate the device occupancy to know how many blocks can be run.
-      HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-              &max_blocks_per_sm, test_kernel, block_size, 0,
-              hipOccupancyDefault));
-
-      if ((block_size > max_num_threads) && (max_blocks_per_sm != 0)) {
-        std::cerr << "ERROR! Occupancy API indicated that we can have >0 ";
-        std::cerr << "blocks in a kernel when the block size is too large ";
-        std::cerr << "to work on the device." << std::endl;
-        std::cerr << "This is incorrect, and could possibly lead users ";
-        std::cerr << "to try to launch kernels that will fail." << std::endl;
-        //failed("");
-        FailFlag = 1;
-        break;
-      }
-
-      int desired_blocks = max_blocks_per_sm * num_sms;
-      bool expect_fail = false;
-      if (desired_blocks == 0) {
-        desired_blocks = 1;
-        expect_fail = true;
-      }
-
-      /**********************************************************************/
-      /* Set up data to pass into the kernel ********************************/
-
-      for (int i = 0; i < 2; i++) {
-        int test_size;
-        // Case where we expect to fail at launch.
-        if (i == 0) {
-          test_size = 2 * desired_blocks;
-        } else {
-          test_size = desired_blocks;
-        }
-        HIPCHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]),
-                           test_size * block_size * sizeof(long long)));
-        HIPCHECK(hipMemsetAsync(dev_array[i], 0,
-                                test_size * block_size * sizeof(long long),
-                                streams[i]));
-      }
-
-      HIPCHECK(hipDeviceSynchronize());
-
-      /***********************************************************************/
-      /* Launch the kernels **************************************************/
-      void *coop_params[2][1];
-      for (int i = 0; i < 2; i++) {
-        coop_params[i][0] = reinterpret_cast<void*>(&dev_array[i]);
-      }
-
-      err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
-                                       2 * desired_blocks, block_size,
-                                       coop_params[0], 0, streams[0]);
-
-      hipError_t expect_to_see;
-      if (expect_fail) {
-        expect_to_see = hipErrorInvalidConfiguration;
-      } else {
-        expect_to_see = hipErrorCooperativeLaunchTooLarge;
-      }
-      if (!hipCheckExpected(err, expect_to_see, __FILE__, __LINE__)) {
-        std::cerr << "ERROR! Tried to launch a cooperative kernel with ";
-        std::cerr << "too many warps." << std::endl;
-        std::cerr << "This SHOULD have failed with the error ";
-        std::cerr << hipGetErrorString(expect_to_see);
-        std::cerr << " (" << expect_to_see << ")." << std::endl;
-        std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
-        std::cerr << " (" << err << ")" << std::endl;
-        FailFlag = 1;
-        break;
-      }
-
-      HIPCHECK(hipDeviceSynchronize());
-      err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
-                                       desired_blocks, block_size,
-                                       coop_params[1], 0, streams[1]);
-
-      if (expect_fail) {
-        expect_to_see = hipErrorInvalidConfiguration;
-      } else {
-        expect_to_see = hipSuccess;
-      }
-      if (!hipCheckExpected(err, expect_to_see, __FILE__, __LINE__)) {
-        std::cerr << "ERROR! Tried to launch a cooperative kernel ";
-        std::cerr << "with a normal size, but a block size of ";
-        std::cerr << desired_blocks << std::endl;
-        std::cerr << "This SHOULD have returned ";
-        std::cerr << hipGetErrorString(expect_to_see);
-        std::cerr << " (" << expect_to_see << ")." << std::endl;
-        std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
-        std::cerr << " (" << err << ")" << std::endl;
-        FailFlag = 1;
-        break;
-      }
-
-      HIPCHECK(hipDeviceSynchronize());
-
-      if (block_size == 1) {
-        block_size = 0;
-      }
-      for (int m = 0; m < 2; ++m) {
-        HIPCHECK(hipFree(dev_array[m]));
-      }
-    }
-    for (int m = 0; m < 2; ++m) {
-      HIPCHECK(hipStreamDestroy(streams[m]));
-    }
-    if (FailFlag == 1) {
-      for (int m = 0; m < 2; ++m) {
-        HIPCHECK(hipFree(dev_array[m]));
-      }
-      failed("");
-    }
-  }
-  passed();
-}
diff --git a/tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp b/tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp
deleted file mode 100644
index c9adc03b24..0000000000
--- a/tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
-Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
-IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-// Test Description:
-/*
-The general idea of the application is to test how Cooperative Groups kernel
-launches to a stream interact with other kernels being launched to different
-streams.
-
-For example: the HIP runtime will force cooperative kernel launches to run
-serially, even if they are launched to different streams. However,
-cooperative kernel launches can run in parallel with regular kernels that
-are launched to other streams. This limitation is so that the cooperative
-kernels do not conflict with one another for resources and potentially
-deadlock the system.
-
-As such, this benchmark tests three situations:
-
-  1. Launching a cooperative kernel by itself to stream[0]
-  2. Launching two cooperative kernels in parallel to stream[0] and stream[1]
-  3. Launching two cooperative kernels in parallel to stream[0] and stream[1]
-     and launching a third non-cooperative kernel to stream[2]
-
-We time how long it takes to run each of these benchmarks and print it as
-the output of the benchmark. The kernels themselves are just useless time-
-wasting code so that the kernel takes a meaningful amount of time on the
-GPU before it exits. We only launch a single wavefront for each kernel, so
-any serialization should not be because of GPU occupancy concerns.
-
-If test #2 takes roughly twice as long as #1, that implies that cooperative
-kernels are properly serialized with each other by the runtime.
-
-If test #3 takes the same amount of time as test #2, that implies that
-regular kernels can properly run in parallel with cooperative kernels.
-*/
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
- * TEST: %t
- * HIT_END
- */
-
-#include <chrono>
-#include <hip/hip_runtime.h>
-#include <hip/hip_cooperative_groups.h>
-#include "test_common.h"
-
-static inline void hipCheckAndFail(hipError_t errval,
-        const char *file, int line) {
-  hipError_t last_err = hipGetLastError();
-  if (errval != hipSuccess) {
-    std::cerr << "hip error: " << hipGetErrorString(errval);
-    std::cerr << std::endl;
-    std::cerr << "Location: " << file << ":" << line << std::endl;
-    failed("");
-  }
-  if (last_err != errval) {
-    std::cerr << "Error: the return value of a function was not the same ";
-    std::cerr << "as the value returned by hipGetLastError()" << std::endl;
-    std::cerr << "Location: " << file << ":" << line << std::endl;
-    std::cerr << "Function returned: " << hipGetErrorString(errval);
-    std::cerr << " (" << errval << ")" << std::endl;
-    std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err);
-    std::cerr << " (" << last_err << ")" << std::endl;
-    failed("");
-  }
-}
-#define hipCheckErr(errval) \
-  do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)
-
-static int cooperative_groups_support(int device_id) {
-  hipError_t err;
-  int cooperative_attribute;
-  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
-           hipDeviceAttributeCooperativeLaunch, device_id));
-  if (!cooperative_attribute) {
-    std::cerr << "Cooperative launch support not available in ";
-    std::cerr << "the device attribute for device " << device_id;
-    std::cerr << std::endl;
-    return 0;
-  }
-
-  hipDeviceProp_t device_properties;
-  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
-  if (device_properties.cooperativeLaunch == 0) {
-    std::cerr << "Cooperative group support not available in ";
-    std::cerr << "device properties." << std::endl;
-    return 0;
-  }
-  return 1;
-}
-
-__global__ void test_kernel(uint32_t loops, unsigned long long *array) {
-  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
-
-  for (int i = 0; i < loops; i++) {
-    long long start_clock = clock64();
-    while (clock64() < (start_clock+1000000)) {}
-    array[rank] += clock64();
-  }
-}
-
-int main(int argc, char** argv) {
-  hipError_t err;
-  /*************************************************************************/
-  int device_num = 0, loops = 1000, FailFlag = 0;
-  /* Create the streams we will use in this test. **************************/
-  hipStream_t streams[3];
-  // Alocate the host input buffer, and two device-focused buffers that we
-  // will use for our test.
-  unsigned long long *dev_array[3];
-  HIPCHECK(hipGetDeviceCount(&device_num));
-  for (int dev = 0; dev < device_num; ++dev) {
-    /*************************************************************************/
-    /* Test whether target device supports cooperative groups ****************/
-    HIPCHECK(hipSetDevice(dev));
-    if (!cooperative_groups_support(dev)) {
-      std::cout << "Skipping the test with Pass result.\n";
-      passed();
-    }
-
-    /*************************************************************************/
-    /* We will launch enough waves to fill up all of the GPU *****************/
-    hipDeviceProp_t device_properties;
-    HIPCHECK(hipGetDeviceProperties(&device_properties, dev));
-    int warp_size = device_properties.warpSize;
-    int num_sms = device_properties.multiProcessorCount;
-    int desired_blocks = 1;
-    std::cout << "Device: " << dev << std::endl;
-    std::cout << "Device name: " << device_properties.name << std::endl;
-
-    int max_blocks_per_sm;
-    // Calculate the device occupancy to know how many blocks can be run.
-    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
-                                                          test_kernel,
-                                                          warp_size, 0));
-
-    if (desired_blocks > max_blocks_per_sm * num_sms) {
-      std::cerr << "The requested number of blocks will not fit on the GPU";
-      std::cerr << std::endl;
-      std::cerr << "You requested " << desired_blocks << " but we can only ";
-      std::cerr << "fit " << (max_blocks_per_sm * num_sms) << std::endl;
-      failed("");
-    }
-
-    /*************************************************************************/
-    for (int i = 0; i < 3; i++) {
-      HIPCHECK(hipStreamCreate(&streams[i]));
-    }
-
-    /*************************************************************************/
-    /* Set up data to pass into the kernel ***********************************/
-
-    for (int i = 0; i < 3; i++) {
-      HIPCHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]),
-                         warp_size * sizeof(long long)));
-      HIPCHECK(hipMemsetAsync(dev_array[i], 0, warp_size * sizeof(long long),
-                              streams[i]));
-    }
-
-    HIPCHECK(hipDeviceSynchronize());
-
-    /*************************************************************************/
-    /* Launch the kernels ****************************************************/
-    void *coop_params[3][2];
-    for (int i = 0; i < 3; i++) {
-      coop_params[i][0] = reinterpret_cast<void*>(&loops);
-      coop_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
-    }
-
-    std::cout << "Launching a single cooperative kernel..." << std::endl;
-    auto single_start = std::chrono::system_clock::now();
-    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
-                                        desired_blocks, warp_size,
-                                        coop_params[0], 0, streams[0]));
-
-    HIPCHECK(hipDeviceSynchronize());
-    auto single_end = std::chrono::system_clock::now();
-    std::cout << "Launching 2 cooperative kernels to different streams...";
-    std::cout << std::endl;
-
-    auto double_start = std::chrono::system_clock::now();
-    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
-                                        desired_blocks, warp_size,
-                                        coop_params[0], 0, streams[0]));
-    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
-                                        desired_blocks, warp_size,
-                                        coop_params[1], 0, streams[1]));
-
-    HIPCHECK(hipDeviceSynchronize());
-    auto double_end = std::chrono::system_clock::now();
-    std::cout << "Launching 2 cooperative kernels and 1 normal kernel...";
-    std::cout << std::endl;
-
-    auto triple_start = std::chrono::system_clock::now();
-    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
-                                        desired_blocks, warp_size,
-                                        coop_params[0], 0, streams[0]));
-    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
-                                        desired_blocks, warp_size,
-                                        coop_params[1], 0, streams[1]));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size),
-                       0, streams[2], loops, dev_array[2]);
-    err = hipGetLastError();
-    hipCheckErr(err);
-
-    HIPCHECK(hipDeviceSynchronize());
-    auto triple_end = std::chrono::system_clock::now();
-    std::chrono::duration<double> single_kernel_time =
-                                  (single_end - single_start);
-    std::chrono::duration<double> double_kernel_time =
-                                  (double_end - double_start);
-    std::chrono::duration<double> triple_kernel_time =
-                                  (triple_end - triple_start);
-
-    std::cout << "A single kernel took:" << std::endl;
-    std::cout << "    " << single_kernel_time.count();
-    std::cout << " seconds" << std::endl;
-    std::cout << std::endl;
-    std::cout << "Two cooperative kernels that could run together took:";
-    std::cout << std::endl;
-    std::cout << "    " << double_kernel_time.count();
-    std::cout << " seconds" << std::endl;
-    std::cout << std::endl;
-    std::cout << "Two coop kernels and a third regular kernel took:";
-    std::cout << std::endl << "    ";
-    std::cout << triple_kernel_time.count();
-    std::cout << " seconds" << std::endl;
-
-    std::cout << "Testing whether these times make sense.." << std::endl;
-    // Test that two cooperative kernels is roughly twice as long as one
-    if (double_kernel_time < 1.8 * single_kernel_time) {
-      std::cerr << "ERROR!" << std::endl;
-      std::cerr << "Two cooperative kernels launched at the same ";
-      std::cerr << "time did not take roughly twice as long as a single ";
-      std::cerr << "cooperative kernel." << std::endl;
-      std::cerr << "Were they truly serialized?" << std::endl;
-      FailFlag = 1;
-      break;
-    }
-
-    // Test that the three kernels together took roughly as long as two
-    // cooperative kernels.
-    if (triple_kernel_time > 1.1 * double_kernel_time) {
-      std::cerr << "ERROR!" << std::endl;
-      std::cerr << "Launching a normal kernel in parallel with two ";
-      std::cerr << "back-to-back cooperative kernels still ended up taking ";
-      std::cerr << "more than 10% longer than the two cooperative kernels ";
-      std::cerr << "alone." << std::endl;
-      std::cerr << "Is the normal kernel being serialized with the ";
-      std::cerr << "cooperative kernels on different streams?" << std::endl;
-      FailFlag = 1;
-      break;
-    }
-    for (int k = 0; k < 3; ++k) {
-      HIPCHECK(hipFree(dev_array[k]));
-      HIPCHECK(hipStreamDestroy(streams[k]));
-    }
-  }
-  if (FailFlag == 1) {
-    for (int k = 0; k < 3; ++k) {
-      HIPCHECK(hipFree(dev_array[k]));
-      HIPCHECK(hipStreamDestroy(streams[k]));
-    }
-    failed("");
-  }
-  passed();
-}
diff --git a/tests/src/runtimeApi/cooperativeGrps/grid_group_data_sharing.cpp b/tests/src/runtimeApi/cooperativeGrps/grid_group_data_sharing.cpp
deleted file mode 100644
index 46ad7ea7a4..0000000000
--- a/tests/src/runtimeApi/cooperativeGrps/grid_group_data_sharing.cpp
+++ /dev/null
@@ -1,303 +0,0 @@
-/*
-Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
-IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-// Test Description:
-/*The general idea of the application is to create a buffer of width N. N is a
-command line parameter, and the user will need to make sure that we can fit
-two buffers of N unsigned integers onto the target GPU at the same time.
-
-We then launch a fixed number of warps to the GPU. This number is calculated
-to fill the GPU with as many warps as can simultaneously run on the GPU.
-The threads in these warps then walk over two arrays. First, values from
-A[offset] are added into B[offset]. After all of A is added into all of B
-in this element-wise manner, all of the waves barrier with one another.
-
-After the barrier, the waves start adding values from B[mirror_offset] into
-A[offset]. Mirror offset means that the wave that is writing into A[7] is
-reading from B[7 before the last value]. This was probably written by a
-different thread before the barrier.
-
-After going through this loop a certain number of times, the kernel ends and
-we read the arrays back out and recalculate this algorithm serially on the
-CPU. We compare the serial version to the version that has inter-thread data
-sharing and barriers and ensure they result in the same answer.
-
-If they do have the same answer, then we can pretty confidently say that
-writing from thread X and then hitting a barrier allows thread Y to see the
-values.*/
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp
- * TEST: %t
- * HIT_END
- */
-#include <hip/hip_runtime.h>
-#include <hip/hip_cooperative_groups.h>
-#include "test_common.h"
-
-static inline void hipCheckAndFail(hipError_t errval,
-                                   const char *file, int line) {
-  hipError_t last_err = hipGetLastError();
-  if (errval != hipSuccess) {
-    std::cerr << "hip error: " << hipGetErrorString(errval);
-    std::cerr << std::endl;
-    std::cerr << "    Location: " << file << ":" << line << std::endl;
-    exit(errval);
-  }
-  if (last_err != errval) {
-    std::cerr << "Error: the return value of a function was not the same ";
-    std::cerr << "as the value returned by hipGetLastError()" << std::endl;
-    std::cerr << "    Location: " << file << ":" << line << std::endl;
-    std::cerr << "    Function returned: " << hipGetErrorString(errval);
-    std::cerr << " (" << errval << ")" << std::endl;
-    std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err);
-    std::cerr << " (" << last_err << ")" << std::endl;
-    failed("");
-  }
-}
-#define hipCheckErr(errval)\
-        do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)
-
-static int cooperative_groups_support(int device_id) {
-  hipError_t err;
-
-  int cooperative_attribute;
-  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
-          hipDeviceAttributeCooperativeLaunch, device_id));
-  if (!cooperative_attribute) {
-    std::cerr << "Cooperative launch support not available in ";
-    std::cerr << "the device attribute for device " << device_id;
-    std::cerr << std::endl;
-    return 0;
-  }
-
-  hipDeviceProp_t device_properties;
-  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
-  if (device_properties.cooperativeLaunch == 0) {
-    std::cerr << "Cooperative group support not available in ";
-    std::cerr << "device properties." << std::endl;
-    return 0;
-  }
-  return 1;
-}
-
-static int verify_coop_arrays(unsigned int loops, unsigned int *host_input,
-                              unsigned int *first_array,
-                              unsigned int *second_array,
-                              unsigned int array_len) {
-  unsigned int *host_first_array = host_input;
-  unsigned int *host_second_array = (unsigned int*)calloc(array_len,
-                                                          sizeof(int));
-
-  for (int i = 0; i < loops; i++) {
-    for (int offset = 0; offset < array_len; offset++) {
-      host_second_array[offset] += host_first_array[offset];
-    }
-
-    for (int offset = 0; offset < array_len; offset++) {
-      unsigned int swizzle_offset = array_len - offset - 1;
-      host_first_array[offset] += host_second_array[swizzle_offset];
-    }
-  }
-
-  for (int i = 0; i < array_len; i++) {
-    if (host_first_array[i] != first_array[i]) {
-      std::cerr << "Test failure!" << std::endl;
-      std::cerr << "    host_first_array[" << i << "] contains the ";
-      std::cerr << "value " << host_first_array[i] << std::endl;
-      std::cerr << "    GPU first_array[" << i << "] contains the ";
-      std::cerr << "value " << first_array[i] << std::endl;
-      return -1;
-    }
-    if (host_second_array[i] != second_array[i]) {
-      std::cerr << "Test failure!" << std::endl;
-      std::cerr << "    host_second_array[" << i << "] contains the ";
-      std::cerr << "value " << host_second_array[i] << std::endl;
-      std::cerr << "    GPU second_array[" << i << "] contains the ";
-      std::cerr << "value " << second_array[i] << std::endl;
-      return -1;
-    }
-  }
-
-  std::cout << "Coop test appears to work properly!" << std::endl;
-  free(host_second_array);
-  return 0;
-}
-
-__global__ void
-coop_kernel(unsigned int *first_array, unsigned int *second_array,
-            unsigned int loops, unsigned int array_len) {
-  cooperative_groups::grid_group grid = cooperative_groups::this_grid();
-  unsigned int rank = grid.thread_rank();
-  unsigned int grid_size = grid.size();
-
-  for (int i = 0; i < loops; i++) {
-    // The goal of this loop is to directly add in values from
-    // array one into array two, on a per-wave basis.
-    for (int offset = rank; offset < array_len; offset += grid_size) {
-      second_array[offset] += first_array[offset];
-    }
-
-    grid.sync();
-
-    // The goal of this loop is to pull data the "mirror" lane in
-    // array two and add it back into array one. This causes inter-
-    // thread swizzling.
-    for (int offset = rank; offset < array_len; offset += grid_size) {
-      unsigned int swizzle_offset = array_len - offset - 1;
-      first_array[offset] += second_array[swizzle_offset];
-    }
-
-    grid.sync();
-  }
-}
-
-int main(int argc, char** argv) {
-  hipError_t err;
-  /*************************************************************************/
-  /* Parse the command line parameters *************************************/
-  // Arguments to pull out of the command line.
-  int device_num = 0, loops = 2, width = 4096, flag = 0;
-  HIPCHECK(hipGetDeviceCount(&device_num));
-  for (int dev = 0; dev < device_num; ++dev) {
-    std::cout << "Device number: " << dev << std::endl;
-    std::cout << "Loops: " << loops << std::endl;
-    std::cout << "Width: " << width << std::endl;
-
-    /*************************************************************************/
-    /* Test whether target device supports cooperative groups ****************/
-    HIPCHECK(hipSetDevice(dev));
-
-    if (!cooperative_groups_support(dev)) {
-      std::cout << "Skipping the test with Pass result.\n";
-      passed();
-    }
-
-    /*************************************************************************/
-    /* We will launch enough waves to fill up all of the GPU *****************/
-    hipDeviceProp_t device_properties;
-    HIPCHECK(hipGetDeviceProperties(&device_properties, dev));
-
-    int warp_size = device_properties.warpSize;
-    int num_sms = device_properties.multiProcessorCount;
-
-    std::cout << "Device name: " << device_properties.name << std::endl;
-    std::cout << std::endl;
-
-    // Calculate the device occupancy to know how many blocks can be run.
-    int max_blocks_per_sm;
-    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
-                                                          coop_kernel,
-                                                          warp_size, 0));
-
-    int total_blocks = max_blocks_per_sm * num_sms;
-
-    /*************************************************************************/
-    /* Create the streams we will use in this test. **************************/
-    hipStream_t streams[2];
-    for (int i = 0; i < 2; i++) {
-      HIPCHECK(hipStreamCreate(&streams[i]));
-    }
-
-    /*************************************************************************/
-    /* Set up data to pass into the kernel ***********************************/
-
-    // Alocate the host input buffer, and two device-focused buffers that we
-    // will use for our test.
-    unsigned int *input_buffer = (unsigned int*)calloc(width,
-                                                       sizeof(unsigned int));
-    for (int i = 0; i < width; i++) {
-      input_buffer[i] = i;
-    }
-
-    unsigned int *first_dev_array;
-    HIPCHECK(hipMalloc(reinterpret_cast<void**>(&first_dev_array),
-                       width * sizeof(unsigned int)));
-
-    HIPCHECK(hipMemcpyAsync(first_dev_array, input_buffer,
-                            width * sizeof(unsigned int),
-                            hipMemcpyHostToDevice, streams[0]));
-
-    unsigned int *second_dev_array;
-    HIPCHECK(hipMalloc(reinterpret_cast<void**>(&second_dev_array),
-                       width * sizeof(unsigned int)));
-    HIPCHECK(hipMemsetAsync(second_dev_array, 0, width * sizeof(unsigned int),
-                            streams[0]));
-
-    /*************************************************************************/
-    /* Launch the kernels ****************************************************/
-    std::cout << "Launching a cooperative kernel with " << total_blocks;
-    std::cout << " thread blocks, each with " << warp_size << " threads";
-    std::cout << std::endl;
-
-    void *coop_params[4];
-    coop_params[0] = reinterpret_cast<void*>(&first_dev_array);
-    coop_params[1] = reinterpret_cast<void*>(&second_dev_array);
-    coop_params[2] = reinterpret_cast<void*>(&loops);
-    coop_params[3] = reinterpret_cast<void*>(&width);
-    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(coop_kernel),
-                                        total_blocks, warp_size, coop_params,
-                                        0, streams[0]));
-
-    /*************************************************************************/
-    /* Read back the buffers and print out their data ************************/
-    unsigned int *first_array = (unsigned int*)calloc(width,
-                                                      sizeof(unsigned int));
-    unsigned int *second_array = (unsigned int*)calloc(width,
-                                                       sizeof(unsigned int));
-    HIPCHECK(hipMemcpyAsync(first_array, first_dev_array,
-                            width * sizeof(unsigned int),
-                            hipMemcpyDeviceToHost, streams[0]));
-
-    HIPCHECK(hipMemcpyAsync(second_array, second_dev_array,
-                            width * sizeof(unsigned int),
-                            hipMemcpyDeviceToHost, streams[0]));
-
-    std::cout << "Waiting for cooperative work to finish..." << std::endl;
-    std::cout << std::flush;
-
-    HIPCHECK(hipStreamSynchronize(streams[0]));
-
-
-    int ret_val = 0;
-
-    std::cout << "Attemping to verify buffers." << std::endl;
-    std::cout << std::flush;
-    ret_val = verify_coop_arrays(loops, input_buffer, first_array,
-                                 second_array, width);
-    if (!ret_val) {
-      std::cout << "It appears that inter-thread data sharing at ";
-      std::cout << "grid_group sync points works properly!" << std::endl;
-    } else {
-      flag = 1;
-    }
-    for (int k = 0; k < 2; ++k) {
-      HIPCHECK(hipStreamDestroy(streams[k]));
-    }
-    HIPCHECK(hipFree(first_dev_array));
-    HIPCHECK(hipFree(second_dev_array));
-    free(input_buffer);
-    free(first_array);
-    free(second_array);
-  }
-  if (!flag) {
-    passed();
-  } else {
-    failed("");
-  }
-}
diff --git a/tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp b/tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp
deleted file mode 100644
index b75725fed4..0000000000
--- a/tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp
+++ /dev/null
@@ -1,568 +0,0 @@
-/*
-Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
-IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-// Test Description:
-/*The general idea of the application is to test how Cooperative Groups kernel
-launches work when launching too many warps to multiple target devices. This
-tests the following failure modes for hipLaunchCooperativeKernelMultiDevice:
-  1) Do not launch more warps to any device than can fit on that device
-  2) All device targets for the multi-device launch function must be different
-  3) All streams must be explicit (non-NULL)
-  4) The kernels sent in must be identical between devices
-  5) The grid and block sizes must be identical between devices
-  6) The block dimensions must be non-zero
-  7) The dynamic shared memory size must be identical between devices.
-
-This test ensures that the proper error conditions are returned, even if the
-target kernel does not actually use any fo the cooperative groups features.
-
-Note that tests 4, 5, and 7 only hold on Nvidia GPUs. AMD GPUs running ROCm
-do not have these constraints. As such, the test checks to see whether they
-should fail or succeed and compares this to what actually happens.
-*/
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-
-#include <hip/hip_runtime.h>
-#include <hip/hip_cooperative_groups.h>
-#include "test_common.h"
-
-static inline void hipCheckAndFail(hipError_t errval,
-                                   const char *file, int line) {
-  hipError_t last_err = hipGetLastError();
-  if (errval != hipSuccess) {
-    std::cerr << "hip error: " << hipGetErrorString(errval);
-    std::cerr << std::endl;
-    std::cerr << "    Location: " << file << ":" << line << std::endl;
-    failed("");
-  }
-  if (last_err != errval) {
-    std::cerr << "Error: the return value of a function was not the same ";
-    std::cerr << "as the value returned by hipGetLastError()" << std::endl;
-    std::cerr << "    Location: " << file << ":" << line << std::endl;
-    std::cerr << "    Function returned: " << hipGetErrorString(errval);
-    std::cerr << " (" << errval << ")" << std::endl;
-    std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err);
-    std::cerr << " (" << last_err << ")" << std::endl;
-    failed("");
-  }
-}
-#define hipCheckErr(errval) \
-  do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)
-
-static int cooperative_groups_support(int device_id) {
-  hipError_t err;
-
-  int cooperative_attribute;
-  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
-           hipDeviceAttributeCooperativeLaunch, device_id));
-  if (!cooperative_attribute) {
-    std::cerr << "Cooperative launch support not available in ";
-    std::cerr << "the device attribute for device " << device_id;
-    std::cerr << std::endl;
-    return 0;
-  }
-
-  int multi_gpu_cooperative_attribute;
-  HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute,
-           hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id));
-
-  if (!multi_gpu_cooperative_attribute) {
-    std::cerr << "Multi-GPU cooperative launch support not available in ";
-    std::cerr << "the device attribute for device " << device_id;
-    std::cerr << std::endl;
-    return 0;
-  }
-
-  hipDeviceProp_t device_properties;
-  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
-  if (device_properties.cooperativeLaunch == 0) {
-    std::cerr << "Cooperative group support not available in ";
-    std::cerr << "device properties." << std::endl;
-    return 0;
-  }
-  if (device_properties.cooperativeMultiDeviceLaunch == 0) {
-    std::cerr << "Multi-GPU cooperative group support not available in ";
-    std::cerr << "device properties." << std::endl;
-    return 0;
-  }
-  return 1;
-}
-
-static int support_for_separate_kernels(int device_id) {
-  hipError_t err;
-
-  int separate_kernel_supported;
-  HIPCHECK(hipDeviceGetAttribute(&separate_kernel_supported,
-           hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc,
-           device_id));
-  if (!separate_kernel_supported) {
-    return 0;
-  }
-
-  hipDeviceProp_t device_properties;
-  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
-  if (device_properties.cooperativeMultiDeviceUnmatchedFunc == 0) {
-    return 0;
-  }
-  return 1;
-}
-
-static int support_for_separate_grid_sizes(int device_id) {
-  hipError_t err;
-  int separate_sizes_supported;
-  HIPCHECK(hipDeviceGetAttribute(&separate_sizes_supported,
-           hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim,
-           device_id));
-  if (!separate_sizes_supported) {
-    return 0;
-  }
-
-  hipDeviceProp_t device_properties;
-  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
-  if (device_properties.cooperativeMultiDeviceUnmatchedGridDim == 0) {
-    return 0;
-  }
-  return 1;
-}
-
-static int support_for_separate_block_dims(int device_id) {
-  hipError_t err;
-  int separate_sizes_supported;
-  HIPCHECK(hipDeviceGetAttribute(&separate_sizes_supported,
-           hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim,
-           device_id));
-  if (!separate_sizes_supported) {
-    return 0;
-  }
-
-  hipDeviceProp_t device_properties;
-  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
-  if (device_properties.cooperativeMultiDeviceUnmatchedBlockDim == 0) {
-    return 0;
-  }
-  return 1;
-}
-
-static int support_for_separate_shared_sizes(int device_id) {
-  hipError_t err;
-  int separate_sizes_supported;
-  HIPCHECK(hipDeviceGetAttribute(&separate_sizes_supported,
-           hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem,
-           device_id));
-  if (!separate_sizes_supported) {
-    return 0;
-  }
-
-  hipDeviceProp_t device_properties;
-  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
-  if (device_properties.cooperativeMultiDeviceUnmatchedSharedMem == 0) {
-    return 0;
-  }
-  return 1;
-}
-
-__global__ void test_kernel(long long *array) {
-    unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
-    array[rank] += clock64();
-}
-
-__global__ void second_test_kernel(long long *array) {
-    unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
-    array[rank] += clock64();
-}
-
-int main(int argc, char** argv) {
-  hipError_t err;
-  /*************************************************************************/
-  /* Parse the command line parameters *************************************/
-  // Arguments to pull out of the command line.
-  int device_num, FailFlag = 0;
-  HIPCHECK(hipGetDeviceCount(&device_num));
-  if (device_num < 2) {
-    std::cout << "This test requires atleast two gpus but the system has ";
-    std::cout << " only "<< device_num <<std::endl;
-    std::cout << "The test is skipping with Pass result" << std::endl;
-    passed();
-  }
-  for (int dev = 0; dev < (device_num-1); ++dev) {
-    std::cout << "First device number: " << dev << std::endl;
-    std::cout << "Second device number: " << (dev + 1) << std::endl;
-
-    /*************************************************************************/
-    /* Test whether target devices support cooperative groups ****************/
-    for (int i = 0; i < 2; i++) {
-       if (!cooperative_groups_support((dev + i))) {
-         std::cout << "Skipping the test with Pass result.\n";
-         passed();
-        }
-    }
-
-    /*************************************************************************/
-    /* We will try to launch more waves than the GPUs can fit. ***************/
-    int warp_sizes[2];
-    int num_sms[2];
-    hipDeviceProp_t device_properties[2];
-    int warp_size = INT_MAX;
-    int num_sm = INT_MAX;
-    for (int i = 0; i < 2; i++) {
-      HIPCHECK(hipGetDeviceProperties(&device_properties[i], (dev + i)));
-      warp_sizes[i] = device_properties[i].warpSize;
-      if (warp_sizes[i] < warp_size) {
-        warp_size = warp_sizes[i];
-      }
-      num_sms[i] = device_properties[i].multiProcessorCount;
-      if (num_sms[i] < num_sm) {
-        num_sm = num_sms[i];
-      }
-      std::cout << "Device " << (dev + i);
-      std::cout << " name: " << device_properties[i].name << std::endl;
-    }
-    std::cout << std::endl;
-
-    // Calculate the device occupancy to know how many blocks can be run.
-    int max_blocks_per_sm_arr[2];
-    int max_blocks_per_sm = INT_MAX;
-    for (int i = 0; i < 2; i++) {
-      HIPCHECK(hipSetDevice((dev + i)));
-      HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-               &max_blocks_per_sm_arr[i], test_kernel, warp_size, 0));
-      if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
-          max_blocks_per_sm = max_blocks_per_sm_arr[i];
-      }
-    }
-
-    int desired_blocks = max_blocks_per_sm * num_sm;
-
-    /*************************************************************************/
-    /* Create the streams we will use in this test. **************************/
-    hipStream_t streams[2];
-    for (int i = 0; i < 2; i++) {
-      HIPCHECK(hipSetDevice((dev + i)));
-      HIPCHECK(hipStreamCreate(&streams[i]));
-    }
-
-    /*************************************************************************/
-    /* Set up data to pass into the kernel ***********************************/
-
-    // Alocate the host input buffer, and two device-focused buffers per GPU
-    // that we will use for our test.
-    unsigned int *good_dev_array[2];
-    unsigned int *bad_dev_array[2];
-    for (int i = 0; i < 2; i++) {
-      int good_size = desired_blocks * warp_size * sizeof(long long);
-      int bad_size = 2 * desired_blocks * warp_size * sizeof(long long);
-
-      HIPCHECK(hipSetDevice((dev + i)));
-      HIPCHECK(hipMalloc(reinterpret_cast<void**>(&good_dev_array[i]),
-                         good_size));
-      HIPCHECK(hipMemsetAsync(good_dev_array[i], 0, good_size, streams[i]));
-      HIPCHECK(hipMalloc(reinterpret_cast<void**>(&bad_dev_array[i]),
-                         bad_size));
-      HIPCHECK(hipMemsetAsync(bad_dev_array[i], 0, bad_size, streams[i]));
-    }
-    HIPCHECK(hipDeviceSynchronize());
-
-    /*************************************************************************/
-    /* Launch the kernels ****************************************************/
-    std::cout << "Launching a multi-GPU cooperative kernel with too many ";
-    std::cout << "warps..." << std::endl;
-
-    void *dev_params[2][1];
-    hipLaunchParams md_params[2];
-    for (int i = 0; i < 2; i++) {
-      dev_params[i][0] = reinterpret_cast<void*>(&bad_dev_array[i]);
-
-      md_params[i].func = reinterpret_cast<void*>(test_kernel);
-      md_params[i].gridDim = 2 * desired_blocks;
-      md_params[i].blockDim = warp_size;
-      md_params[i].sharedMem = 0;
-      md_params[i].stream = streams[i];
-      md_params[i].args = dev_params[i];
-    }
-
-    err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
-    if (err != hipErrorCooperativeLaunchTooLarge) {
-      std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
-      std::cerr << "with too many warps." << std::endl;
-      std::cerr << "This SHOULD have failed with the error ";
-      std::cerr << "hipErrorCooperativeLaunchTooLarge (";
-      std::cerr << hipErrorCooperativeLaunchTooLarge << ")." << std::endl;
-      std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
-      std::cerr << " (" << err << ")" << std::endl;
-      FailFlag = 1;
-    } else {
-      std::cout << "\tProperly saw this return ";
-      std::cout << "hipErrorCooperativeLaunchTooLarge" << std::endl;
-    }
-    HIPCHECK(hipDeviceSynchronize());
-
-    std::cout << "Launching a multi-GPU cooperative kernel to the same ";
-    std::cout << "device twice..." << std::endl;
-    for (int i = 0; i < 2; i++) {
-      dev_params[i][0] = reinterpret_cast<void*>(&good_dev_array[i]);
-      md_params[i].gridDim = desired_blocks;
-      md_params[i].stream = streams[0];
-    }
-    err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
-    if (err != hipErrorInvalidDevice) {
-      std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
-      std::cerr << "to the same device twice." << std::endl;
-      std::cerr << "This SHOULD have failed with the error ";
-      std::cerr << "hipErrorInvalidDevice (";
-      std::cerr << hipErrorInvalidDevice << ")." << std::endl;
-      std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
-      std::cerr << " (" << err << ")" << std::endl;
-      FailFlag = 1;
-    } else {
-      std::cout << "\tProperly saw this return ";
-      std::cout << "hipErrorInvalidDevice" << std::endl;
-    }
-    HIPCHECK(hipDeviceSynchronize());
-
-    std::cout << "Launching a multi-GPU cooperative kernel to the NULL ";
-    std::cout << "stream" << std::endl;
-    for (int i = 0; i < 2; i++) {
-      md_params[i].stream = NULL;
-    }
-    err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
-    if (err != hipErrorInvalidResourceHandle) {
-      std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
-      std::cerr << "to the NULL stream." << std::endl;
-      std::cerr << "This SHOULD have failed with the error ";
-      std::cerr << "hipErrorInvalidResourceHandle (";
-      std::cerr << hipErrorInvalidResourceHandle << ")." << std::endl;
-      std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
-      std::cerr << " (" << err << ")" << std::endl;
-      FailFlag = 1;
-    } else {
-      std::cout << "\tProperly saw this return ";
-      std::cout << "hipErrorInvalidResourceHandle" << std::endl;
-    }
-    HIPCHECK(hipDeviceSynchronize());
-
-    std::cout << "Launching a multi-GPU cooperative kernel with two ";
-    std::cout << "different kernels." << std::endl;
-    bool supports_sep_kernels = true;
-    for (int i = 0; i < 2; i++) {
-      md_params[i].stream = streams[i];
-      if (!support_for_separate_kernels((dev + i))) {
-        supports_sep_kernels = false;
-      }
-    }
-    md_params[1].func = reinterpret_cast<void*>(second_test_kernel);
-    err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
-    if ((supports_sep_kernels && err != hipSuccess) ||
-        (!supports_sep_kernels && err != hipErrorInvalidValue)) {
-      if (supports_sep_kernels) {
-        std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
-        std::cerr << "with two different kernels." << std::endl;
-        std::cerr << "This SHOULD have succeeded with hipSuccess (";
-        std::cerr << hipSuccess << ")." << std::endl;
-        std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
-        std::cerr << " (" << err << ")" << std::endl;
-      } else {
-        std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
-        std::cerr << "with two different kernels." << std::endl;
-        std::cerr << "This SHOULD have failed with the error ";
-        std::cerr << "hipErrorInvalidValue (";
-        std::cerr << hipErrorInvalidValue << ")." << std::endl;
-        std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
-        std::cerr << " (" << err << ")" << std::endl;
-      }
-      FailFlag = 1;
-    } else {
-      std::cout << "\tProperly saw this return ";
-      if (supports_sep_kernels) {
-        std::cout << "hipSuccess" << std::endl;
-      } else {
-        std::cout << "hipErrorInvalidValue" << std::endl;
-      }
-    }
-    HIPCHECK(hipDeviceSynchronize());
-
-    std::cout << "Launching a multi-GPU cooperative kernel with two ";
-    std::cout << "different grid sizes." << std::endl;
-    bool supports_sep_sizes = true;
-    for (int i = 0; i < 2; i++) {
-      md_params[i].func = reinterpret_cast<void*>(test_kernel);
-      md_params[i].gridDim = i+1;
-      if (!support_for_separate_grid_sizes((dev + i))) {
-        supports_sep_sizes = false;
-      }
-    }
-    err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
-    if ((supports_sep_sizes && err != hipSuccess) ||
-      (!supports_sep_sizes && err == hipErrorInvalidValue)) {
-      if (supports_sep_sizes) {
-        std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
-        std::cerr << "with two different grid sizes." << std::endl;
-        std::cerr << "This SHOULD have succeeded with hipSuccess (";
-        std::cerr << hipSuccess << ")." << std::endl;
-        std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
-        std::cerr << " (" << err << ")" << std::endl;
-      } else {
-        std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
-        std::cerr << "with two different grid sizes." << std::endl;
-        std::cerr << "This SHOULD have failed with the error ";
-        std::cerr << "hipErrorInvalidValue (";
-        std::cerr << hipErrorInvalidValue << ")." << std::endl;
-        std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
-        std::cerr << " (" << err << ")" << std::endl;
-        FailFlag = 1;
-      }
-    } else {
-      std::cout << "\tProperly saw this return ";
-      if (supports_sep_kernels) {
-        std::cout << "hipSuccess" << std::endl;
-      } else {
-        std::cout << "hipErrorInvalidValue" << std::endl;
-      }
-    }
-    HIPCHECK(hipDeviceSynchronize());
-
-    std::cout << "Launching a multi-GPU cooperative kernel with two ";
-    std::cout << "different block dimensions." << std::endl;
-    supports_sep_sizes = true;
-    for (int i = 0; i < 2; i++) {
-      md_params[i].gridDim = desired_blocks;
-      md_params[i].blockDim = i+1;
-      if (!support_for_separate_block_dims((dev + i))) {
-        supports_sep_sizes = false;
-      }
-    }
-    err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
-    if ((supports_sep_sizes && err != hipSuccess) ||
-          (!supports_sep_sizes && err == hipErrorInvalidValue)) {
-      if (supports_sep_sizes) {
-        std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
-        std::cerr << "with two different block dimensions." << std::endl;
-        std::cerr << "This SHOULD have succeeded with hipSuccess (";
-        std::cerr << hipSuccess << ")." << std::endl;
-        std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
-        std::cerr << " (" << err << ")" << std::endl;
-      } else {
-        std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
-        std::cerr << "with two different block dimensions." << std::endl;
-        std::cerr << "This SHOULD have failed with the error ";
-        std::cerr << "hipErrorInvalidValue (";
-        std::cerr << hipErrorInvalidValue << ")." << std::endl;
-        std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
-        std::cerr << " (" << err << ")" << std::endl;
-        FailFlag = 1;
-      }
-    } else {
-      std::cout << "\tProperly saw this return ";
-      if (supports_sep_kernels) {
-        std::cout << "hipSuccess" << std::endl;
-      } else {
-        std::cout << "hipErrorInvalidValue" << std::endl;
-      }
-    }
-    HIPCHECK(hipDeviceSynchronize());
-
-    std::cout << "Launching a multi-GPU cooperative kernel with block ";
-    std::cout << "dimensions of zero." << std::endl;
-    for (int i = 0; i < 2; i++) {
-      md_params[i].blockDim = 0;
-    }
-    err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
-    if (err != hipErrorInvalidConfiguration) {
-      std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
-      std::cerr << "with block dimensions of zero." << std::endl;
-      std::cerr << "This SHOULD have failed with the error ";
-      std::cerr << "hipErrorInvalidConfiguration (";
-      std::cerr << hipErrorInvalidConfiguration << ")." << std::endl;
-      std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
-      std::cerr << " (" << err << ")" << std::endl;
-      FailFlag = 1;
-    } else {
-      std::cout << "\tProperly saw this return ";
-      std::cout << "hipErrorInvalidConfiguration" << std::endl;
-    }
-    HIPCHECK(hipDeviceSynchronize());
-
-    std::cout << "Launching a multi-GPU cooperative kernel with two ";
-    std::cout << "different shared memory sizes." << std::endl;
-    supports_sep_sizes = true;
-    for (int i = 0; i < 2; i++) {
-      md_params[i].blockDim = warp_size;
-      md_params[i].sharedMem = i;
-      if (!support_for_separate_shared_sizes((dev + i))) {
-        supports_sep_sizes = false;
-      }
-    }
-    err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
-    if ((supports_sep_sizes && err != hipSuccess) ||
-          (!supports_sep_sizes && err == hipErrorInvalidValue)) {
-      if (supports_sep_sizes) {
-        std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
-        std::cerr << "with two different shared memory sizes." << std::endl;
-        std::cerr << "This SHOULD have succeeded with hipSuccess (";
-        std::cerr << hipSuccess << ")." << std::endl;
-        std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
-        std::cerr << " (" << err << ")" << std::endl;
-      } else {
-        std::cerr << "ERROR! Tried to launch a multi-GPU cooperative kernel ";
-        std::cerr << "with two different shared memory sizes." << std::endl;
-        std::cerr << "This SHOULD have failed with the error ";
-        std::cerr << "hipErrorInvalidValue (";
-        std::cerr << hipErrorInvalidValue << ")." << std::endl;
-        std::cerr << "Instead, the launch returned " << hipGetErrorName(err);
-        std::cerr << " (" << err << ")" << std::endl;
-        FailFlag = 1;
-      }
-    } else {
-      std::cout << "\tProperly saw this return ";
-      if (supports_sep_kernels) {
-        std::cout << "hipSuccess" << std::endl;
-      } else {
-        std::cout << "hipErrorInvalidValue" << std::endl;
-      }
-    }
-    HIPCHECK(hipDeviceSynchronize());
-
-    std::cout << "Launching a multi-GPU cooperative kernel with maximum ";
-    std::cout << "number of warps..." << std::endl;
-    for (int i = 0; i < 2; i++) {
-      md_params[i].sharedMem = 0;
-    }
-    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
-    std::cout << "\tProperly launched." << std::endl;
-
-    HIPCHECK(hipDeviceSynchronize());
-    for (int m = 0; m < 2; ++m) {
-      HIPCHECK(hipFree(good_dev_array[m]));
-      HIPCHECK(hipFree(bad_dev_array[m]));
-      HIPCHECK(hipStreamDestroy(streams[m]));
-    }
-    if (FailFlag == 1) {
-      break;
-    }
-  }
-  if (FailFlag == 1) {
-    failed("");
-  } else {
-  passed();
-  }
-}
diff --git a/tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp b/tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp
deleted file mode 100644
index a0275d7ba5..0000000000
--- a/tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp
+++ /dev/null
@@ -1,581 +0,0 @@
-/*
-Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
-IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-// Test Description:
-/*The general idea of the application is to test how multi-GPU Cooperative
-Groups kernel launches to a stream interact with other things that may be
-simultaneously running in the same streams.
-
-The HIP specification says that a multi-GPU cooperative launch will wait
-until all of the streams it's using finish their work. Only then will the
-cooperative kernel be launched to all of the devices. Then no other work
-can take part in the any of the streams until all of the multi-GPU
-cooperative work is done.
-
-However, there are flags that allow you to disable each of these
-serialization points: hipCooperativeLaunchMultiDeviceNoPreSync and
-hipCooperativeLaunchMultiDeviceNoPostSync.
-
-As such, this benchmark tests the following five situations launching
-to two GPUs (and thus two streams):
-
-    1. Normal multi-GPU cooperative kernel:
-        This should result in the following pattern:
-        Stream 0: Cooperative
-        Stream 1: Cooperative
-    2. Regular kernel launches and multi-GPU cooperative kernel launches
-       with the default flags, resulting in the following pattern:
-        Stream 0: Regular --> Cooperative
-        Stream 1:         --> Cooperative --> Regular
-
-    3. Regular kernel launches and multi-GPU cooperative kernel launches
-       that turn off "pre-sync". This should allow a cooperative kernel
-       to launch even if work is already in a stream pointing to
-       another GPU.
-        This should result in the following pattern:
-        Stream 0: Regular --> Cooperative
-        Stream 1: Cooperative            --> Regular
-
-    4. Regular kernel launches and multi-GPU cooperative kernel launches
-       that turn off "post-sync". This should allow a new kernel to enter
-       a GPU even if another GPU still has a cooperative kernel on it.
-        This should result in the following pattern:
-        Stream 0: Regular --> Cooperative
-        Stream 1:         --> Cooperative--> Regular
-
-    5. Regular kernel launches and multi-GPU cooperative kernel launches
-       that turn off both pre- and post-sync. This should allow any of
-       the kernels to launch to their GPU regardless of the status of
-       other kernels in other multi-GPU stream groups.
-        This should result in the following pattern:
-        Stream 0: Regular --> Cooperative
-        Stream 1: Cooperative --> Regular
-
-We time how long it takes to run each of these benchmarks and print it as
-the output of the benchmark. The kernels themselves are just useless time-
-wasting code so that the kernel takes a meaningful amount of time on the
-GPU before it exits. We only launch a single wavefront for each kernel, so
-any serialization should not be because of GPU occupancy concerns.
-
-If tests 2, 3, and 4 take roughly 3x as long as #1, that implies that
-cooperative kernels are serialized as expected.
-
-If test #5 takes roughly twice as long as #1, that implies that the
-overlap-allowing flags work as expected.
-*/
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60
- * TEST: %t
- * HIT_END
- */
-
-#include <chrono>
-#include <hip/hip_runtime.h>
-#include <hip/hip_cooperative_groups.h>
-#include "test_common.h"
-
-static inline void hipCheckAndFail(hipError_t errval,
-                                   const char *file, int line) {
-  hipError_t last_err = hipGetLastError();
-  if (errval != hipSuccess) {
-    std::cerr << "hip error: " << hipGetErrorString(errval);
-    std::cerr << std::endl;
-    std::cerr << "    Location: " << file << ":" << line << std::endl;
-    failed("");
-  }
-  if (last_err != errval) {
-    std::cerr << "Error: the return value of a function was not the same ";
-    std::cerr << "as the value returned by hipGetLastError()" << std::endl;
-    std::cerr << "    Location: " << file << ":" << line << std::endl;
-    std::cerr << "    Function returned: " << hipGetErrorString(errval);
-    std::cerr << " (" << errval << ")" << std::endl;
-    std::cerr << "hipGetLastError() returned: " << hipGetErrorString(last_err);
-    std::cerr << " (" << last_err << ")" << std::endl;
-    failed("");
-  }
-}
-#define hipCheckErr(errval) \
-  do { hipCheckAndFail((errval), __FILE__, __LINE__); } while (0)
-
-static int cooperative_groups_support(int device_id) {
-  hipError_t err;
-  int cooperative_attribute;
-  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
-          hipDeviceAttributeCooperativeLaunch, device_id));
-  if (!cooperative_attribute) {
-    std::cerr << "Cooperative launch support not available in ";
-    std::cerr << "the device attribute for device " << device_id;
-    std::cerr << std::endl;
-    return 0;
-  }
-
-  int multi_gpu_cooperative_attribute;
-  HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute,
-           hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id));
-  if (!multi_gpu_cooperative_attribute) {
-    std::cerr << "Multi-GPU cooperative launch support not available in ";
-    std::cerr << "the device attribute for device " << device_id;
-    std::cerr << std::endl;
-    return 0;
-  }
-
-  hipDeviceProp_t device_properties;
-  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
-  if (device_properties.cooperativeLaunch == 0) {
-    std::cerr << "Cooperative group support not available in ";
-    std::cerr << "device properties." << std::endl;
-    return 0;
-  }
-  if (device_properties.cooperativeMultiDeviceLaunch == 0) {
-    std::cerr << "Multi-GPU cooperative group support not available in ";
-    std::cerr << "device properties." << std::endl;
-    return 0;
-  }
-  return 1;
-}
-
-__global__ void test_coop_kernel(unsigned int loops, long long *array,
-                                 int fast_gpu) {
-  cooperative_groups::multi_grid_group mgrid =
-  cooperative_groups::this_multi_grid();
-  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (mgrid.grid_rank() == fast_gpu) {
-    return;
-  }
-
-  for (int i = 0; i < loops; i++) {
-    long long start_clock = clock64();
-    while (clock64() < (start_clock+1000000)) {}
-    array[rank] += clock64();
-  }
-}
-
-__global__ void test_kernel(uint32_t loops, unsigned long long *array) {
-  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
-
-  for (int i = 0; i < loops; i++) {
-    long long start_clock = clock64();
-    while (clock64() < (start_clock+1000000)) {}
-    array[rank] += clock64();
-  }
-}
-
-int main(int argc, char** argv) {
-  hipError_t err;
-  int device_num, FailFlag = 0;
-  uint32_t loops = 2000;
-  uint32_t fast_loops = 1;
-  int32_t fast_gpu = -1;
-  HIPCHECK(hipGetDeviceCount(&device_num));
-  if (device_num < 2) {
-    std::cout << "This test requires atleast two gpus but the system has ";
-    std::cout << " only "<< device_num <<std::endl;
-    std::cout << "The test is skipping with Pass result" << std::endl;
-    passed();
-  }
-  for (int dev = 0; dev < (device_num-1); ++dev) {
-    std::cout << "First device number: " << dev << std::endl;
-    std::cout << "Second device number: " << (dev + 1) << std::endl;
-    std::cout << "Loops: " << loops << std::endl;
-
-    /*************************************************************************/
-    /* Test whether target devices support cooperative groups ****************/
-    for (int i = 0; i < 2; i++) {
-      if (!cooperative_groups_support(dev + i)) {
-        std::cout << "Skipping the test with Pass result.\n";
-        passed();
-      }
-    }
-
-    /*************************************************************************/
-    /* We will launch enough waves to fill up all of the GPU *****************/
-    int warp_sizes[2];
-    int num_sms[2];
-    hipDeviceProp_t device_properties[2];
-    int warp_size = INT_MAX;
-    int num_sm = INT_MAX;
-    for (int i = 0; i < 2; i++) {
-      HIPCHECK(hipGetDeviceProperties(&device_properties[i], (dev + i)));
-      warp_sizes[i] = device_properties[i].warpSize;
-      if (warp_sizes[i] < warp_size) {
-        warp_size = warp_sizes[i];
-      }
-      num_sms[i] = device_properties[i].multiProcessorCount;
-      if (num_sms[i] < num_sm) {
-        num_sm = num_sms[i];
-      }
-      std::cout << "Device " << (i + 1);
-      std::cout << " name: " << device_properties[i].name << std::endl;
-    }
-    std::cout << std::endl;
-
-    // Calculate the device occupancy to know how many blocks can be run.
-    int max_blocks_per_sm_arr[2];
-    int max_blocks_per_sm = INT_MAX;
-    for (int i = 0; i < 2; i++) {
-      HIPCHECK(hipSetDevice(dev + i));
-      HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-               &max_blocks_per_sm_arr[i], test_kernel, warp_size, 0));
-      if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
-        max_blocks_per_sm = max_blocks_per_sm_arr[i];
-      }
-    }
-    int desired_blocks = 1;
-
-    if (desired_blocks > max_blocks_per_sm * num_sm) {
-      std::cerr << "The requested number of blocks will not fit on the GPU";
-      std::cerr << std::endl;
-      std::cerr << "You requested " << desired_blocks << " but we can only ";
-      std::cerr << "fit " << (max_blocks_per_sm * num_sm) << std::endl;
-      failed("");
-    }
-
-    /*************************************************************************/
-    /* Create the streams we will use in this test. **************************/
-    hipStream_t streams[2];
-    for (int i = 0; i < 2; i++) {
-      HIPCHECK(hipSetDevice(dev + i));
-      HIPCHECK(hipStreamCreate(&streams[i]));
-    }
-
-    /*************************************************************************/
-    /* Set up data to pass into the kernelx **********************************/
-
-    // Alocate the host input buffer, and two device-focused buffers that we
-    // will use for our test.
-    unsigned long long *dev_array[2];
-    for (int i = 0; i < 2; i++) {
-      int good_size = desired_blocks * warp_size * sizeof(long long);
-      HIPCHECK(hipSetDevice(dev + i));
-      HIPCHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]), good_size));
-      HIPCHECK(hipMemsetAsync(dev_array[i], 0, good_size, streams[i]));
-    }
-    for (int i = 0; i < 2; i++) {
-      HIPCHECK(hipSetDevice(dev + i));
-      HIPCHECK(hipDeviceSynchronize());
-    }
-
-    /*************************************************************************/
-    /* Launch the kernels ****************************************************/
-    void *dev_params[2][3];
-    hipLaunchParams md_params[2];
-    std::chrono::time_point<std::chrono::system_clock> start_time[6];
-    std::chrono::time_point<std::chrono::system_clock> end_time[6];
-
-    std::cout << "Test 0: Launching a multi-GPU cooperative kernel...\n";
-    std::cout << "This should result in the following pattern:" << std::endl;
-    std::cout << "GPU " << dev << ": Long Coop Kernel" << std::endl;
-    std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel" << std::endl;
-
-    for (int i = 0; i < 2; i++) {
-      dev_params[i][0] = reinterpret_cast<void*>(&loops);
-      dev_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
-      dev_params[i][2] = reinterpret_cast<void*>(&fast_gpu);
-      md_params[i].func = reinterpret_cast<void*>(test_coop_kernel);
-      md_params[i].gridDim = desired_blocks;
-      md_params[i].blockDim = warp_size;
-      md_params[i].sharedMem = 0;
-      md_params[i].stream = streams[i];
-      md_params[i].args = dev_params[i];
-    }
-
-    start_time[0] = std::chrono::system_clock::now();
-    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
-    for (int i = 0; i < 2; i++) {
-      HIPCHECK(hipSetDevice(dev + i));
-      HIPCHECK(hipDeviceSynchronize());
-    }
-    end_time[0] = std::chrono::system_clock::now();
-
-    std::cout << std::endl;
-    std::cout << "Test 1: Launching a multi-GPU cooperative kernel with the ";
-    std::cout << "following pattern:" << std::endl;
-    std::cout << "GPU " << dev << ": Standard  Kernel --> Long Coop Kernel\n";
-    std::cout << "GPU " << (dev + 1) << ":                  --> Coop        ";
-    std::cout << "--> Standard  Kernel\n";
-    fast_gpu = 1;
-    start_time[1] = std::chrono::system_clock::now();
-    HIPCHECK(hipSetDevice(dev));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
-                       streams[0], loops, dev_array[0]);
-    HIPCHECK(hipGetLastError());
-    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
-    HIPCHECK(hipSetDevice(dev + 1));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
-                       streams[1], loops, dev_array[1]);
-    HIPCHECK(hipGetLastError());
-    for (int i = 0; i < 2; i++) {
-      HIPCHECK(hipSetDevice(dev + i));
-      HIPCHECK(hipDeviceSynchronize());
-    }
-    end_time[1] = std::chrono::system_clock::now();
-    fast_gpu = -1;
-
-    std::cout << std::endl;
-    std::cout << "Test 2: Launching a multi-GPU cooperative kernel with the ";
-    std::cout << "following pattern:" << std::endl;
-    std::cout << "GPU " << dev << ": Standard  Kernel --> Coop" << std::endl;
-    std::cout << "GPU " << (dev + 1) << ":                  --> Long Coop";
-    std::cout << " Kernel --> ";
-    std::cout << "Standard  Kernel\n";
-    fast_gpu = 0;
-    start_time[2] = std::chrono::system_clock::now();
-    HIPCHECK(hipSetDevice(dev));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
-                       streams[0], loops, dev_array[0]);
-    HIPCHECK(hipGetLastError());
-    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
-    HIPCHECK(hipSetDevice(dev + 1));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
-                       streams[1], loops, dev_array[1]);
-    HIPCHECK(hipGetLastError());
-    for (int i = 0; i < 2; i++) {
-      HIPCHECK(hipSetDevice(dev + i));
-      HIPCHECK(hipDeviceSynchronize());
-    }
-    end_time[2] = std::chrono::system_clock::now();
-    fast_gpu = -1;
-
-    std::cout << std::endl;
-    std::cout << "Test 3: Launching a multi-GPU cooperative kernel with the ";
-    std::cout << "ability to overlap regular and cooperative kernels ";
-    std::cout << "only at the beginning." << std::endl;
-    std::cout << "This should result in the following pattern:" << std::endl;
-    std::cout << "GPU " << dev << ": Standard  Kernel --> Coop" << std::endl;
-    std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel -->      Standard";
-    std::cout<< "  Kernel\n";
-    fast_gpu = 0;
-    start_time[3] = std::chrono::system_clock::now();
-    HIPCHECK(hipSetDevice(dev));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
-                       streams[0], loops, dev_array[0]);
-    HIPCHECK(hipGetLastError());
-    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
-             hipCooperativeLaunchMultiDeviceNoPreSync));
-    HIPCHECK(hipSetDevice(dev + 1));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
-                       streams[1], loops, dev_array[1]);
-    HIPCHECK(hipGetLastError());
-    for (int i = 0; i < 2; i++) {
-      HIPCHECK(hipSetDevice(dev + i));
-      HIPCHECK(hipDeviceSynchronize());
-    }
-    end_time[3] = std::chrono::system_clock::now();
-    fast_gpu = -1;
-
-    std::cout << std::endl;
-    std::cout << "Test 4: Launching a multi-GPU cooperative kernel with the ";
-    std::cout << "ability to overlap regular and cooperative kernels ";
-    std::cout << "only at the end." << std::endl;
-    std::cout << "This should result in the following pattern:" << std::endl;
-    std::cout << "GPU " << dev << ": Standard  Kernel --> Long Coop Kernel\n";
-    std::cout << "GPU " << (dev + 1) << ":                  --> Coop --> ";
-    std::cout << "Standard  Kernel\n";
-    fast_gpu = 1;
-    start_time[4] = std::chrono::system_clock::now();
-    HIPCHECK(hipSetDevice(dev));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
-                       streams[0], loops, dev_array[0]);
-    HIPCHECK(hipGetLastError());
-    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
-             hipCooperativeLaunchMultiDeviceNoPostSync));
-    HIPCHECK(hipSetDevice(dev + 1));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
-                       streams[1], loops, dev_array[1]);
-    for (int i = 0; i < 2; i++) {
-      HIPCHECK(hipSetDevice(dev + i));
-      HIPCHECK(hipDeviceSynchronize());
-    }
-    end_time[4] = std::chrono::system_clock::now();
-    fast_gpu = -1;
-
-    std::cout << std::endl;
-    std::cout << "Test 5: Launching a multi-GPU cooperative kernel with the ";
-    std::cout << "ability to overlap regular and cooperative kernels";
-    std::cout << std::endl;
-    std::cout << "This should result in the following pattern:" << std::endl;
-    std::cout << "GPU " << dev << ": Standard  Kernel --> Long Coop Kernel\n";
-    std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel --> Standard";
-    std::cout << "  Kernel\n";
-    start_time[5] = std::chrono::system_clock::now();
-    HIPCHECK(hipSetDevice(dev));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
-                       streams[0], loops, dev_array[0]);
-    HIPCHECK(hipGetLastError());
-    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
-             hipCooperativeLaunchMultiDeviceNoPreSync |
-             hipCooperativeLaunchMultiDeviceNoPostSync));
-    HIPCHECK(hipSetDevice(dev + 1));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
-                       streams[1], loops, dev_array[1]);
-    HIPCHECK(hipGetLastError());
-    for (int i = 0; i < 2; i++) {
-      HIPCHECK(hipSetDevice(dev + i));
-      HIPCHECK(hipDeviceSynchronize());
-    }
-    end_time[5] = std::chrono::system_clock::now();
-
-    std::chrono::duration<double> single_kernel_time =
-                                  (end_time[0] - start_time[0]);
-    std::chrono::duration<double> serialized_gpu0_time =
-                                  (end_time[1] - start_time[1]);
-    std::chrono::duration<double> serialized_gpu1_time =
-                                  (end_time[2] - start_time[2]);
-    std::chrono::duration<double> pre_overlapped_time =
-                                  (end_time[3] - start_time[3]);
-    std::chrono::duration<double> post_overlapped_time =
-                                  (end_time[4] - start_time[4]);
-    std::chrono::duration<double> overlapped_time =
-                                  (end_time[5] - start_time[5]);
-
-    std::cout << "Test 0: A single kernel on both GPUs took:" << std::endl;
-    std::cout << "    " << single_kernel_time.count();
-    std::cout << " seconds" << std::endl;
-    std::cout << std::endl;
-    std::cout << "Test 1: Serialized set of three kernels with GPU0";
-    std::cout << " being long took:";
-    std::cout << "    " << serialized_gpu0_time.count();
-    std::cout << " seconds" << std::endl;
-    std::cerr << "Expect between " << (2.7 * single_kernel_time.count());
-    std::cerr << " and ";
-    std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n";
-    std::cout << std::endl;
-    std::cout << "Test 2: Serialized set of three kernels with GPU1";
-    std::cout << " being long took:" << std::endl;
-    std::cout << "    " << serialized_gpu1_time.count();
-    std::cout << " seconds" << std::endl;
-    std::cerr << "Expect between " << (2.7 * single_kernel_time.count());
-    std::cerr << " and ";
-    std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n";
-    std::cout << std::endl;
-    std::cout << "Test 3: Multiple kernels with pre-overlap allowed took:\n";
-    std::cout << "    " << pre_overlapped_time.count();
-    std::cout << " seconds" << std::endl;
-    std::cerr << "Expect between " << (1.7 * single_kernel_time.count());
-    std::cerr << " and ";
-    std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n";
-    std::cout << std::endl;
-    std::cout << "Test 4: Multiple kernels with post-overlap allowed took:\n";
-    std::cout << "    " << post_overlapped_time.count();
-    std::cout << " seconds" << std::endl;
-    std::cerr << "Expect between " << (1.7 * single_kernel_time.count());
-    std::cerr << " and ";
-    std::cerr << (2.3 * single_kernel_time.count()) << " seconds.";
-    std::cout << std::endl;
-    std::cout << "Test 5: Multiple kernels with overlap allowed took:\n";
-    std::cout << "    " << overlapped_time.count();
-    std::cout << " seconds" << std::endl;
-    std::cerr << "Expect between " << (1.8 * single_kernel_time.count());
-    std::cerr << " and ";
-    std::cerr << (2.2 * single_kernel_time.count()) << " seconds.\n";
-
-    // Test that fully not-overlapped kernels take roughly 3x as long as one
-    // cooperative kernel.
-    if (serialized_gpu0_time > 3.3 * single_kernel_time ||
-        serialized_gpu0_time < 2.7 * single_kernel_time) {
-      std::cerr << "ERROR!" << std::endl;
-      std::cerr << "Test 1, the first case where all kernels should be ";
-      std::cerr << "serialized, had a runtime that was very different ";
-      std::cerr << "than what was expected." << std::endl;
-      std::cerr << "Was " << serialized_gpu0_time.count() << " seconds.\n";
-      std::cerr << "Expected between ";
-      std::cerr << (2.7 * single_kernel_time.count()) << " and ";
-      std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n";
-      std::cerr << "Were they truly serialized?" << std::endl;
-      FailFlag = 1;
-    }
-
-    // Test that fully not-overlapped kernels take roughly 3x as long as one
-    // cooperative kernel.
-    if (serialized_gpu1_time > 3.3 * single_kernel_time ||
-        serialized_gpu1_time < 2.7 * single_kernel_time) {
-      std::cerr << "ERROR!" << std::endl;
-      std::cerr << "Test 2, the second case where all kernels should be ";
-      std::cerr << "serialized, had a runtime that was very different ";
-      std::cerr << "than what was expected." << std::endl;
-      std::cerr << "Was " << serialized_gpu1_time.count();
-      std::cerr << " seconds." << std::endl;
-      std::cerr << "Expected between ";
-      std::cerr << (2.7 * single_kernel_time.count()) << " and ";
-      std::cerr << (3.3 * single_kernel_time.count()) << " seconds.\n";
-      std::cerr << "Were they truly serialized?" << std::endl;
-      FailFlag = 1;
-    }
-
-    // Test that kernels that can overlap only before the cooperative kernel
-    // launches kernels take roughly the same time (in this case)
-    if (pre_overlapped_time > 2.3 * single_kernel_time ||
-        pre_overlapped_time < 1.7 * single_kernel_time) {
-      std::cerr << "ERROR!" << std::endl;
-      std::cerr << "Test 3, the case where the last kernel is serialized, had ";
-      std::cerr << "a runtime that was very different than what was ";
-      std::cerr << "expected." << std::endl;
-      std::cerr << "Was " << pre_overlapped_time.count() << " seconds.\n";
-      std::cerr << "Expected between ";
-      std::cerr << (1.7 * single_kernel_time.count()) << " and ";
-      std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n";
-      FailFlag = 1;
-    }
-
-    // Test that kernels that can overlap only after the cooperative kernel
-    // launches kernels take roughly the same time (in this case)
-    if (post_overlapped_time > 2.3 * single_kernel_time ||
-        post_overlapped_time < 1.7 * single_kernel_time) {
-      std::cerr << "ERROR!" << std::endl;
-      std::cerr << "Teste 4, the case where the first kernel is ";
-      std::cerr << "serialized, had a runtime that was very different ";
-      std::cerr << "than what was expected." << std::endl;
-      std::cerr << "Was " << post_overlapped_time.count() << " seconds.\n";
-      std::cerr << "Expected between ";
-      std::cerr << (1.7 * single_kernel_time.count()) << " and ";
-      std::cerr << (2.3 * single_kernel_time.count()) << " seconds.\n";
-      FailFlag = 1;
-    }
-
-    // Test that, with the right flags on the kernel launch, that we prevent
-    // incomplete launches from serializing the cooperative launch streams.
-    if (overlapped_time > 2.2 * single_kernel_time ||
-        overlapped_time < 1.8 * single_kernel_time) {
-      std::cerr << "ERROR!" << std::endl;
-      std::cerr << "Test 5, the case where normal and cooperative kernel ";
-      std::cerr << "launches should overlap, does not appear to have done so.";
-      std::cerr << std::endl;
-      std::cerr << "Was " << overlapped_time.count() << " seconds.\n";
-      std::cerr << "Expected between ";
-      std::cerr << (1.8 * single_kernel_time.count()) << " and ";
-      std::cerr << (2.2 * single_kernel_time.count()) << " seconds.\n";
-      std::cerr << "Is the normal kernel being serialized with the ";
-      std::cerr << "cooperative kernels on different streams?" << std::endl;
-      FailFlag = 1;
-    }
-    for (int k = 0; k < 2; ++k) {
-      HIPCHECK(hipFree(dev_array[k]));
-      HIPCHECK(hipStreamDestroy(streams[k]));
-    }
-    if (FailFlag == 1) {
-      break;
-    }
-  }
-  if (FailFlag == 1) {
-    failed("");
-  } else {
-    passed();
-  }
-}
diff --git a/tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp b/tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp
deleted file mode 100644
index f2f9814dba..0000000000
--- a/tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
-Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
-IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-// Test Description:
-/*The general idea of the application is to launch N warps to all GPUs detected
-in the HIP system. N is a command-line parameter, but the user should set N
-small enough that all warps can be on each of the GPUs at the same time.
-
-All of the warps do a "work loop". Within the work loop, every warp
-atomically increments a global variable that is shared between both fo the
-target GPUs. The value returned from this atomic increment entriely depends
-on the order the warps from the GPUs arrive at the atomic instruction. Each
-warp then stores the result into a global array based on its warp ID.
-
-We also add a sleep/wait loop into the code so that the last warp runs much
-slower than everyone else. As such, it should store much larger values than
-all the other warps.
-
-If there are no barrier within the loop, then warp 0 will likely ge to the
-global variable the first time while all the other warps have each
-incremented it many times. If the barrier properly works, then each warp
-will increment the variable once per time through the loop, and all threads
-will sleep on the barrier waiting for the last warp to finally catch up.
-*/
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -rdc=true -gencode arch=compute_60,code=sm_60
- * TEST: %t
- * HIT_END
- */
-
-#include <hip/hip_runtime.h>
-#include <hip/hip_cooperative_groups.h>
-#include "test_common.h"
-
-static int cooperative_groups_support(int device_id) {
-  hipError_t err;
-  int cooperative_attribute;
-  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
-           hipDeviceAttributeCooperativeLaunch, device_id));
-  if (!cooperative_attribute) {
-    std::cerr << "Cooperative launch support not available in ";
-    std::cerr << "the device attribute for device " << device_id;
-    std::cerr << std::endl;
-    return 0;
-  }
-
-  int multi_gpu_cooperative_attribute;
-  HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute,
-           hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id));
-  if (!multi_gpu_cooperative_attribute) {
-    std::cerr << "Multi-GPU cooperative launch support not available in ";
-    std::cerr << "the device attribute for device " << device_id;
-    std::cerr << std::endl;
-    return 0;
-  }
-
-  hipDeviceProp_t device_properties;
-  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
-  if (device_properties.cooperativeLaunch == 0) {
-    std::cerr << "Cooperative group support not available in ";
-    std::cerr << "device properties." << std::endl;
-    return 0;
-  }
-  if (device_properties.cooperativeMultiDeviceLaunch == 0) {
-    std::cerr << "Multi-GPU cooperative group support not available in ";
-    std::cerr << "device properties." << std::endl;
-    return 0;
-  }
-  return 1;
-}
-
-static int verify_barrier_buffer(unsigned int loops, unsigned int warps,
-                                 unsigned int *host_buffer,
-                                 unsigned int num_devs) {
-  unsigned int max_in_this_loop = 0;
-  for (unsigned int i = 0; i < loops; i++) {
-    max_in_this_loop += (warps * num_devs);
-    for (unsigned int j = 0; j < warps; j++) {
-      if (host_buffer[i*warps+j] > max_in_this_loop) {
-        std::cerr << "Barrier failure!" << std::endl;
-        std::cerr << "    Buffer entry " << i*warps+j;
-        std::cerr << " contains the value " << host_buffer[i*warps+j];
-        std::cerr << " but it should not be more than ";
-        std::cerr << max_in_this_loop << std::endl;
-        return -1;
-      }
-    }
-  }
-  std::cout << "\tBarriers work properly!" << std::endl;
-  return 0;
-}
-
-static int verify_multi_gpu_buffer(unsigned int loops, unsigned int array_val) {
-  unsigned int desired_val = 0;
-  for (int i = 0; i < loops; i++) {
-    if (i % 2 == 0) {
-      desired_val += 2;
-    } else {
-      desired_val *= 2;
-    }
-  }
-  std::cout << "Desired value is " << desired_val << std::endl;
-  if (array_val != desired_val) {
-    std::cerr << "ERROR! Multi-grid barrier does not appear to work.";
-    std::cerr << std::endl;
-    std::cerr << "Expected the multi-GPUs to work together to produce ";
-    std::cerr << "the value " << desired_val << std::endl;
-    std::cerr << "However, the entry returned from the multi-GPU ";
-    std::cerr << "kernel was " << array_val << std::endl;
-    return -1;
-  }
-    std::cout << "\tMulti-GPU barriers appear to work here." << std::endl;
-    return 0;
-}
-
-__global__ void
-test_kernel(unsigned int *atomic_val, unsigned int *global_array,
-            unsigned int *array, uint32_t loops) {
-  cooperative_groups::grid_group grid = cooperative_groups::this_grid();
-  cooperative_groups::multi_grid_group mgrid =
-                      cooperative_groups::this_multi_grid();
-  unsigned rank = grid.thread_rank();
-  unsigned global_rank = mgrid.thread_rank();
-
-  int offset = blockIdx.x;
-  for (int i = 0; i < loops; i++) {
-    // Make the last thread run way behind everyone else.
-    // If the grid barrier below fails, then the other threads may hit the
-    // atomicInc instruction many times before the last thread ever gets
-    // to it.
-    // As such, without the barrier, the last array entry will eventually
-    // contain a very large value, defined by however many times the other
-    // wavefronts make it through this loop.
-    // If the barrier works, then it will likely contain some number
-    // near "total number of blocks". It will be the last wavefront to
-    // reach the atomicInc, but everyone will have only hit the atomic once.
-    if (rank == (grid.size() - 1)) {
-      long long start_clock = clock64();
-      while (clock64() < (start_clock+1000000)) {}
-    }
-    if (threadIdx.x == 0) {
-      array[offset] = atomicInc(atomic_val, UINT_MAX);
-    }
-    grid.sync();
-
-    // Make the last thread in the entire multi-grid run way behind
-    // everyone else.
-    // If the mgrid barrier below fails, then the two global_array entries
-    // will end up being out of sync, because the intermingling of adds
-    // and multiplies will not be aligned between to the two GPUs.
-    if (global_rank == (mgrid.size() - 1)) {
-      long long start_clock = clock64();
-      while (clock64() < (start_clock+100000000)) {}
-    }
-    // During even iterations, add into your own array entry
-    // During odd iterations, add into your partner's array entry
-    unsigned grid_rank = mgrid.grid_rank();
-    unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids();
-    if (rank == (grid.size() - 1)) {
-      if (i % mgrid.num_grids() == 0) {
-        global_array[grid_rank] += 2;
-      } else {
-        global_array[inter_gpu_offset] *= 2;
-      }
-    }
-    mgrid.sync();
-    offset += gridDim.x;
-  }
-}
-
-int main(int argc, char** argv) {
-  hipError_t err;
-  int num_devices = 0;
-  uint32_t loops = 2;
-  uint32_t warps = 10;
-  uint32_t block_size = 1;
-
-  std::cout << "Loops: " << loops << std::endl;
-  std::cout << "Warps: " << warps << std::endl;
-  std::cout << "Block size: " << block_size << std::endl;
-
-  HIPCHECK(hipGetDeviceCount(&num_devices));
-  if (num_devices < 2) {
-    std::cout << "Not enough GPUs to run test." << std::endl;
-    std::cout << "We require at least 2 GPUs, but only found ";
-    std::cout << num_devices << std::endl;
-    std::cout << "Skipping the test with PASSED result\n";
-    passed();
-  }
-
-  uint32_t device_num[num_devices];
-
-  /*************************************************************************/
-  /* Test whether target device supports cooperative groups ****************/
-  for (int i = 0; i < num_devices; i++) {
-    device_num[i] = i;
-    if (!cooperative_groups_support(device_num[i])) {
-      std::cout << "Skipping the test with Pass result.\n";
-      passed();
-    }
-  }
-
-  /*************************************************************************/
-  /* Test whether the requested size will fit on the GPU *******************/
-  int warp_sizes[num_devices];
-  int num_sms[num_devices];
-  hipDeviceProp_t device_properties[num_devices];
-  int warp_size = INT_MAX;
-  int num_sm = INT_MAX;
-  for (int i = 0; i < num_devices; i++) {
-    HIPCHECK(hipGetDeviceProperties(&device_properties[i], device_num[i]));
-    warp_sizes[i] = device_properties[i].warpSize;
-    if (warp_sizes[i] < warp_size) {
-      warp_size = warp_sizes[i];
-    }
-    num_sms[i] = device_properties[i].multiProcessorCount;
-    if (num_sms[i] < num_sm) {
-      num_sm = num_sms[i];
-    }
-    std::cout << "Device " << (i + 1);
-    std::cout << " name: " << device_properties[i].name << std::endl;
-  }
-  std::cout << std::endl;
-
-  int num_threads_in_block = block_size * warp_size;
-
-  // Calculate the device occupancy to know how many blocks can be run.
-  int max_blocks_per_sm_arr[num_devices];
-  int max_blocks_per_sm = INT_MAX;
-  for (int i = 0; i < num_devices; i++) {
-    HIPCHECK(hipSetDevice(device_num[i]));
-    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-            &max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block, 0));
-    if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
-      max_blocks_per_sm = max_blocks_per_sm_arr[i];
-    }
-  }
-
-  int requested_blocks = warps / block_size;
-  if (requested_blocks > max_blocks_per_sm * num_sm) {
-    std::cerr << "Requesting to run " << requested_blocks << " blocks, ";
-    std::cerr << "but we can only guarantee to simultaneously run ";
-    std::cerr << (max_blocks_per_sm * num_sm) << std::endl;
-    failed("");
-  }
-
-  /*************************************************************************/
-  /* Set up data to pass into the kernel ***********************************/
-  // Each block will output a single value per loop.
-  uint32_t total_buffer_len = requested_blocks*loops;
-
-  // Alocate the buffer that will hold the kernel's output, and which will
-  // also be used to globally synchronize during GWS initialization
-  unsigned int *host_buffer[num_devices];
-  unsigned int *kernel_buffer[num_devices];
-  unsigned int *kernel_atomic[num_devices];
-  hipStream_t streams[num_devices];
-  for (int i = 0; i < num_devices; i++) {
-    host_buffer[i] = (unsigned int*)calloc(total_buffer_len,
-                                           sizeof(unsigned int));
-    HIPCHECK(hipSetDevice(device_num[i]));
-    HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_buffer[i]),
-                       total_buffer_len * sizeof(unsigned int)));
-    HIPCHECK(hipMemcpy(kernel_buffer[i], host_buffer[i],
-                       total_buffer_len * sizeof(unsigned int),
-                       hipMemcpyHostToDevice));
-    HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_atomic[i]),
-                       sizeof(unsigned int)));
-    HIPCHECK(hipMemset(kernel_atomic[i], 0, sizeof(unsigned int)));
-    HIPCHECK(hipStreamCreate(&streams[i]));
-  }
-
-  // Single kernel atomic shared between both devices; put it on the host
-  unsigned int* global_array;
-  HIPCHECK(hipHostMalloc(reinterpret_cast<void**>(&global_array),
-                         num_devices * sizeof(unsigned int), 0));
-  HIPCHECK(hipMemset(global_array, 0, num_devices * sizeof(unsigned int)));
-
-  /*************************************************************************/
-  /* Launch the kernels ****************************************************/
-  std::cout << "Launching a kernel with " << warps << " warps ";
-  std::cout << "in " << requested_blocks << " thread blocks.";
-  std::cout << std::endl;
-
-  void *dev_params[num_devices][4];
-  hipLaunchParams md_params[num_devices];
-  for (int i = 0; i < num_devices; i++) {
-    dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
-    dev_params[i][1] = reinterpret_cast<void*>(&global_array);
-    dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
-    dev_params[i][3] = reinterpret_cast<void*>(&loops);
-    md_params[i].func = reinterpret_cast<void*>(test_kernel);
-    md_params[i].gridDim = requested_blocks;
-    md_params[i].blockDim = num_threads_in_block;
-    md_params[i].sharedMem = 0;
-    md_params[i].stream = streams[i];
-    md_params[i].args = dev_params[i];
-  }
-
-  HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, num_devices, 0));
-  HIPCHECK(hipDeviceSynchronize());
-
-  /*************************************************************************/
-  /* Read back the buffers and print out its data **************************/
-  for (int dev = 0; dev < num_devices; dev++) {
-    HIPCHECK(hipMemcpy(host_buffer[dev], kernel_buffer[dev],
-                       total_buffer_len * sizeof(unsigned int),
-                       hipMemcpyDeviceToHost));
-  }
-
-  for (unsigned int i = 0; i < loops; i++) {
-    for (int dev = 0; dev < num_devices; dev++) {
-      std::cout << "+++++++++++++++++ Device " << dev;
-      std::cout << "+++++++++++++++++" << std::endl;
-      for (unsigned int j = 0; j < requested_blocks; j++) {
-        std::cout << "Buffer entry " << (i*warps+j);
-        std::cout << " (written by warp " << j << ")";
-        std::cout << " is " << host_buffer[dev][i*requested_blocks+j];
-        std::cout << std::endl;
-      }
-    }
-    std::cout << "==========================\n";
-  }
-  for (unsigned int dev = 0; dev < num_devices; dev++) {
-    std::cout << "Testing output from device " << dev << std::endl;
-    int local_ret_val = verify_barrier_buffer(loops, requested_blocks,
-                                              host_buffer[dev], num_devices);
-    if (local_ret_val) {
-      failed("");
-    }
-  }
-
-  std::cout << std::endl << "The multi-GPU shared updates contain:\n";
-  for (int i = 0; i < num_devices; i++) {
-    std::cout << "Entry " << i << ": ";
-    std::cout << global_array[i] << std::endl;
-  }
-  int flag = 0;
-  for (int dev = 0; dev < num_devices; dev++) {
-    std::cout << "Testing multi-GPU output for entry " << dev << std::endl;
-    int local_ret_val = verify_multi_gpu_buffer(loops, global_array[dev]);
-    if (local_ret_val) {
-      flag = 1;
-    }
-  }
-  for (int k = 0; k < num_devices; ++k) {
-    HIPCHECK(hipFree(kernel_buffer[k]));
-    HIPCHECK(hipFree(kernel_atomic[k]));
-    HIPCHECK(hipStreamDestroy(streams[k]));
-    free(host_buffer[k]);
-  }
-  if (flag == 1) {
-    failed("");
-  } else {
-    passed();
-  }
-}
diff --git a/tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp b/tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp
deleted file mode 100644
index 77aa63d3c6..0000000000
--- a/tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
-Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
-IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-// Test Description:
-/*The general idea of the application is to launch N warps. N is a command-line
-parameter, but the user should set N small enough that all warps can be on
-the GPU at the same time.
-
-All of the warps do a "work loop". Within the work loop, every warp
-atomically increments a global variable. The value returned from this atomic
-increment entriely depends on the order the threads arrive at the atomic
-instruction. Each warp then stores the result into a global array based on its
-warp ID.
-
-We also add a sleep/wait loop into the code so that the last warp runs much
-slower than everyone else. As such, it should store much larger values than
-all the other warps.
-
-If there are no barrier within the loop, then the last warp will likely get to
-the global variable the first time after all the other warps have each
-incremented it many times. If the barrier properly works, then each warp
-will increment the variable once per time through the loop, and all threads
-will sleep on the barrier waiting for the last warp to finally catch up.
-*/
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <hip/hip_runtime.h>
-#include <hip/hip_cooperative_groups.h>
-#include "test_common.h"
-
-static int cooperative_groups_support(int device_id) {
-  hipError_t err;
-  int cooperative_attribute;
-  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
-           hipDeviceAttributeCooperativeLaunch, device_id));
-  if (!cooperative_attribute) {
-    std::cerr << "Cooperative launch support not available in ";
-    std::cerr << "the device attribute for device " << device_id;
-    std::cerr << std::endl;
-    return 0;
-  }
-
-  hipDeviceProp_t device_properties;
-  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
-  if (device_properties.cooperativeLaunch == 0) {
-    std::cerr << "Cooperative group support not available in ";
-    std::cerr << "device properties." << std::endl;
-    return 0;
-  }
-  return 1;
-}
-
-static int verify_barrier_buffer(unsigned int loops, unsigned int warps,
-                                 unsigned int *host_buffer) {
-  unsigned int max_in_this_loop = 0;
-  for (unsigned int i = 0; i < loops; i++) {
-    max_in_this_loop += warps;
-    for (unsigned int j = 0; j < warps; j++) {
-      if (host_buffer[i*warps+j] > max_in_this_loop) {
-        std::cerr << "Barrier failure!" << std::endl;
-        std::cerr << "    Buffer entry " << i*warps+j;
-        std::cerr << " contains the value " << host_buffer[i*warps+j];
-        std::cerr << " but it should not be more than ";
-        std::cerr << max_in_this_loop << std::endl;
-        return -1;
-      }
-    }
-  }
-  std::cout << "Barriers work properly!" << std::endl;
-  return 0;
-}
-
-__global__ void
-test_kernel(unsigned int *atomic_val, unsigned int *array,
-            unsigned int loops) {
-  cooperative_groups::grid_group grid = cooperative_groups::this_grid();
-  unsigned rank = grid.thread_rank();
-
-  int offset = blockIdx.x;
-  for (int i = 0; i < loops; i++) {
-    // Make the last thread run way behind everyone else.
-    // If the barrier below fails, then the other threads may hit the
-    // atomicInc instruction many times before the last thread ever gets
-    // to it.
-    // As such, without the barrier, the last array entry will eventually
-    // contain a very large value, defined by however many times the other
-    // wavefronts make it through this loop.
-    // If the barrier works, then it will likely contain some number
-    // near "total number of blocks". It will be the last wavefront to
-    // reach the atomicInc, but everyone will have only hit the atomic once.
-    if (rank == (grid.size() - 1)) {
-      long long start_clock = clock64();
-      while (clock64() < (start_clock+1000000)) {}
-    }
-
-    if (threadIdx.x == 0) {
-      array[offset] = atomicInc(&atomic_val[0], UINT_MAX);
-    }
-    grid.sync();
-    offset += gridDim.x;
-  }
-}
-
-int main(int argc, char** argv) {
-  hipError_t err;
-  int device_num;
-  uint32_t loops = 2;
-  uint32_t warps = 10;
-  uint32_t block_size = 1;
-  HIPCHECK(hipGetDeviceCount(&device_num));
-  for (int dev = 0; dev < device_num; ++dev) {
-    std::cout << "Device number: " << dev << std::endl;
-    std::cout << "Loops: " << loops << std::endl;
-    std::cout << "Warps: " << warps << std::endl;
-    std::cout << "Block size: " << block_size << std::endl;
-
-    /*************************************************************************/
-    /* Test whether target device supports cooperative groups ****************/
-    HIPCHECK(hipSetDevice(dev));
-    if (!cooperative_groups_support(dev)) {
-      std::cout << "Skipping the test with Pass result.\n";
-      passed();
-    }
-
-    /*************************************************************************/
-    /* Test whether the requested size will fit on the GPU *******************/
-    int warp_size;
-    int num_sms;
-    int max_blocks_per_sm;
-    hipDeviceProp_t device_properties;
-    HIPCHECK(hipGetDeviceProperties(&device_properties, dev));
-    warp_size = device_properties.warpSize;
-    num_sms = device_properties.multiProcessorCount;
-
-    std::cout << "Device name: " << device_properties.name << std::endl;
-    std::cout << std::endl;
-
-    int num_threads_in_block = block_size * warp_size;
-
-    // Calculate the device occupancy to know how many blocks can be run.
-    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
-             test_kernel, num_threads_in_block, 0));
-
-    int requested_blocks = warps / block_size;
-    if (requested_blocks > max_blocks_per_sm * num_sms) {
-      std::cerr << "Requesting to run " << requested_blocks << " blocks, ";
-      std::cerr << "but we can only guarantee to simultaneously run ";
-      std::cerr << (max_blocks_per_sm * num_sms) << std::endl;
-      failed("");
-    }
-
-    /*************************************************************************/
-    /* Set up data to pass into the kernel ***********************************/
-    // Each block will output a single value per loop.
-    uint32_t total_buffer_len = requested_blocks*loops;
-
-    // Alocate the buffer that will hold the kernel's output, and which will
-    // also be used to globally synchronize during GWS initialization
-    unsigned int *host_buffer = (unsigned int*)calloc(total_buffer_len,
-            sizeof(unsigned int));
-
-    unsigned int *kernel_buffer;
-    HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_buffer),
-                       total_buffer_len * sizeof(unsigned int)));
-    HIPCHECK(hipMemcpy(kernel_buffer, host_buffer,
-                       total_buffer_len * sizeof(unsigned int),
-                       hipMemcpyHostToDevice));
-
-    unsigned int *kernel_atomic;
-    HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_atomic),
-                       sizeof(unsigned int)));
-    HIPCHECK(hipMemset(kernel_atomic, 0, sizeof(unsigned int)));
-
-    /*************************************************************************/
-    /* Launch the kernel *****************************************************/
-    std::cout << "Launching a kernel with " << warps << " warps ";
-    std::cout << "in " << requested_blocks << " thread blocks.";
-    std::cout << std::endl;
-
-    void *params[3];
-    params[0] = reinterpret_cast<void*>(&kernel_atomic);
-    params[1] = reinterpret_cast<void*>(&kernel_buffer);
-    params[2] = reinterpret_cast<void*>(&loops);
-    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
-                                        requested_blocks,
-                                        num_threads_in_block, params, 0, NULL));
-
-    /*************************************************************************/
-    /* Read back the buffer and print out its data****************************/
-    HIPCHECK(hipMemcpy(host_buffer, kernel_buffer,
-                       total_buffer_len * sizeof(unsigned int),
-                       hipMemcpyDeviceToHost));
-
-    for (unsigned int i = 0; i < loops; i++) {
-      for (unsigned int j = 0; j < requested_blocks; j++) {
-        std::cout << "Buffer entry " << (i*warps+j);
-        std::cout << " (written by warp " << j << ")";
-        std::cout << " is " << host_buffer[i * requested_blocks + j];
-        std::cout << std::endl;
-      }
-      std::cout << "==========================\n";
-    }
-    int ret_val = verify_barrier_buffer(loops, requested_blocks, host_buffer);
-    HIPCHECK(hipFree(kernel_buffer));
-    HIPCHECK(hipFree(kernel_atomic));
-    if (ret_val == -1) {
-      failed("");
-    } else {
-      passed();
-    }
-  }
-}
diff --git a/tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp b/tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp
deleted file mode 100644
index ae793cf6a1..0000000000
--- a/tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
-Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
-IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-// Test Description:
-/*The general idea of the application is to launch N warps to each of two GPUs.
-N is a command-line parameter, but the user should set N small enough that all
-warps can be on each of the GPUs at the same time.
-
-All of the warps do a "work loop". Within the work loop, every warp
-atomically increments a global variable that is shared between both fo the
-target GPUs. The value returned from this atomic increment entriely depends
-on the order the warps from the GPUs arrive at the atomic instruction. Each
-warp then stores the result into a global array based on its warp ID.
-
-We also add a sleep/wait loop into the code so that the last warp runs much
-slower than everyone else. As such, it should store much larger values than
-all the other warps.
-
-If there are no barrier within the loop, then warp 0 will likely ge to the
-global variable the first time while all the other warps have each
-incremented it many times. If the barrier properly works, then each warp
-will increment the variable once per time through the loop, and all threads
-will sleep on the barrier waiting for the last warp to finally catch up.
-*/
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -rdc=true -gencode arch=compute_60,code=sm_60
- * TEST: %t
- * HIT_END
- */
-
-#include <hip/hip_runtime.h>
-#include <hip/hip_cooperative_groups.h>
-#include "test_common.h"
-
-static int cooperative_groups_support(int device_id) {
-  hipError_t err;
-  int cooperative_attribute;
-  HIPCHECK(hipDeviceGetAttribute(&cooperative_attribute,
-           hipDeviceAttributeCooperativeLaunch, device_id));
-  if (!cooperative_attribute) {
-    std::cerr << "Cooperative launch support not available in ";
-    std::cerr << "the device attribute for device " << device_id;
-    std::cerr << std::endl;
-    return 0;
-  }
-
-  int multi_gpu_cooperative_attribute;
-  HIPCHECK(hipDeviceGetAttribute(&multi_gpu_cooperative_attribute,
-           hipDeviceAttributeCooperativeMultiDeviceLaunch, device_id));
-  if (!multi_gpu_cooperative_attribute) {
-    std::cerr << "Multi-GPU cooperative launch support not available in ";
-    std::cerr << "the device attribute for device " << device_id;
-    std::cerr << std::endl;
-    return 0;
-  }
-
-  hipDeviceProp_t device_properties;
-  HIPCHECK(hipGetDeviceProperties(&device_properties, device_id));
-  if (device_properties.cooperativeLaunch == 0) {
-    std::cerr << "Cooperative group support not available in ";
-    std::cerr << "device properties." << std::endl;
-    return 0;
-  }
-  if (device_properties.cooperativeMultiDeviceLaunch == 0) {
-    std::cerr << "Multi-GPU cooperative group support not available in ";
-    std::cerr << "device properties." << std::endl;
-    return 0;
-  }
-  return 1;
-}
-
-static int verify_barrier_buffer(unsigned int loops, unsigned int warps,
-                                 unsigned int *host_buffer,
-                                 unsigned int num_devs) {
-  unsigned int max_in_this_loop = 0;
-  for (unsigned int i = 0; i < loops; i++) {
-    max_in_this_loop += (warps * num_devs);
-    for (unsigned int j = 0; j < warps; j++) {
-      if (host_buffer[i*warps+j] > max_in_this_loop) {
-        std::cerr << "Barrier failure!" << std::endl;
-        std::cerr << "    Buffer entry " << i*warps+j;
-        std::cerr << " contains the value " << host_buffer[i*warps+j];
-        std::cerr << " but it should not be more than ";
-        std::cerr << max_in_this_loop << std::endl;
-        return -1;
-      }
-    }
-  }
-  std::cout << "\tBarriers work properly!" << std::endl;
-  return 0;
-}
-
-static int verify_multi_gpu_buffer(unsigned int loops, unsigned int array_val) {
-  unsigned int desired_val = 0;
-  for (int i = 0; i < loops; i++) {
-    if (i % 2 == 0) {
-      desired_val += 2;
-    } else {
-      desired_val *= 2;
-    }
-  }
-  std::cout << "Desired value is " << desired_val << std::endl;
-  if (array_val != desired_val) {
-    std::cerr << "ERROR! Multi-grid barrier does not appear to work.";
-    std::cerr << std::endl;
-    std::cerr << "Expected the multi-GPUs to work together to produce ";
-    std::cerr << "the value " << desired_val << std::endl;
-    std::cerr << "However, the entry returned from the multi-GPU ";
-    std::cerr << "kernel was " << array_val << std::endl;
-    return -1;
-  }
-    std::cout << "\tMulti-GPU barriers appear to work here." << std::endl;
-    return 0;
-}
-
-__global__ void
-test_kernel(unsigned int *atomic_val, unsigned int *global_array,
-            unsigned int *array, uint32_t loops) {
-  cooperative_groups::grid_group grid = cooperative_groups::this_grid();
-  cooperative_groups::multi_grid_group mgrid =
-                      cooperative_groups::this_multi_grid();
-  unsigned rank = grid.thread_rank();
-  unsigned global_rank = mgrid.thread_rank();
-
-  int offset = blockIdx.x;
-  for (int i = 0; i < loops; i++) {
-    // Make the last thread run way behind everyone else.
-    // If the grid barrier below fails, then the other threads may hit the
-    // atomicInc instruction many times before the last thread ever gets
-    // to it.
-    // As such, without the barrier, the last array entry will eventually
-    // contain a very large value, defined by however many times the other
-    // wavefronts make it through this loop.
-    // If the barrier works, then it will likely contain some number
-    // near "total number of blocks". It will be the last wavefront to
-    // reach the atomicInc, but everyone will have only hit the atomic once.
-    if (rank == (grid.size() - 1)) {
-      long long start_clock = clock64();
-      while (clock64() < (start_clock + 1000000)) {}
-    }
-    if (threadIdx.x == 0) {
-      array[offset] = atomicInc(atomic_val, UINT_MAX);
-    }
-    grid.sync();
-
-    // Make the last thread in the entire multi-grid run way behind
-    // everyone else.
-    // If the mgrid barrier below fails, then the two global_array entries
-    // will end up being out of sync, because the intermingling of adds
-    // and multiplies will not be aligned between to the two GPUs.
-    if (global_rank == (mgrid.size() - 1)) {
-      long long start_clock = clock64();
-      while (clock64() < (start_clock + 100000000)) {}
-    }
-    // During even iterations, add into your own array entry
-    // During odd iterations, add into your partner's array entry
-    unsigned grid_rank = mgrid.grid_rank();
-    unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids();
-    if (rank == (grid.size() - 1)) {
-      if (i % mgrid.num_grids() == 0) {
-        global_array[grid_rank] += 2;
-      } else {
-        global_array[inter_gpu_offset] *= 2;
-      }
-    }
-    mgrid.sync();
-    offset += gridDim.x;
-  }
-}
-
-int main(int argc, char** argv) {
-    hipError_t err;
-    int device_num = 0, flag = 0;
-    uint32_t loops = 2;
-    uint32_t warps = 10;
-    uint32_t block_size = 1;
-    HIPCHECK(hipGetDeviceCount(&device_num));
-    if (device_num < 2) {
-      std::cout << "This test needs atleast two gpus but found only";
-      std::cout << device_num << std::endl;
-      std::cout << "Hence skipping the test with pass result\n";
-      passed();
-    }
-
-    for (int d = 0; d < (device_num - 1); ++d) {
-    std::cout << "First device number: " << d << std::endl;
-    std::cout << "Second device number: " << (d + 1) << std::endl;
-    std::cout << "Loops: " << loops << std::endl;
-    std::cout << "Warps: " << warps << std::endl;
-    std::cout << "Block size: " << block_size << std::endl;
-
-    /*************************************************************************/
-    /* Test whether target device supports cooperative groups ****************/
-    for (int i = 0; i < 2; i++) {
-      if (!cooperative_groups_support((d + i))) {
-        std::cout << "Skipping the test with Pass result.\n";
-        passed();
-      }
-    }
-
-    /*************************************************************************/
-    /* Test whether the requested size will fit on the GPU *******************/
-    int warp_sizes[2];
-    int num_sms[2];
-    hipDeviceProp_t device_properties[2];
-    int warp_size = INT_MAX;
-    int num_sm = INT_MAX;
-    for (int i = 0; i < 2; i++) {
-      HIPCHECK(hipGetDeviceProperties(&device_properties[i], (d + i)));
-      warp_sizes[i] = device_properties[i].warpSize;
-      if (warp_sizes[i] < warp_size) {
-        warp_size = warp_sizes[i];
-      }
-      num_sms[i] = device_properties[i].multiProcessorCount;
-      if (num_sms[i] < num_sm) {
-        num_sm = num_sms[i];
-      }
-      std::cout << "Device " << (d + i);
-      std::cout << " name: " << device_properties[i].name << std::endl;
-    }
-    std::cout << std::endl;
-
-    int num_threads_in_block = block_size * warp_size;
-
-    // Calculate the device occupancy to know how many blocks can be run.
-    int max_blocks_per_sm_arr[2];
-    int max_blocks_per_sm = INT_MAX;
-    for (int i = 0; i < 2; i++) {
-      HIPCHECK(hipSetDevice((d + i)));
-      HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-              &max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block,
-              0));
-      if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
-        max_blocks_per_sm = max_blocks_per_sm_arr[i];
-      }
-    }
-
-    int requested_blocks = warps / block_size;
-    if (requested_blocks > max_blocks_per_sm * num_sm) {
-      std::cerr << "Requesting to run " << requested_blocks << " blocks, ";
-      std::cerr << "but we can only guarantee to simultaneously run ";
-      std::cerr << (max_blocks_per_sm * num_sm) << std::endl;
-      failed("");
-    }
-
-    /*************************************************************************/
-    /* Set up data to pass into the kernel ***********************************/
-    // Each block will output a single value per loop.
-    uint32_t total_buffer_len = requested_blocks*loops;
-
-    // Alocate the buffer that will hold the kernel's output, and which will
-    // also be used to globally synchronize during GWS initialization
-    unsigned int *host_buffer[2];
-    unsigned int *kernel_buffer[2];
-    unsigned int *kernel_atomic[2];
-    hipStream_t streams[2];
-    for (int i = 0; i < 2; i++) {
-      host_buffer[i] = (unsigned int*)calloc(total_buffer_len,
-                                             sizeof(unsigned int));
-      HIPCHECK(hipSetDevice((d + i)));
-      HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_buffer[i]),
-               total_buffer_len * sizeof(unsigned int)));
-      HIPCHECK(hipMemcpy(kernel_buffer[i], host_buffer[i],
-               total_buffer_len * sizeof(unsigned int), hipMemcpyHostToDevice));
-      HIPCHECK(hipMalloc(reinterpret_cast<void**>(&kernel_atomic[i]),
-                         sizeof(unsigned int)));
-      HIPCHECK(hipMemset(kernel_atomic[i], 0, sizeof(unsigned int)));
-      HIPCHECK(hipStreamCreate(&streams[i]));
-    }
-
-    // Single kernel atomic shared between both devices; put it on the host
-    unsigned int* global_array;
-    HIPCHECK(hipHostMalloc(reinterpret_cast<void**>(&global_array),
-                           2 * sizeof(unsigned int), 0));
-    HIPCHECK(hipMemset(global_array, 0, 2 * sizeof(unsigned int)));
-
-    /*************************************************************************/
-    /* Launch the kernels ****************************************************/
-    std::cout << "Launching a kernel with " << warps << " warps ";
-    std::cout << "in " << requested_blocks << " thread blocks.";
-    std::cout << std::endl;
-
-    void *dev_params[2][4];
-    hipLaunchParams md_params[2];
-    for (int i = 0; i < 2; i++) {
-      dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
-      dev_params[i][1] = reinterpret_cast<void*>(&global_array);
-      dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
-      dev_params[i][3] = reinterpret_cast<void*>(&loops);
-      md_params[i].func = reinterpret_cast<void*>(test_kernel);
-      md_params[i].gridDim = requested_blocks;
-      md_params[i].blockDim = num_threads_in_block;
-      md_params[i].sharedMem = 0;
-      md_params[i].stream = streams[i];
-      md_params[i].args = dev_params[i];
-    }
-
-    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
-    HIPCHECK(hipDeviceSynchronize());
-
-    /*************************************************************************/
-    /* Read back the buffers and print out its data **************************/
-    for (int dev = 0; dev < 2; dev++) {
-      HIPCHECK(hipMemcpy(host_buffer[d + dev], kernel_buffer[d + dev],
-                         total_buffer_len * sizeof(unsigned int),
-                         hipMemcpyDeviceToHost));
-    }
-
-    for (unsigned int i = 0; i < loops; i++) {
-      for (int dev = 0; dev < 2; dev++) {
-        std::cout << "+++++++++++++++++ Device " << (d + dev);
-        std::cout << "+++++++++++++++++" << std::endl;
-        for (unsigned int j = 0; j < requested_blocks; j++) {
-          std::cout << "Buffer entry " << (i * warps + j);
-          std::cout << " (written by warp " << j << ")";
-          std::cout << " is " << host_buffer[dev][i * requested_blocks + j];
-          std::cout << std::endl;
-        }
-      }
-      std::cout << "==========================\n";
-    }
-    for (unsigned int dev = 0; dev < 2; dev++) {
-      std::cout << "Testing output from device " << (d + dev) << std::endl;
-      int local_ret_val = verify_barrier_buffer(loops, requested_blocks,
-                                                host_buffer[dev], 2);
-      if (local_ret_val == -1) {
-        flag = 1;
-      }
-    }
-
-    std::cout << std::endl << "The multi-GPU shared updates contain:";
-    std::cout << std::endl;
-    for (int i = 0; i < 2; i++) {
-      std::cout << "Entry " << i << ": ";
-      std::cout << global_array[i] << std::endl;
-    }
-    for (int dev = 0; dev < 2; dev++) {
-      std::cout << "Testing multi-GPU output for entry " << (d + dev);
-      std::cout << std::endl;
-      int local_ret_val = verify_multi_gpu_buffer(loops, global_array[dev]);
-      if (local_ret_val) {
-        flag = 1;
-      }
-    }
-    for (int k = 0; k < 2; ++k) {
-      HIPCHECK(hipFree(kernel_buffer[k]));
-      HIPCHECK(hipFree(kernel_atomic[k]));
-      HIPCHECK(hipStreamDestroy(streams[k]));
-      free(host_buffer[k]);
-    }
-  }
-  if (flag == 1) {
-    failed("");
-  } else {
-    passed();
-  }
-}
diff --git a/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp b/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp
index f7a9dac703..874f8bc44c 100644
--- a/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp
+++ b/tests/src/runtimeApi/device/hipDeviceGetPCIBusId.cpp
@@ -1,173 +1,173 @@
-/*
- * Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-/*
- * Test to compare
- * 1.pciBusID from hipDeviceGetPCIBusId and hipDeviceGetAttribute **
- * 2.{pciDomainID, pciBusID, pciDeviceID} values hipDeviceGetPCIBusId vs lspci **
- */
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
- * TEST_NAMED: %t  hipDeviceGetPCIBusId-vs-hipDeviceGetAttribute --tests 0x1
- * TEST_NAMED: %t  hipDeviceGetPCIBusId-vs-lspci --tests 0x2 EXCLUDE_HIP_PLATFORM nvcc
- * HIT_END
- */
-
-#include "test_common.h"
-#define MAX_DEVICE_LENGTH 20
-
-static bool getPciBusId(int deviceCount, char hipDeviceList[][MAX_DEVICE_LENGTH]) {
-  for (int i = 0; i < deviceCount; i++) {
-    HIPCHECK(hipDeviceGetPCIBusId(hipDeviceList[i], MAX_DEVICE_LENGTH, i));
-  }
-  return true;
-}
-
-bool comparePciBusIDWithHipDeviceGetAttribute() {
-  bool testResult = true;
-  int deviceCount = 0;
-  HIPCHECK(hipGetDeviceCount(&deviceCount));
-  HIPASSERT(deviceCount != 0);
-  printf("No.of gpus in the system: %d\n", deviceCount);
-  char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
-  char pciDeviceList[deviceCount][MAX_DEVICE_LENGTH];
-
-  getPciBusId(deviceCount, hipDeviceList);
-
-  for (int i = 0; i < deviceCount; i++) {
-    int pciBusID = -1;
-    int pciDeviceID = -1;
-    int pciDomainID = -1;
-    int tempPciBusId = -1;
-    sscanf(hipDeviceList[i], "%04x:%02x:%02x", &pciDomainID, &pciBusID,
-           &pciDeviceID);
-    HIPCHECK(hipDeviceGetAttribute(&tempPciBusId, hipDeviceAttributePciBusId, i));
-    if (pciBusID != tempPciBusId) {
-      testResult = false;
-      printf("pciBusID from hipDeviceGetPCIBusId mismatched to that from "
-             "hipDeviceGetAttribute for gpu %d\n", i);
-    }
-  }
-
-  printf("pciBusID output of both hipDeviceGetPCIBusId and"
-         " hipDeviceGetAttribute matched for all gpus\n");
-  return testResult;
-}
-
-bool compareHipDeviceGetPCIBusIdWithLspci() {
-  FILE *fpipe;
-  bool testResult = false;
-
-  {
-    // Check if lspci is installed, if not, don't proceed
-    char const *cmd = "lspci --version";
-    char *lspciCheck;
-    char temp[20];
-    fpipe = popen(cmd, "r");
-
-    if (fpipe == nullptr) {
-      printf("Unable to create command file\n");
-      return testResult;
-    }
-
-    lspciCheck = fgets(temp, 20, fpipe);
-    pclose(fpipe);
-
-    if (!lspciCheck) {
-      printf("lspci not found. Skipping the test\n");
-      return true;
-    }
-  }
-
-  int deviceCount = 0;
-  HIPCHECK(hipGetDeviceCount(&deviceCount));
-  HIPASSERT(deviceCount != 0);
-  printf("No.of gpus in the system: %d\n", deviceCount);
-  char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
-  char pciDeviceList[deviceCount][MAX_DEVICE_LENGTH];
-
-  getPciBusId(deviceCount, hipDeviceList);
-
-  // Get lspci device list and compare with hip device list
-#if defined(__CUDA_ARCH__)
-  char const *command = "lspci -D | grep controller | grep NVIDIA | "
-                        "cut -d ' ' -f 1";
-#else
-  char const *command = "lspci -D | grep controller | grep AMD/ATI | "
-                        "cut -d ' ' -f 1";
-#endif
-  fpipe = popen(command, "r");
-
-  if (fpipe == nullptr) {
-    printf("Unable to create command file\n");
-    return testResult;
-  }
-
-  int index = 0;
-  int deviceMatchCount = 0;
-
-  while (fgets(pciDeviceList[index], sizeof(pciDeviceList[index]), fpipe)) {
-    bool bMatchFound = false;
-    for (int deviceNo = 0; deviceNo < deviceCount; deviceNo++) {
-      if (!strncmp(pciDeviceList[index], hipDeviceList[deviceNo], 10)) {
-        deviceMatchCount++;
-        bMatchFound = true;
-      }
-    }
-    if (bMatchFound == false) {
-      printf("PCI device: %s is not reported by HIP\n", pciDeviceList[index]);
-    }
-    index++;
-  }
-
-  pclose(fpipe);
-
-  if (deviceMatchCount == deviceCount) {
-    printf("hip and lspci output for {pciDomainID, pciBusID, pciDeviceID} "
-           "matched for all gpus\n");
-    testResult = true;
-  } else {
-    printf("Mismatch in number GPUs reported by HIP with lscpi\n");
-  }
-  return testResult;
-}
-
-int main(int argc, char* argv[]) {
-  bool testResult = true;
-  HipTest::parseStandardArguments(argc, argv, true);
-
-  if (p_tests & 0x1) {
-    testResult &= comparePciBusIDWithHipDeviceGetAttribute();
-  }
-
-  if (p_tests & 0x2) {
-#ifdef __unix__
-    testResult &= compareHipDeviceGetPCIBusIdWithLspci();
-#else
-    printf("Detected non-linux OS. Skipping the test\n");
-#endif
-  }
-
-  if (testResult) {
-    passed();
-  } else {
-    failed("one or more tests failed\n");
-  }
-}
+/*
+ * Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/*
+ * Test to compare
+ * 1.pciBusID from hipDeviceGetPCIBusId and hipDeviceGetAttribute **
+ * 2.{pciDomainID, pciBusID, pciDeviceID} values hipDeviceGetPCIBusId vs lspci **
+ */
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
+ * TEST_NAMED: %t  hipDeviceGetPCIBusId-vs-hipDeviceGetAttribute --tests 0x1
+ * TEST_NAMED: %t  hipDeviceGetPCIBusId-vs-lspci --tests 0x2 EXCLUDE_HIP_PLATFORM nvcc
+ * HIT_END
+ */
+
+#include "test_common.h"
+#define MAX_DEVICE_LENGTH 20
+
+static bool getPciBusId(int deviceCount, char hipDeviceList[][MAX_DEVICE_LENGTH]) {
+  for (int dev = 0; dev < deviceCount; ++dev) {  // one bus-id string per device
+    HIPCHECK(hipDeviceGetPCIBusId(&hipDeviceList[dev][0], MAX_DEVICE_LENGTH, dev));
+  }
+  return true;  // unconditional: API failures are left to the HIPCHECK macro
+}
+
+// Checks that, for every GPU, the bus number parsed from the string returned by
+// hipDeviceGetPCIBusId equals hipDeviceAttributePciBusId. Returns true on match.
+bool comparePciBusIDWithHipDeviceGetAttribute() {
+  bool testResult = true;
+  int deviceCount = 0;
+  HIPCHECK(hipGetDeviceCount(&deviceCount));
+  HIPASSERT(deviceCount != 0);
+  printf("No.of gpus in the system: %d\n", deviceCount);
+  char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
+  getPciBusId(deviceCount, hipDeviceList);
+
+  for (int i = 0; i < deviceCount; i++) {
+    int pciDomainID = -1, pciBusID = -1, pciDeviceID = -1;
+    int tempPciBusId = -1;
+    // An unparsable string counts as a mismatch instead of comparing the -1 default.
+    bool parsed = sscanf(hipDeviceList[i], "%04x:%02x:%02x", &pciDomainID,
+                         &pciBusID, &pciDeviceID) == 3;
+    HIPCHECK(hipDeviceGetAttribute(&tempPciBusId, hipDeviceAttributePciBusId, i));
+    if (!parsed || pciBusID != tempPciBusId) {
+      testResult = false;
+      printf("pciBusID from hipDeviceGetPCIBusId mismatched to that from "
+             "hipDeviceGetAttribute for gpu %d\n", i);
+    }
+  }
+  if (testResult) {  // success message only when no GPU mismatched
+    printf("pciBusID output of both hipDeviceGetPCIBusId and"
+           " hipDeviceGetAttribute matched for all gpus\n");
+  }
+  return testResult;
+}
+
+// Compares the "domain:bus:device" prefix of each HIP PCI bus-id string with the
+// addresses printed by `lspci -D`. Returns true when the number of matches equals
+// the HIP device count, or when lspci is not available (the test is then skipped).
+bool compareHipDeviceGetPCIBusIdWithLspci() {
+  FILE *fpipe;
+  bool testResult = false;
+
+  {
+    // Check if lspci is installed, if not, don't proceed
+    char const *cmd = "lspci --version";
+    char temp[20];
+    fpipe = popen(cmd, "r");
+    if (fpipe == nullptr) {
+      printf("Unable to create command file\n");
+      return testResult;
+    }
+    char *lspciCheck = fgets(temp, sizeof(temp), fpipe);
+    pclose(fpipe);
+    if (!lspciCheck) {
+      printf("lspci not found. Skipping the test\n");
+      return true;
+    }
+  }
+
+  int deviceCount = 0;
+  HIPCHECK(hipGetDeviceCount(&deviceCount));
+  HIPASSERT(deviceCount != 0);
+  printf("No.of gpus in the system: %d\n", deviceCount);
+  char hipDeviceList[deviceCount][MAX_DEVICE_LENGTH];
+  // A single reusable line buffer: lspci can print more lines than HIP has
+  // devices, which would overrun an array sized by deviceCount.
+  char pciDevice[MAX_DEVICE_LENGTH];
+
+  getPciBusId(deviceCount, hipDeviceList);
+
+  // Get lspci device list and compare with hip device list. The vendor filter is
+  // chosen by HIP platform; __CUDA_ARCH__ is not defined when compiling host code.
+#if defined(__HIP_PLATFORM_NVCC__)
+  char const *command = "lspci -D | grep controller | grep NVIDIA | "
+                        "cut -d ' ' -f 1";
+#else
+  char const *command = "lspci -D | grep controller | grep AMD/ATI | "
+                        "cut -d ' ' -f 1";
+#endif
+  fpipe = popen(command, "r");
+  if (fpipe == nullptr) {
+    printf("Unable to create command file\n");
+    return testResult;
+  }
+
+  int deviceMatchCount = 0;
+  while (fgets(pciDevice, sizeof(pciDevice), fpipe)) {
+    pciDevice[strcspn(pciDevice, "\n")] = '\0';  // fgets keeps the newline
+    bool bMatchFound = false;
+    for (int deviceNo = 0; deviceNo < deviceCount; deviceNo++) {
+      // Only the first 10 characters ("dddd:bb:dd") are compared.
+      if (!strncmp(pciDevice, hipDeviceList[deviceNo], 10)) {
+        deviceMatchCount++;
+        bMatchFound = true;
+      }
+    }
+    if (bMatchFound == false) {
+      printf("PCI device: %s is not reported by HIP\n", pciDevice);
+    }
+  }
+
+  pclose(fpipe);
+
+  if (deviceMatchCount == deviceCount) {
+    printf("hip and lspci output for {pciDomainID, pciBusID, pciDeviceID} "
+           "matched for all gpus\n");
+    testResult = true;
+  } else {
+    printf("Mismatch in number GPUs reported by HIP with lspci\n");
+  }
+  return testResult;
+}
+
+int main(int argc, char* argv[]) {
+  HipTest::parseStandardArguments(argc, argv, true);
+  bool allPassed = true;
+  // --tests bit 0x1: bus-id string vs. hipDeviceGetAttribute.
+  if ((p_tests & 0x1) && !comparePciBusIDWithHipDeviceGetAttribute()) {
+    allPassed = false;
+  }
+  // --tests bit 0x2: bus-id string vs. lspci; only built where __unix__ is defined.
+  if (p_tests & 0x2) {
+#ifdef __unix__
+    if (!compareHipDeviceGetPCIBusIdWithLspci()) allPassed = false;
+#else
+    printf("Detected non-linux OS. Skipping the test\n");
+#endif
+  }
+
+  if (!allPassed) {
+    failed("one or more tests failed\n");
+  } else {
+    passed();
+  }
+}
diff --git a/tests/src/runtimeApi/device/hipSetGetDevice.cpp b/tests/src/runtimeApi/device/hipSetGetDevice.cpp
index 6c703de867..4224c974b3 100644
--- a/tests/src/runtimeApi/device/hipSetGetDevice.cpp
+++ b/tests/src/runtimeApi/device/hipSetGetDevice.cpp
@@ -25,7 +25,7 @@
  */
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
  * TEST_NAMED: %t hipSetGetDevice-invalidDevice
  * TEST_NAMED: %t hipSetGetDevice-allValidDevice
  * TEST_NAMED: %t hipSetGetDevice-validDev1 --computeDevCnt 1
diff --git a/tests/src/runtimeApi/memory/hipIpcMemAccessTest.cpp b/tests/src/runtimeApi/memory/hipIpcMemAccessTest.cpp
deleted file mode 100644
index 00c01ab1cc..0000000000
--- a/tests/src/runtimeApi/memory/hipIpcMemAccessTest.cpp
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
-Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include <sys/types.h>
-#include <sys/mman.h>
-#include <sys/wait.h>
-#include <fcntl.h>
-#include <semaphore.h>
-#include <unistd.h>
-#include "test_common.h"
-
-#ifdef __linux__
-sem_t *sem_ob1 = NULL, *sem_ob2 = NULL;
-typedef struct mem_handle {
-  int device;
-  hipIpcMemHandle_t memHandle;
-  bool IfTestPassed;
-} hip_ipc_t;
-
-class IpcMemHandleTest {
- public:
-  bool InitFlag = true;
-  hip_ipc_t *shrd_mem = NULL;
-  pid_t pid;
-  size_t N = 1024;
-  size_t Nbytes = N * sizeof(int);
-  int *A_d = NULL, out = 0;
-  int *A_h, *C_h;
-  int Num_devices = 0, Data_mismatch, CanAccessPeer = 0;
-  int *Ad1 = NULL, *Ad2 = NULL;
-  IpcMemHandleTest();
-  bool Test();
-  ~IpcMemHandleTest();
-};
-
-
-bool IpcMemHandleTest::Test() {
-  if (InitFlag == false) {
-    // Abort the test if the initialization fails
-    printf("Resource initialization failed. Hence test skipped!");
-    return false;
-  }
-  pid = fork();
-  if (pid != 0) {
-    // Parent process
-    HIPCHECK(hipGetDeviceCount(&Num_devices));
-    for (int i = 0; i < Num_devices; ++i) {
-      if (shrd_mem->IfTestPassed == true) {
-        HIPCHECK(hipSetDevice(i));
-        HIPCHECK(hipMalloc(&A_d, Nbytes));
-        HIPCHECK(hipIpcGetMemHandle((hipIpcMemHandle_t *) &shrd_mem->memHandle,
-                                    A_d));
-        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
-        shrd_mem->device = i;
-        if ((out=sem_post(sem_ob1)) == -1) {
-          // Need to use inline function to release resources.
-          shrd_mem->IfTestPassed = false;
-          failed("sem_post() call failed in parent process.");
-        }
-        if ((out=sem_wait(sem_ob2)) == -1) {
-          shrd_mem->IfTestPassed = false;
-          failed("sem_wait() call failed in parent process.");
-        }
-        HIPCHECK(hipFree(A_d));
-      }
-    }
-  } else {
-    // Child process
-    HIPCHECK(hipGetDeviceCount(&Num_devices));
-    for (int j = 0; j < Num_devices; ++j) {
-      if ((out=sem_wait(sem_ob1)) == -1) {
-        shrd_mem->IfTestPassed = false;
-        printf("sem_wait() call failed in child process.");
-        if ((out=sem_post(sem_ob2)) == -1) {
-          printf("sem_post() call on sem_ob2 failed");
-          exit(1);
-        }
-      }
-      for (int i = 0; i < Num_devices; ++i) {
-        Data_mismatch = 0;
-        HIPCHECK(hipSetDevice(i));
-        HIPCHECK(hipMalloc(&Ad2, Nbytes));
-        HIPCHECK(hipIpcOpenMemHandle((void **) &Ad1, shrd_mem->memHandle,
-                                     hipIpcMemLazyEnablePeerAccess));
-        HIPCHECK(hipDeviceCanAccessPeer(&CanAccessPeer, i, shrd_mem->device));
-        if (CanAccessPeer == 1) {
-          HIPCHECK(hipMemcpy(Ad2, Ad1, Nbytes, hipMemcpyDeviceToDevice));
-          HIPCHECK(hipMemcpy(C_h, Ad2, Nbytes, hipMemcpyDeviceToDevice));
-          for (int i = 0; i < N; ++i) {
-            if (C_h[i] != 123)
-              Data_mismatch++;
-          }
-          if (Data_mismatch != 0) {
-            printf("Data mismatch found when data copied from Ipc memhandle");
-            printf(" to Device: %d\n", i);
-            shrd_mem->IfTestPassed = false;
-          }
-          memset(reinterpret_cast<void*>(C_h), 0, Nbytes);
-          // Checking if the data obtained from Ipc shared memory is consistent
-          HIPCHECK(hipMemcpy(C_h, Ad1, Nbytes, hipMemcpyDeviceToHost));
-          for (int i = 0; i < N; ++i) {
-            if (C_h[i] != 123)
-              Data_mismatch++;
-          }
-          if (Data_mismatch != 0) {
-            printf("Data mismatch found when data copied from Ipc memhandle");
-            printf(" Host.\n");
-            shrd_mem->IfTestPassed = false;
-          }
-        }
-        HIPCHECK(hipIpcCloseMemHandle(reinterpret_cast<void*>(Ad1)));
-      }
-    HIPCHECK(hipFree(Ad2));
-    if ((out=sem_post(sem_ob2)) == -1) {
-      shrd_mem->IfTestPassed = false;
-      printf("sem_post() call on sem_ob2 failed");
-      exit(1);
-    }
-  }
-  exit(0);
-  }
-
-  if ((out = sem_unlink("/my-sem-object1")) == -1) {
-    printf("sem_unlink() call on /my-sem-object1 failed");
-  }
-  if ((out = sem_unlink("/my-sem-object2")) == -1) {
-    printf("sem_unlink() call on /my-sem-object2 failed");
-  }
-  int status;
-  waitpid(pid, &status, 0);
-  if (shrd_mem->IfTestPassed == false) {
-    return false;
-  } else {
-    return true;
-  }
-}
-
-IpcMemHandleTest::IpcMemHandleTest() {
-  std::string cmd_line = "rm -rf /dev/shm/sem.my-sem-object*";
-  int res = system(cmd_line.c_str());
-  if (res == -1) {
-    InitFlag = false;
-    printf("System call to remove existing shared objects failed!");
-  }
-  int out;
-  if ((sem_ob1 = sem_open ("/my-sem-object1", O_CREAT|O_EXCL, 0660, 0)) ==
-      SEM_FAILED) {
-    InitFlag = false;
-    printf("Initialization of 1st semaphore object failed");
-  }
-  if ((sem_ob2 = sem_open ("/my-sem-object2", O_CREAT|O_EXCL, 0660, 0)) ==
-      SEM_FAILED) {
-    InitFlag = false;
-    printf("Initialization of 2nd semaphore object failed");
-  }
-
-  shrd_mem = reinterpret_cast<hip_ipc_t *>(mmap(NULL, sizeof(hip_ipc_t),
-                                                PROT_READ | PROT_WRITE,
-                                                MAP_SHARED | MAP_ANONYMOUS,
-                                                0, 0));
-  if (shrd_mem == NULL) {
-    InitFlag = false;
-    printf("mmap() call failed!");
-  }
-  shrd_mem->IfTestPassed = true;
-  A_h = reinterpret_cast<int*>(malloc(Nbytes));
-  C_h = reinterpret_cast<int*>(malloc(Nbytes));
-  for (size_t i = 0; i < N; i++) {
-    A_h[i] = 123;
-  }
-}
-
-IpcMemHandleTest::~IpcMemHandleTest() {
-  munmap(shrd_mem, sizeof(hip_ipc_t));
-  HIPCHECK(hipFree((A_d)));
-  free(A_h);
-  free(C_h);
-  HIPCHECK(hipFree((Ad1)));
-  HIPCHECK(hipFree((Ad2)));
-}
-#endif
-
-int main() {
-  bool IfTestPassed = true;
-  // The following program spawns a child process and does the following
-  // Parent iterate through each device, create memory -- create hipIpcMemhandle
-  // stores the mem handle in mmaped memory, release the child using sem_post()
-  // and wait for child to release itself(parent process)
-  // child process:
-  // Child process get the ipc mem handle using hipIpcOpenMemHandle
-  // Iterate through all the available gpus and do Device to Device copies
-  // and check for data consistencies and close the hipIpcCloseMemHandle
-  // release the parent and wait for parent to release itself(child)
-#ifdef __linux__
-  IpcMemHandleTest obj;
-  IfTestPassed = obj.Test();
-#else
-  printf("This is not a Linux platform. Hence Skipping the test!\n");
-  IfTestPassed = true;
-#endif
-  if (IfTestPassed == false) {
-    failed("");
-  }
-  passed();
-}
diff --git a/tests/src/runtimeApi/memory/hipMallocConcurrency.cpp b/tests/src/runtimeApi/memory/hipMallocConcurrency.cpp
deleted file mode 100644
index 934c364b6b..0000000000
--- a/tests/src/runtimeApi/memory/hipMallocConcurrency.cpp
+++ /dev/null
@@ -1,487 +0,0 @@
-/*
-Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/**
-Testcase Scenarios :
-
- (TestCase 1)::
- 1) Test hipMalloc() api passing zero size and confirming *ptr returning
- nullptr. Also pass nullptr to hipFree() api.
- 2) Pass maximum value of size_t for hipMalloc() api and make sure appropriate
- error is returned.
- 3) Check for hipMalloc() error code, passing invalid/null pointer.
-
- (TestCase 2)::
- 4) Regress hipMalloc()/hipFree() in loop for bigger chunk of allocation
- with adequate number of iterations and later test for kernel execution on
- default gpu.
- 5) Regress hipMalloc()/hipFree() in loop while allocating smaller chunks
- keeping maximum number of iterations and then run kernel code on default
- gpu, perfom data validation.
-
- (TestCase 3)::
- 6) Check hipMalloc() api adaptability when app creates small chunks of memory
- continuously, stores it for later use and then frees it at later point
- of time.
-
- (TestCase 4)::
- 7) Run hipMalloc() api/kernel code on same gpu parallely from parent and child
- processes, validate the results.
-
- (TestCase 5)::
- 8) Execute hipMalloc() api simultaneously on all the gpus by spawning multiple
- child processes. Validate buffers allocated after running kernel code.
-
- (TestCase 6)::
- 9) Multithread Scenario : Exercise hipMalloc() api parellely on all gpus from
- multiple threads and regress the api.
-
- (TestCases 2, 3, 4, 5, 6)::
- 10) Validate memory usage with hipMemGetInfo() while regressing hipMalloc()
- api. Check for any possible memory leaks.
-*/
-
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
- * TEST_NAMED: %t hipMalloc_ArgValidation  --tests 1
- * TEST_NAMED: %t hipMalloc_LoopRegression_AllocFreeCycle --tests 2
- * TEST_NAMED: %t hipMalloc_LoopRegression_AllocPool --tests 3
- * TEST_NAMED: %t hipMallocChild_Concurrency_DefaultGpu --tests 4
- * TEST_NAMED: %t hipMallocChild_Concurrency_MultiGpu --tests 5
- * TEST_NAMED: %t hipMalloc_MultiThreaded_MultiGpu --tests 6
- * HIT_END
- */
-
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <unistd.h>
-
-#include <iostream>
-#include <vector>
-#include <limits>
-#include <atomic>
-
-#include "test_common.h"
-
-/* Max alloc/free iterations for bigger chunks */
-#define MAX_ALLOCFREE_BC (10000)
-
-/* Buffer size for alloc/free cycles */
-#define BUFF_SIZE_AF (5*1024*1024)
-
-/* Max alloc/free iterations for smaller chunks */
-#define MAX_ALLOCFREE_SC (5000000)
-
-/* Max alloc and pool iterations (TBD) */
-#define MAX_ALLOCPOOL_ITER (2000000)
-
-/**
- * Validates data consitency on supplied gpu
- */
-bool validateMemoryOnGPU(int gpu) {
-  size_t Nbytes = N * sizeof(int);
-  int *A_d, *B_d, *C_d;
-  int *A_h, *B_h, *C_h;
-  size_t prevAvl, prevTot, curAvl, curTot;
-  bool TestPassed = true;
-
-  HIPCHECK(hipSetDevice(gpu));
-  HIPCHECK(hipMemGetInfo(&prevAvl, &prevTot));
-  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
-
-  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
-
-  HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
-  HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-
-  hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock),
-                  0, 0, static_cast<const int*>(A_d),
-                  static_cast<const int*>(B_d), C_d, N);
-
-  HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
-
-  if (!HipTest::checkVectorADD(A_h, B_h, C_h, N)) {
-    printf("Validation PASSED for gpu %d from pid %d\n", gpu, getpid());
-  } else {
-    printf("%s : Validation FAILED for gpu %d from pid %d\n",
-        __func__, gpu, getpid());
-    TestPassed &= false;
-  }
-
-  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
-  HIPCHECK(hipMemGetInfo(&curAvl, &curTot));
-
-  if ((prevAvl != curAvl) || (prevTot != curTot)) {
-    printf("%s : Memory allocation mismatch observed."
-        "Possible memory leak.", __func__);
-    TestPassed &= false;
-  }
-
-  return TestPassed;
-}
-
-/**
- * Fetches Gpu device count
- */
-void getDeviceCount(int *pdevCnt) {
-#ifdef __linux__
-  int fd[2], val = 0;
-  pid_t childpid;
-
-  // create pipe descriptors
-  pipe(fd);
-
-  // disable visible_devices env from shell
-  unsetenv("ROCR_VISIBLE_DEVICES");
-  unsetenv("HIP_VISIBLE_DEVICES");
-
-  childpid = fork();
-
-  if (childpid > 0) {  // Parent
-    close(fd[1]);
-    // parent will wait to read the device cnt
-    read(fd[0], &val, sizeof(val));
-
-    // close the read-descriptor
-    close(fd[0]);
-
-    // wait for child exit
-    wait(NULL);
-
-    *pdevCnt = val;
-  } else if (!childpid) {  // Child
-    int devCnt = 1;
-    // writing only, no need for read-descriptor
-    close(fd[0]);
-
-    HIPCHECK(hipGetDeviceCount(&devCnt));
-    // send the value on the write-descriptor:
-    write(fd[1], &devCnt, sizeof(devCnt));
-
-    // close the write descriptor:
-    close(fd[1]);
-    exit(0);
-  } else {  // failure
-    *pdevCnt = 1;
-    return;
-  }
-
-#else
-  HIPCHECK(hipGetDeviceCount(pdevCnt));
-#endif
-}
-
-/**
- * Regress memory allocation and free in loop
- */
-bool regressAllocInLoop(int gpu) {
-  bool TestPassed = true;
-  size_t tot, avail, ptot, pavail;
-  int i = 0;
-  int *ptr;
-
-  HIPCHECK(hipSetDevice(gpu));
-
-  // Exercise allocation in loop with bigger chunks
-  for (i = 0; i < MAX_ALLOCFREE_BC; i++) {
-    size_t numBytes = BUFF_SIZE_AF;
-
-    HIPCHECK(hipMemGetInfo(&pavail, &ptot));
-    HIPCHECK(hipMalloc(&ptr, numBytes));
-    HIPCHECK(hipMemGetInfo(&avail, &tot));
-
-    if (pavail-avail != numBytes) {
-      printf("LoopAllocation : Memory allocation of %6.2fMB"
-             "not matching with hipMemGetInfo - FAIL\n",
-              numBytes/(1024.0*1024.0));
-      TestPassed &= false;
-      HIPCHECK(hipFree(ptr));
-      break;
-    }
-
-    HIPCHECK(hipFree(ptr));
-  }
-
-  // Exercise allocation in loop with smaller chunks and max iters
-  HIPCHECK(hipMemGetInfo(&pavail, &ptot));
-
-  for (i = 0; i < MAX_ALLOCFREE_SC; i++) {
-    size_t numBytes = 16;
-
-    HIPCHECK(hipMalloc(&ptr, numBytes));
-
-    HIPCHECK(hipFree(ptr));
-  }
-
-  HIPCHECK(hipMemGetInfo(&avail, &tot));
-
-  if ((pavail != avail) || (ptot != tot)) {
-    printf("LoopAllocation : Memory allocation mismatch observed."
-        "Possible memory leak.");
-    TestPassed &= false;
-  }
-
-  return TestPassed;
-}
-
-/*
- * Thread func to regress alloc and check data consistency
- */
-
-std::atomic<bool> g_thTestPassed(true);
-
-void threadFunc(int gpu) {
-  g_thTestPassed = g_thTestPassed & regressAllocInLoop(gpu);
-  g_thTestPassed = g_thTestPassed & validateMemoryOnGPU(gpu);
-
-  printf("thread execution status on gpu(%d) : %d\n", gpu, g_thTestPassed.load());
-}
-
-int main(int argc, char* argv[]) {
-  HipTest::parseStandardArguments(argc, argv, true);
-
-  if (p_tests == 1) {  // Arg validation
-    // Test hipMalloc for zero size
-    bool TestPassed = true;
-    int *ptr;
-
-    HIPCHECK(hipMalloc(&ptr, 0));
-
-    // ptr expected to be reset to null ptr
-    if (ptr) {
-      printf("ArgValidation : Failed in zero size test\n");
-      TestPassed &= false;
-    }
-
-    // Free null ptr
-    HIPCHECK(hipFree(ptr));
-
-    // Test hipMalloc for invalid arguments
-    hipError_t ret;
-
-    if ((ret = hipMalloc(NULL, 100)) != hipErrorInvalidValue) {
-      printf("ArgValidation : Inappropritate error value returned"
-          " for invalid argument. Error: '%s'(%d)\n",
-          hipGetErrorString(ret), ret);
-      TestPassed &= false;
-    }
-
-    // Test hipMalloc for Maximum value of size_t
-    if ((ret = hipMalloc(&ptr, std::numeric_limits<std::size_t>::max()))
-        != hipErrorMemoryAllocation) {
-      printf("ArgValidation : Invalid error returned for max size_t."
-          " Error: '%s'(%d)\n", hipGetErrorString(ret), ret);
-      TestPassed &= false;
-    }
-
-    if (TestPassed) {
-      passed();
-    } else {
-      failed("hipMalloc ArgumentValidation Failure!");
-    }
-
-  } else if (p_tests == 2) {  // Loop Regression Alloc/Free Cycle
-    bool TestPassed = true;
-
-    TestPassed &= regressAllocInLoop(0);
-    TestPassed &= validateMemoryOnGPU(0);
-
-    if (TestPassed) {
-      passed();
-    } else {
-      failed("hipMalloc_LoopRegression_AllocFreeCycle Failure!");
-    }
-
-  } else if (p_tests == 3) {  // Loop Regression Alloc and Pool
-    size_t avail, tot, pavail, ptot;
-    bool TestPassed = true;
-    hipError_t err;
-    int *ptr;
-
-    std::vector<int *> ptrlist;
-
-    HIPCHECK(hipMemGetInfo(&pavail, &ptot));
-
-    // Allocate small chunks of memory million times
-    for (int i = 0; i < MAX_ALLOCPOOL_ITER; i++) {  // Iterations TBD
-      if ((err = hipMalloc(&ptr, 10)) != hipSuccess) {
-        HIPCHECK(hipMemGetInfo(&avail, &tot));
-
-        printf("Loop regression pool allocation failure. "
-        "Total gpu memory : %6.2fMB, Free memory %6.2fMB iter %d error '%s'\n",
-        tot/(1024.0*1024.0), avail/(1024.0*1024.0), i, hipGetErrorString(err));
-
-        TestPassed &= false;
-        break;
-      }
-
-      // Store pointers allocated to emulate memory pool of app
-      ptrlist.push_back(ptr);
-    }
-
-    // Free ptrs at later point of time
-    for ( auto &t : ptrlist ) {
-      HIPCHECK(hipFree(t));
-    }
-
-    HIPCHECK(hipMemGetInfo(&avail, &tot));
-
-    TestPassed &= validateMemoryOnGPU(0);
-
-    if ((pavail != avail) || (ptot != tot)) {
-      printf("%s : Memory allocation mismatch observed. Possible memory leak.",
-          __func__);
-      TestPassed &= false;
-    }
-
-    if (TestPassed) {
-      passed();
-    } else {
-      failed("hipMalloc_LoopRegression_AllocPool failure!");
-    }
-
-  } else if (p_tests == 4) {
-    bool TestPassed = true;
-
-#ifdef __linux__
-    // Parallel execution of parent and child on gpu0
-    int pid;
-
-    if ((pid = fork()) < 0) {
-      printf("Child_Concurrency_Gpu0 : fork() returned error %d.", pid);
-      TestPassed &= false;
-
-    } else if (!pid) {   // Child process
-      bool TestPassedChild = true;
-
-      TestPassedChild = validateMemoryOnGPU(0);
-
-      if (TestPassedChild) {
-        exit(0);  // child exit with success status
-      } else {
-        printf("Child_Concurrency_Gpu0 : childpid %d failed\n", getpid());
-        exit(1);  // child exit with failure status
-      }
-
-    } else {  // Parent process
-      int exitStatus;
-      TestPassed = validateMemoryOnGPU(0);
-
-      pid = wait(&exitStatus);
-      if ( WEXITSTATUS(exitStatus) || ( pid < 0 ) )
-        TestPassed &= false;
-    }
-#else
-    printf("Test hipMallocChild_Concurrency_DefaultGpu skipped on non-linux\n");
-#endif
-
-    // TC scenarios specific to linux
-    // are treated as pass in windows.
-    if (TestPassed) {
-      passed();
-    } else {
-      failed("hipMallocChild_Concurrency_DefaultGpu Failed!");
-    }
-
-  } else if (p_tests == 5) {
-    bool TestPassed = true;
-#ifdef __linux__
-    // Parallel execution on multiple gpus from different child processes
-    int devCnt = 1, pid = 0, cumStatus = 0;
-
-    // Get GPU count
-    getDeviceCount(&devCnt);
-
-    // Spawn child for each GPU
-    for (int gpu = 0; gpu < devCnt; gpu++) {
-      if ((pid = fork()) < 0) {
-         printf("Child_Concurrency_MultiGpu : fork() returned error %d\n", pid);
-         failed("Test Failed!");
-
-      } else if (!pid) {  // Child process
-         bool TestPassedChild = true;
-         TestPassedChild = validateMemoryOnGPU(gpu);
-
-         if (TestPassedChild) {
-            exit(0);  // child exit with success status
-         } else {
-            printf("Child_Concurrency_MultiGpu : childpid %d failed\n",
-                getpid());
-            exit(1);  // child exit with failure status
-         }
-      }
-    }
-
-    // Parent shall wait for child to complete
-    for (int i = 0; i < devCnt; i++) {
-      int pidwait = 0, exitStatus;
-      pidwait = wait(&exitStatus);
-
-      if (pidwait < 0) {
-        TestPassed &= false;
-        break;
-      }
-
-      cumStatus |= WEXITSTATUS(exitStatus);
-    }
-
-    // Cummulative status of all child
-    if (cumStatus) {
-       TestPassed &= false;
-    }
-
-#else
-    printf("Test hipMallocChild_Concurrency_MultiGpu skipped on non-linux\n");
-#endif
-
-
-    // TC scenarios specific to linux
-    // are treated as pass in windows.
-    if (TestPassed) {
-      passed();
-    } else {
-      failed("hipMallocChild_Concurrency_MultiGpu Failed!");
-    }
-
-  } else if (p_tests == 6) {  // Multithreaded multiple gpu execution
-    std::vector<std::thread> threadlist;
-    int devCnt = 1;
-
-    // Get GPU count
-    getDeviceCount(&devCnt);
-
-
-    for (int i = 0; i < devCnt; i++) {
-      threadlist.push_back(std::thread(threadFunc, i));
-    }
-
-    for (auto &t : threadlist) {
-      t.join();
-    }
-
-    if (g_thTestPassed) {
-      passed();
-    } else {
-      failed("hipMalloc_MultiThreaded_MultiGpu Failed!");
-    }
-  } else {
-    failed("Didnt receive any valid option. Try options 1 to 6\n");
-  }
-}
-
diff --git a/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp b/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp
deleted file mode 100644
index 25820e2305..0000000000
--- a/tests/src/runtimeApi/memory/hipMallocManaged_MultiScenario.cpp
+++ /dev/null
@@ -1,423 +0,0 @@
-/*
-Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/* Test 6 is disabled */
-/* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
- * TEST_NAMED: %t hipMallocManaged1 --tests 1
- * TEST_NAMED: %t hipMallocManaged2 --tests 2
- * TEST_NAMED: %t hipMallocManagedNegativeTests --tests 3
- * TEST_NAMED: %t hipMallocManagedMultiChunkSingleDevice --tests 4
- * TEST_NAMED: %t hipMallocManagedMultiChunkMultiDevice --tests 5 EXCLUDE_HIP_PLATFORM nvcc
- * TEST_NAMED: %t hipMallocManagedOversubscription --tests 6 EXCLUDE_HIP_PLATFORM rocclr nvcc
- * HIT_END
- */
-
-#include <atomic>
-#include "test_common.h"
-#define N 1048576  // equals to (1024*1024)
-#define INIT_VAL 123
-
-/*
- * Kernel function to perform addition operation.
- */
-template <typename T>
-__global__ void
-vector_sum(T *Ad1, T *Ad2, size_t NUM_ELMTS) {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x;
-
-    for (size_t i = offset; i < NUM_ELMTS; i += stride) {
-        Ad2[i] = Ad1[i] + Ad1[i];
-    }
-}
-
-// The following Test case tests the following scenario:
-// A large chunk of hipMallocManaged() memory(Hmm) is created
-// Equal parts of Hmm is accessed on available gpus and
-// kernel is launched on acessed chunk of hmm memory
-// and checks if there are any inconsistencies or access issues
-bool MultiChunkMultiDevice(int NumDevices) {
-  std::atomic<int> DataMismatch{0};
-  bool IfTestPassed = true;
-  int Counter = 0;
-  unsigned int NUM_ELMS = (1024 * 1024);
-  float *Ad[NumDevices], *Hmm = NULL, *Ah = new float[NUM_ELMS];
-  hipStream_t stream[NumDevices];
-  for (int Oloop = 0; Oloop < NumDevices; ++Oloop) {
-    HIPCHECK(hipSetDevice(Oloop));
-    HIPCHECK(hipMalloc(&Ad[Oloop], NUM_ELMS * sizeof(float)));
-    HIPCHECK(hipMemset(Ad[Oloop], 0, NUM_ELMS * sizeof(float)));
-    HIPCHECK(hipStreamCreate(&stream[Oloop]));
-  }
-  HIPCHECK(hipMallocManaged(&Hmm, (NumDevices * NUM_ELMS * sizeof(float))));
-  for (int i = 0; i < NumDevices; ++i) {
-    for (; Counter < ((i + 1) * NUM_ELMS); ++Counter) {
-      Hmm[Counter] = INIT_VAL + i;
-    }
-  }
-  const unsigned threadsPerBlock = 256;
-  const unsigned blocks = (NUM_ELMS + 255)/256;
-  for (int Klaunch = 0; Klaunch < NumDevices; ++Klaunch) {
-    vector_sum<float> <<<blocks, threadsPerBlock, 0, stream[Klaunch]>>>
-                      (&Hmm[Klaunch * NUM_ELMS], Ad[Klaunch], NUM_ELMS);
-  }
-  HIPCHECK(hipDeviceSynchronize());
-  for (int m = 0; m < NumDevices; ++m) {
-    HIPCHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float),
-                       hipMemcpyDeviceToHost));
-    for (int n = 0; n < NUM_ELMS; ++n) {
-      if (Ah[n] != ((INIT_VAL + m) * 2)) {
-        DataMismatch++;
-      }
-    }
-    memset(reinterpret_cast<void*>(Ah), 0, NUM_ELMS * sizeof(float));
-  }
-  if (DataMismatch.load() != 0) {
-    printf("MultiChunkMultiDevice: Mismatch observed!\n");
-    IfTestPassed = false;
-  }
-  for (int i = 0; i < NumDevices; ++i) {
-    HIPCHECK(hipFree(Ad[i]));
-    HIPCHECK(hipStreamDestroy(stream[i]));
-  }
-  HIPCHECK(hipFree(Hmm));
-  free(Ah);
-  return IfTestPassed;
-}
-
-// The following Test case tests the following scenario:
-// A large chunk of hipMallocManaged() memory(Hmm) is created
-// Equal parts of Hmm is accessed and
-// kernel is launched on acessed chunk of hmm memory
-// and checks if there are any inconsistencies or access issues
-
-bool MultiChunkSingleDevice(int NumDevices) {
-  std::atomic<int> DataMismatch{0};
-  int Chunks = 4, Counter = 0;
-  bool IfTestPassed = true;
-  unsigned int NUM_ELMS = (1024 * 1024);
-  float *Ad[Chunks], *Hmm = NULL, *Ah = new float[NUM_ELMS];
-  hipStream_t stream[Chunks];
-  for (int i = 0; i < Chunks; ++i) {
-    HIPCHECK(hipMalloc(&Ad[i], NUM_ELMS * sizeof(float)));
-    HIPCHECK(hipMemset(Ad[i], 0, NUM_ELMS * sizeof(float)));
-    HIPCHECK(hipStreamCreate(&stream[i]));
-  }
-  HIPCHECK(hipMallocManaged(&Hmm, (Chunks * NUM_ELMS * sizeof(float))));
-  for (int i = 0; i < Chunks; ++i) {
-    for (; Counter < ((i + 1) * NUM_ELMS); ++Counter) {
-      Hmm[Counter] = (INIT_VAL + i);
-    }
-  }
-  const unsigned threadsPerBlock = 256;
-  const unsigned blocks = (NUM_ELMS + 255)/256;
-  for (int k = 0; k < Chunks; ++k) {
-    vector_sum<float> <<<blocks, threadsPerBlock, 0, stream[k]>>>
-                      (&Hmm[k * NUM_ELMS], Ad[k], NUM_ELMS);
-  }
-  HIPCHECK(hipDeviceSynchronize());
-  for (int m = 0; m < Chunks; ++m) {
-    HIPCHECK(hipMemcpy(Ah, Ad[m], NUM_ELMS * sizeof(float),
-                       hipMemcpyDeviceToHost));
-    for (int n = 0; n < NUM_ELMS; ++n) {
-      if (Ah[n] != ((INIT_VAL + m) * 2)) {
-        DataMismatch++;
-      }
-    }
-  }
-  if (DataMismatch.load() != 0) {
-    printf("MultiChunkSingleDevice: Mismatch observed!\n");
-    IfTestPassed = false;
-  }
-  for (int i = 0; i < Chunks; ++i) {
-    HIPCHECK(hipFree(Ad[i]));
-    HIPCHECK(hipStreamDestroy(stream[i]));
-  }
-  HIPCHECK(hipFree(Hmm));
-  free(Ah);
-  return IfTestPassed;
-}
-
-// The following tests oversubscription hipMallocManaged() api
-// Currently disabled.
-bool TestOversubscriptionMallocManaged(int NumDevices) {
-  bool IfTestPassed = true;
-  hipError_t err;
-  void *A = NULL;
-  size_t total = 0, free = 0;
-  HIPCHECK(hipMemGetInfo(&free, &total));
-  // ToDo: In case of HMM, memory over-subscription is allowed.  Hence, relook
-  // into how out of memory can be tested.
-  // Demanding more mem size than available
-  err = hipMallocManaged(&A, (free +1), hipMemAttachGlobal);
-  if (hipErrorOutOfMemory != err) {
-    printf("hipMallocManaged: Returned %s for size value > device memory\n",
-           hipGetErrorString(err));
-    IfTestPassed = false;
-  }
-
-  return IfTestPassed;
-}
-
-// The following test does negative testing of hipMallocManaged() api
-// by passing invalid values and check if the behavior is as expected
-bool NegativeTestsMallocManaged(int NumDevices) {
-  bool IfTestPassed = true;
-  hipError_t err;
-  void *A = NULL;
-  size_t total = 0, free = 0;
-  HIPCHECK(hipMemGetInfo(&free, &total));
-
-  err = hipMallocManaged(NULL, 1024, hipMemAttachGlobal);
-  if (hipErrorInvalidValue != err) {
-    printf("hipMallocManaged: Returned %s when devPtr is null\n",
-           hipGetErrorString(err));
-    IfTestPassed = false;
-  }
-
-  err = hipMallocManaged(&A, 0, hipMemAttachGlobal);
-  if (hipErrorInvalidValue != err) {
-    printf("hipMallocManaged: Returned %s when size is 0\n",
-           hipGetErrorString(err));
-    IfTestPassed = false;
-  }
-
-  err = hipMallocManaged(NULL, 0, hipMemAttachGlobal);
-  if (hipErrorInvalidValue != err) {
-    printf("hipMallocManaged: Returned %s when devPtr & size is null & 0\n",
-           hipGetErrorString(err));
-    IfTestPassed = false;
-  }
-
-#ifdef __HIP_PLATFORM_HCC__
-  // The flag hipMemAttachHost is currently not supported therefore
-  // api should return "hipErrorInvalidValue" for now
-  err = hipMallocManaged(&A, 1024, hipMemAttachHost);
-  if (hipErrorInvalidValue != err) {
-    printf("hipMallocManaged: Returned %s for 'hipMemAttachHost' flag\n",
-           hipGetErrorString(err));
-    IfTestPassed = false;
-  }
-#endif  // __HIP_PLATFORM_HCC__
-
-  err = hipMallocManaged(NULL, 0, 0);
-  if (hipErrorInvalidValue != err) {
-    printf("hipMallocManaged: Returned %s when params are null, 0, 0\n",
-           hipGetErrorString(err));
-    IfTestPassed = false;
-  }
-
-  err = hipMallocManaged(&A, 1024, 145);
-  if (hipErrorInvalidValue != err) {
-    printf("hipMallocManaged: Returned %s when flag param is numerical 145\n",
-           hipGetErrorString(err));
-    IfTestPassed = false;
-  }
-
-  err = hipMallocManaged(&A, -10, hipMemAttachGlobal);
-  if (hipErrorOutOfMemory != err) {
-    printf("hipMallocManaged: Returned %s for negative size value.\n",
-           hipGetErrorString(err));
-    IfTestPassed = false;
-  }
-
-  return IfTestPassed;
-}
-
-
-// Allocate two pointers using hipMallocManaged(), initialize,
-// then launch kernel using these pointers directly and
-// later validate the content without using any Memcpy.
-template <typename T>
-bool TestMallocManaged2(int NumDevices) {
-  bool IfTestPassed = true;
-  T *Hmm1 = NULL, *Hmm2 = NULL;
-
-  for (int i = 0; i < NumDevices; ++i) {
-    HIPCHECK(hipSetDevice(i));
-    std::atomic<int> DataMismatch{0};
-    HIPCHECK(hipMallocManaged(&Hmm1, N * sizeof(T)));
-    HIPCHECK(hipMallocManaged(&Hmm2, N * sizeof(T)));
-    for (int m = 0; m < N; ++m) {
-      Hmm1[m] = m;
-      Hmm2[m] = 0;
-    }
-    const unsigned threadsPerBlock = 256;
-    const unsigned blocks = (N + 255)/256;
-    // Kernel launch
-    vector_sum <<<blocks, threadsPerBlock>>> (Hmm1, Hmm2, N);
-    HIPCHECK(hipDeviceSynchronize());
-    for (int v = 0; v < N; ++v) {
-      if (Hmm2[v] != (v + v)) {
-        DataMismatch++;
-      }
-    }
-    if (DataMismatch.load() != 0) {
-      IfTestPassed = false;
-    }
-    HIPCHECK(hipFree(Hmm1));
-    HIPCHECK(hipFree(Hmm2));
-  }
-  return IfTestPassed;
-}
-
-// In the following test, a memory is created using hipMallocManaged() by
-// setting a device and verified if it is accessible when the context is set
-// to all other devices. This include verification and Device two Device
-// transfers and kernel launch o discover if there any access issues.
-
-template <typename T>
-bool TestMallocManaged1(int NumDevices) {
-  std::atomic<unsigned int> DataMismatch;
-  bool TestPassed = true;
-  T *Ah1 = new T[N], *Ah2 = new T[N], *Ad = NULL, *Hmm = NULL;
-
-  for (int i =0; i < N; ++i) {
-    Ah1[i] = INIT_VAL;
-    Ah2[i] = 0;
-  }
-  for (int Oloop = 0; Oloop < NumDevices; ++Oloop) {
-    DataMismatch = 0;
-    HIPCHECK(hipSetDevice(Oloop));
-    HIPCHECK(hipMallocManaged(&Hmm, N * sizeof(T)));
-    for (int Iloop = 0; Iloop < NumDevices; ++Iloop) {
-      HIPCHECK(hipSetDevice(Iloop));
-      HIPCHECK(hipMalloc(&Ad, N * sizeof(T)));
-      // Copy data from host to hipMallocMananged memory and verify
-      HIPCHECK(hipMemcpy(Hmm, Ah1, N * sizeof(T), hipMemcpyHostToDevice));
-      for (int v = 0; v < N; ++v) {
-        if (Hmm[v] != INIT_VAL) {
-          DataMismatch++;
-        }
-      }
-      if (DataMismatch.load() != 0) {
-        printf("Mismatch is observed with host data at device %d", Iloop);
-        printf(" while hipMallocManaged memory set to the device %d\n", Oloop);
-        TestPassed = false;
-        DataMismatch = 0;
-      }
-      // Executing D2D transfer with hipMallocManaged memory and verify
-      HIPCHECK(hipMemcpy(Ad, Hmm, N * sizeof(T), hipMemcpyDeviceToDevice));
-      HIPCHECK(hipMemcpy(Ah2, Ad, N * sizeof(T), hipMemcpyDeviceToHost));
-      for (int k = 0; k < N; ++k) {
-        if (Ah2[k] != INIT_VAL) {
-          DataMismatch++;
-        }
-      }
-      if (DataMismatch.load() != 0) {
-        printf("Mismatch is observed with D2D transfer at device %d\n", Iloop);
-        printf(" while hipMallocManaged memory set to the device %d\n", Oloop);
-        TestPassed = false;
-        DataMismatch = 0;
-      }
-      HIPCHECK(hipMemset(Ad, 0, N * sizeof(T)));
-      const unsigned threadsPerBlock = 256;
-      const unsigned blocks = (N + 255)/256;
-      // Launching the kernel to check if there is any access issue with
-      // hipMallocManaged memory and local device's memory
-      vector_sum <<<blocks, threadsPerBlock>>> (Hmm, Ad, N);
-      hipDeviceSynchronize();
-      HIPCHECK(hipMemcpy(Ah2, Ad, N * sizeof(T), hipMemcpyDeviceToHost));
-      for (int m = 0; m < N; ++m) {
-        if (Ah2[m] != 246) {
-          DataMismatch++;
-        }
-      }
-      if (DataMismatch.load() != 0) {
-        printf("Data Mismatch observed after kernel lch device %d\n", Iloop);
-        TestPassed = false;
-        DataMismatch = 0;
-      }
-      HIPCHECK(hipFree(Ad));
-    }
-    HIPCHECK(hipFree(Hmm));
-  }
-  free(Ah1);
-  free(Ah2);
-  return TestPassed;
-}
-
-int main(int argc, char* argv[]) {
-  HipTest::parseStandardArguments(argc, argv, true);
-
-  if ((p_tests <= 0) || (p_tests > 5)) {
-    failed("Valid arguments are from 1 to 5");
-  }
-
-  int NumDevices = 0;
-  HIPCHECK(hipGetDeviceCount(&NumDevices));
-  bool TestStatus = true, OverAllStatus = true;
-  if (p_tests == 1) {
-    TestStatus = TestMallocManaged1<float>(NumDevices);
-    if (!TestStatus) {
-      printf("Test Failed with float datatype.\n");
-      OverAllStatus = false;
-    }
-    TestStatus = TestMallocManaged1<int>(NumDevices);
-    if (!TestStatus) {
-      printf("Test Failed with int datatype.\n");
-      OverAllStatus = false;
-    }
-    TestStatus = TestMallocManaged1<unsigned char>(NumDevices);
-    if (!TestStatus) {
-      printf("Test Failed with unsigned char datatype.\n");
-      OverAllStatus = false;
-    }
-    TestStatus = TestMallocManaged1<double>(NumDevices);
-    if (!TestStatus) {
-      printf("Test Failed with double datatype.\n");
-      OverAllStatus = false;
-    }
-    if (!OverAllStatus) {
-      failed("");
-    }
-  }
-  if (p_tests == 2) {
-    TestStatus = TestMallocManaged2<float>(NumDevices);
-    if (!TestStatus) {
-      failed("Test Failed with float datatype.");
-    }
-  }
-  if (p_tests == 3) {
-    TestStatus = NegativeTestsMallocManaged(NumDevices);
-    if (!TestStatus) {
-      failed("Negative Tests with hipMallocManaged() failed!.");
-    }
-  }
-  if (p_tests == 4) {
-    TestStatus = MultiChunkSingleDevice(NumDevices);
-    if (!TestStatus) {
-      failed("hipMallocManaged: MultiChunkSingleDevice test failed!");
-    }
-  }
-  if (p_tests == 5) {
-    TestStatus = MultiChunkMultiDevice(NumDevices);
-    if (!TestStatus) {
-      failed("hipMallocManaged: MultiChunkMultiDevice test failed!");
-    }
-  }
-  if (p_tests == 6) {
-    TestStatus = TestOversubscriptionMallocManaged(NumDevices);
-    if (!TestStatus) {
-      failed("hipMallocManaged: TestOversubscriptionMallocManaged failed!");
-    }
-  }
-  passed();
-}
diff --git a/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp b/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp
index b3dc32810b..7e65c47244 100755
--- a/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp
+++ b/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp
@@ -75,9 +75,6 @@ int main() {
         HIPCHECK(hipFree(Z_d));
       } else {
         std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
-        if (hip_skip_tests_enabled()) {
-          return hip_skip_retcode();
-        }
       }
     }
 
diff --git a/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp b/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp
index 0b3724b0cc..16a8f6df41 100755
--- a/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp
+++ b/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp
@@ -81,9 +81,6 @@ int main() {
         HIPCHECK(hipFree(Z_d));
       } else {
         std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
-        if (hip_skip_tests_enabled()) {
-          return hip_skip_retcode();
-        }
       }
     }
 
diff --git a/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp b/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp
index 146da0b0e8..e1299e37d0 100755
--- a/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp
+++ b/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp
@@ -77,9 +77,6 @@ int main() {
         HIPCHECK(hipFree(Z_d));
       } else {
         std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
-        if (hip_skip_tests_enabled()) {
-          return hip_skip_retcode();
-        }
       }
     }
     passed();
diff --git a/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp b/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp
index 3aa6451975..8b3254e6fd 100755
--- a/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp
+++ b/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp
@@ -83,9 +83,6 @@ int main() {
         HIPCHECK(hipFree(Z_d));
       } else {
         std::cout<<"Machine does not seem to have P2P Capabilities, Empty Pass"<<std::endl;
-        if (hip_skip_tests_enabled()) {
-          return hip_skip_retcode();
-        }
       }
     }
 
diff --git a/tests/src/runtimeApi/memory/hipMemcpyWithStream.cpp b/tests/src/runtimeApi/memory/hipMemcpyWithStream.cpp
index 30d4191b41..4466ef92bd 100755
--- a/tests/src/runtimeApi/memory/hipMemcpyWithStream.cpp
+++ b/tests/src/runtimeApi/memory/hipMemcpyWithStream.cpp
@@ -24,7 +24,7 @@ THE SOFTWARE.
  */
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../test_common.cpp
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/src/runtimeApi/memory/hipMemcpyWithStreamMultiThread.cpp b/tests/src/runtimeApi/memory/hipMemcpyWithStreamMultiThread.cpp
index 87e52b93ff..b3d613c45f 100644
--- a/tests/src/runtimeApi/memory/hipMemcpyWithStreamMultiThread.cpp
+++ b/tests/src/runtimeApi/memory/hipMemcpyWithStreamMultiThread.cpp
@@ -24,7 +24,7 @@ THE SOFTWARE.
  */
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../test_common.cpp
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/src/runtimeApi/memory/hipMemset2D.cpp b/tests/src/runtimeApi/memory/hipMemset2D.cpp
index 6515fa36e1..0e54c4ec67 100644
--- a/tests/src/runtimeApi/memory/hipMemset2D.cpp
+++ b/tests/src/runtimeApi/memory/hipMemset2D.cpp
@@ -20,7 +20,7 @@
 // Test for hipMemset2D functionality for different width and height values
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
  * TEST_NAMED: %t hipMemset2D-basic
  * TEST_NAMED: %t hipMemset2D-dim1 --width2D 10 --height2D 10 --memsetWidth 4 --memsetHeight 4
  * TEST_NAMED: %t hipMemset2D-dim2 --width2D 100 --height2D 100 --memsetWidth 20 --memsetHeight 40
diff --git a/tests/src/runtimeApi/memory/hipMultiMemcpyMultiThrdMultiStrm.cpp b/tests/src/runtimeApi/memory/hipMultiMemcpyMultiThrdMultiStrm.cpp
index 01d96e9cd8..997cf3b651 100644
--- a/tests/src/runtimeApi/memory/hipMultiMemcpyMultiThrdMultiStrm.cpp
+++ b/tests/src/runtimeApi/memory/hipMultiMemcpyMultiThrdMultiStrm.cpp
@@ -21,7 +21,7 @@
 // and also launch hipMemcpyAsync() api on the same stream. This test case is simulate the scenario
 // reported in SWDEV-181598.
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/src/runtimeApi/memory/hipMultiMemcpyMultiThread.cpp b/tests/src/runtimeApi/memory/hipMultiMemcpyMultiThread.cpp
index 881afef9fb..d09cd92274 100644
--- a/tests/src/runtimeApi/memory/hipMultiMemcpyMultiThread.cpp
+++ b/tests/src/runtimeApi/memory/hipMultiMemcpyMultiThread.cpp
@@ -21,7 +21,7 @@
 // and also launch hipMemcpyAsync() api. This test case is simulate the scenario
 // reported in SWDEV-181598.
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/src/runtimeApi/module/hipFuncSetSharedMemConfig.cpp b/tests/src/runtimeApi/module/hipFuncSetSharedMemConfig.cpp
deleted file mode 100644
index 3f04563c71..0000000000
--- a/tests/src/runtimeApi/module/hipFuncSetSharedMemConfig.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip/hip_runtime_api.h>
-#include "test_common.h"
-
-int main() {
-    hipSharedMemConfig_t config;
-    HIP_PRINT_STATUS(hipFuncSetSharedMemConfig(NULL));
-    HIP_PRINT_STATUS(hipFuncSetSharedMemConfig(&config));
-}
diff --git a/tests/src/runtimeApi/cooperativeGrps/hipLaunchCoopMultiKernel.cpp b/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp
similarity index 98%
rename from tests/src/runtimeApi/cooperativeGrps/hipLaunchCoopMultiKernel.cpp
rename to tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp
index 0e523f9d2e..8e67044eb0 100644
--- a/tests/src/runtimeApi/cooperativeGrps/hipLaunchCoopMultiKernel.cpp
+++ b/tests/src/runtimeApi/module/hipLaunchCoopMultiKernel.cpp
@@ -20,7 +20,7 @@ THE SOFTWARE.
 // Simple test for hipLaunchCooperativeKernelMultiDevice API.
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60
+ * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/src/runtimeApi/cooperativeGrps/hipLaunchCooperativeKernel.cpp b/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp
similarity index 94%
rename from tests/src/runtimeApi/cooperativeGrps/hipLaunchCooperativeKernel.cpp
rename to tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp
index 6b1ba1c27a..e0fcd4108b 100644
--- a/tests/src/runtimeApi/cooperativeGrps/hipLaunchCooperativeKernel.cpp
+++ b/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp
@@ -22,14 +22,15 @@ THE SOFTWARE.
 // Simple test for hipLaunchCooperativeKernel API.
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
+ * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
  * TEST: %t
  * HIT_END
  */
 
 #include "hip/hip_runtime.h"
 #include "hip/hip_runtime_api.h"
-#include "hip/hip_cooperative_groups.h"
+#include "hip/hcc_detail/device_library_decls.h"
+#include "hip/hcc_detail/hip_cooperative_groups.h"
 #include <iostream>
 #include <chrono>
 #include "test_common.h"
@@ -128,7 +129,7 @@ int main() {
     params[3] = (void*)&dC;
 
     std::cout << "Testing with grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n";
-    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_gws), dimGrid, dimBlock, params, dimBlock.x * sizeof(long), stream));
+    HIPCHECK(hipLaunchCooperativeKernel(test_gws, dimGrid, dimBlock, params, dimBlock.x * sizeof(long), stream));
 
     HIPCHECK(hipMemcpy(init, dC, sizeof(long), hipMemcpyDeviceToHost));
 
diff --git a/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp b/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp
index 6f649708b7..cc976ced42 100644
--- a/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp
+++ b/tests/src/runtimeApi/module/hipModuleLoadDataMultThreadOnMultGPU.cpp
@@ -18,7 +18,7 @@ THE SOFTWARE.
 */
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp b/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp
index dbf58209fa..840e9b6975 100644
--- a/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp
+++ b/tests/src/runtimeApi/module/hipModuleLoadDataMultThreaded.cpp
@@ -18,7 +18,7 @@ THE SOFTWARE.
 */
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM rocclr nvcc
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM rocclr
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp b/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp
index 3ee2f4a050..07acc4a591 100644
--- a/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp
+++ b/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp
@@ -18,7 +18,7 @@ THE SOFTWARE.
  */
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/src/runtimeApi/stream/hipStreamACb_MStrm_Mgpu.cpp b/tests/src/runtimeApi/stream/hipStreamACb_MStrm_Mgpu.cpp
index 0d65a0f50b..3a25d3331c 100644
--- a/tests/src/runtimeApi/stream/hipStreamACb_MStrm_Mgpu.cpp
+++ b/tests/src/runtimeApi/stream/hipStreamACb_MStrm_Mgpu.cpp
@@ -21,7 +21,7 @@
 // kernel. Verify that all the kernels queued are executed before the callback.
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp b/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp
index a98fbb87c7..a182c85010 100644
--- a/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp
+++ b/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp
@@ -21,7 +21,7 @@
 // when hipStreamAddCallback() is called back to back multiple calls
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp b/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp
index fb93268176..d21ea5da54 100644
--- a/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp
+++ b/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp
@@ -22,7 +22,7 @@
 // by hipStreamAddCallback() api.
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/src/runtimeApi/stream/hipStreamACb_ThrdBehaviour.cpp b/tests/src/runtimeApi/stream/hipStreamACb_ThrdBehaviour.cpp
index 5e9b75adee..2eef534ea4 100644
--- a/tests/src/runtimeApi/stream/hipStreamACb_ThrdBehaviour.cpp
+++ b/tests/src/runtimeApi/stream/hipStreamACb_ThrdBehaviour.cpp
@@ -22,7 +22,7 @@
 // finish. Ideally Host thread should not wait for callback to finish.
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/src/runtimeApi/stream/hipStreamACb_order.cpp b/tests/src/runtimeApi/stream/hipStreamACb_order.cpp
index f7d8a866f2..7b66441fa6 100644
--- a/tests/src/runtimeApi/stream/hipStreamACb_order.cpp
+++ b/tests/src/runtimeApi/stream/hipStreamACb_order.cpp
@@ -18,7 +18,7 @@
  * */
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11  EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp b/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp
index 49991eec20..8da2c2f8a5 100644
--- a/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp
+++ b/tests/src/runtimeApi/stream/hipStreamGetPriority.cpp
@@ -19,7 +19,7 @@ THE SOFTWARE.
 
 
 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
  * TEST: %t
  * HIT_END
  */
diff --git a/tests/src/test_common.h b/tests/src/test_common.h
old mode 100755
new mode 100644
index 21a4c45ac8..8897dc938e
--- a/tests/src/test_common.h
+++ b/tests/src/test_common.h
@@ -41,6 +41,7 @@ THE SOFTWARE.
 
 #define HC __attribute__((hc))
 
+
 #define KNRM "\x1B[0m"
 #define KRED "\x1B[31m"
 #define KGRN "\x1B[32m"
@@ -50,19 +51,6 @@ THE SOFTWARE.
 #define KCYN "\x1B[36m"
 #define KWHT "\x1B[37m"
 
-  // HIP Skip Return code set at cmake
-#define HIP_SKIP_RETURN_CODE 127
-#define HIP_ENABLE_SKIP_TESTS 0
-
-inline bool hip_skip_tests_enabled() {
-  return HIP_ENABLE_SKIP_TESTS;
-}
-
-inline int hip_skip_retcode() {
-  // HIP Skip Return code set at cmake
-  return HIP_SKIP_RETURN_CODE;
-}
-
 #define passed()                                                                                   \
     printf("%sPASSED!%s\n", KGRN, KNRM);                                                           \
     exit(0);
diff --git a/tests/unit/test_common.h b/tests/unit/test_common.h
old mode 100755
new mode 100644
index ae6f1cba04..4b55c70164
--- a/tests/unit/test_common.h
+++ b/tests/unit/test_common.h
@@ -41,6 +41,7 @@ THE SOFTWARE.
 
 #define HC __attribute__((hc))
 
+
 #define KNRM "\x1B[0m"
 #define KRED "\x1B[31m"
 #define KGRN "\x1B[32m"