Merge 'amd-master-next' into 'amd-npi-next'

Change-Id: I935fc8f681fad2df4e932407287a29a6a797351a
2020-08-14 09:09:52 +00:00
@@ -1,20 +1,10 @@
---
 Language: Cpp
 BasedOnStyle: Google
 AlignEscapedNewlinesLeft: false
+AlignOperands: false
 ColumnLimit: 100
+AlwaysBreakTemplateDeclarations: false
 DerivePointerAlignment: false
-IndentWrappedFunctionNames: false
+IndentFunctionDeclarationAfterType: false
 MaxEmptyLinesToKeep: 2
 SortIncludes: false
-IndentWidth: 4
---
-Language: ObjC
-BasedOnStyle: Google
-AlignEscapedNewlinesLeft: false
-ColumnLimit: 100
-DerivePointerAlignment: false
-IndentWrappedFunctionNames: false
-MaxEmptyLinesToKeep: 2
-SortIncludes: false
-IndentWidth: 4
@@ -477,9 +477,6 @@ if(HIP_PLATFORM STREQUAL "hcc" OR HIP_PLATFORM STREQUAL "rocclr")
    install(FILES ${PROJECT_BINARY_DIR}/.hipInfo DESTINATION lib)
 endif()

-# Install .hipInfo
-install(FILES ${PROJECT_BINARY_DIR}/.hipInfo DESTINATION lib)
-
 # Install .hipVersion
 install(FILES ${PROJECT_BINARY_DIR}/.hipVersion DESTINATION bin)

@@ -426,6 +426,7 @@ foreach $arg (@ARGV)
    # TODO: why are we removing it here?
    $trimarg =~ s/^\s+|\s+$//g;  # Remive whitespace
    my $swallowArg = 0;
+    my $escapeArg = 1;
    if ($arg eq '-c' or $arg eq '--genco' or $arg eq '-E') {
        $compileOnly = 1;
        $needLDFLAGS  = 0;
@@ -578,6 +579,7 @@ foreach $arg (@ARGV)
        close $in;
        close $out;
        $arg = "$new_arg -Wl,\@$new_file";
+        $escapeArg = 0;
    } elsif (($arg =~ m/\.a$/ || $arg =~ m/\.lo$/) &&
             $HIP_PLATFORM eq 'hcc' and $HIP_COMPILER eq 'clang') {
        ## process static library for hip-clang
@@ -624,6 +626,7 @@ foreach $arg (@ARGV)
            $new_arg .= " $tmpdir/$libBaseName";
        }
        $arg = "$new_arg";
+        $escapeArg = 0;
        if ($toolArgs =~ m/-Xlinker$/) {
            $toolArgs = substr $toolArgs, 0, -8;
            chomp $toolArgs;
@@ -703,7 +706,7 @@ foreach $arg (@ARGV)
    # common characters such as alphanumerics.
    # Do the quoting here because sometimes the $arg is changed in the loop
    # Important to have all of '-Xlinker' in the set of unquoted characters.
-    if (not $isWindows) {       # Windows needs different quoting, ignore for now
+    if (not $isWindows and $escapeArg) {       # Windows needs different quoting, ignore for now
        $arg =~ s/[^-a-zA-Z0-9_=+,.\/]/\\$&/g;
    }
    $toolArgs .= " $arg" unless $swallowArg;
@@ -798,9 +801,9 @@ if ($setStdLib eq 0 and $HIP_PLATFORM eq 'hcc' and $HIP_COMPILER eq 'hcc')

 if ($needHipHcc) {
    if ($linkType eq 0) {
-        substr($HIPLDFLAGS,0,0) = "  $HIP_LIB_PATH/libhip_hcc_static.a " ;
+        substr($HIPLDFLAGS,0,0) = "  $HIP_LIB_PATH/libamdhip64.a " ;
    } else {
-        substr($HIPLDFLAGS,0,0) = "  -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib $HIP_LIB_PATH/libhip_hcc.so ";
+        substr($HIPLDFLAGS,0,0) = "  -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib $HIP_LIB_PATH/libamdhip64.so ";
    }
 }

@@ -839,7 +842,7 @@ if ($HIP_PLATFORM eq "hcc" and $HIP_COMPILER eq "clang") {
      if ($linkType eq 0) {
        $toolArgs .= " -L$HIP_LIB_PATH -lamdhip64 -L$ROCM_PATH/lib -lhsa-runtime64 -ldl -lnuma ";
      } else {
-        $toolArgs .= " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib -lhip_hcc -lnuma ";
+        $toolArgs .= " -Wl,--enable-new-dtags -Wl,--rpath=$HIP_LIB_PATH:$ROCM_PATH/lib -lamdhip64 ";
      }
      # To support __fp16 and _Float16, explicitly link with compiler-rt
      $toolArgs .= " -L$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/lib/linux -lclang_rt.builtins-x86_64 "
@@ -1,7 +1,7 @@
 #!/usr/bin/perl -w

 $HIP_BASE_VERSION_MAJOR = "3";
-$HIP_BASE_VERSION_MINOR = "7";
+$HIP_BASE_VERSION_MINOR = "8";

 # Need perl > 5.10 to use logic-defined or
 use 5.006; use v5.10.1;
@@ -638,7 +638,11 @@ macro(HIP_ADD_EXECUTABLE hip_target)
        endif()
        set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
    endif()
-    add_executable(${hip_target} ${_cmake_options} ${_generated_files} ${_sources})
+    if ("${_sources}" STREQUAL "")
+        add_executable(${hip_target} ${_cmake_options} ${_generated_files} "")
+    else()
+        add_executable(${hip_target} ${_cmake_options} ${_generated_files} ${_sources})
+    endif()
    set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE HIP)
 endmacro()

@@ -652,7 +656,11 @@ macro(HIP_ADD_LIBRARY hip_target)
    if(_source_files)
        list(REMOVE_ITEM _sources ${_source_files})
    endif()
-    add_library(${hip_target} ${_cmake_options} ${_generated_files} ${_sources})
+    if ("${_sources}" STREQUAL "")
+        add_library(${hip_target} ${_cmake_options} ${_generated_files} "")
+    else()
+        add_library(${hip_target} ${_cmake_options} ${_generated_files} ${_sources})
+    endif()
    set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE ${HIP_C_OR_CXX})
 endmacro()

@@ -4,8 +4,6 @@

 CUDA supports cuCtx API, the Driver API that defines "Context" and "Devices" as separate entities. Contexts contain a single device, and a device can theoretically have multiple contexts. HIP initially added limited support for these API to facilitate easy porting from existing driver codes. These API are marked as deprecated now since there are better alternate interface (such as hipSetDevice or the stream API) to achieve the required functions.

-### hipCtxCreate
-### hipCtxDestroy
 ### hipCtxPopCurrent
 ### hipCtxPushCurrent
 ### hipCtxSetCurrent
@@ -21,7 +19,7 @@ CUDA supports cuCtx API, the Driver API that defines "Context" and "Devices" as
 ### hipCtxEnablePeerAccess
 ### hipCtxDisablePeerAccess

-## HIP Management APIs
+## HIP Memory Management APIs

 ### hipMallocHost
 Should use "hipHostMalloc" instead.
@@ -14,7 +14,7 @@ and provides practical suggestions on how to port CUDA code and work through com
  * [CUDA to HIP Math Library Equivalents](#library-equivalents)
 - [Distinguishing Compiler Modes](#distinguishing-compiler-modes)
  * [Identifying HIP Target Platform](#identifying-hip-target-platform)
-  * [Identifying the Compiler: hcc, hip-clang, or nvcc](#identifying-the-compiler-hcc-hip-clang-or-nvcc)
+  * [Identifying the Compiler: hip-clang, or nvcc](#identifying-the-compiler-hip-clang-or-nvcc)
  * [Identifying Current Compilation Pass: Host or Device](#identifying-current-compilation-pass-host-or-device)
  * [Compiler Defines: Summary](#compiler-defines-summary)
 - [Identifying Architecture Features](#identifying-architecture-features)
@@ -41,12 +41,10 @@ and provides practical suggestions on how to port CUDA code and work through com
 - [threadfence_system](#threadfence_system)
  * [Textures and Cache Control](#textures-and-cache-control)
 - [More Tips](#more-tips)
-  * [HIPTRACE Mode](#hiptrace-mode)
-  * [Environment Variables](#environment-variables)
+  * [HIP Logging](#hip-logging)
  * [Debugging hipcc](#debugging-hipcc)
  * [What Does This Error Mean?](#what-does-this-error-mean)
    + [/usr/include/c++/v1/memory:5172:15: error: call to implicitly deleted default constructor of 'std::__1::bad_weak_ptr' throw bad_weak_ptr();](#usrincludecv1memory517215-error-call-to-implicitly-deleted-default-constructor-of-std__1bad_weak_ptr-throw-bad_weak_ptr)
-  * [HIP Environment Variables](#hip-environment-variables)
  * [Editor Highlighting](#editor-highlighting)


@@ -163,17 +161,19 @@ Many projects use a mixture of an accelerator compiler (AMD or NVIDIA) and a sta



-### Identifying the Compiler: hcc, hip-clang or nvcc
-Often, it's useful to know whether the underlying compiler is hcc, HIP-Clang or nvcc. This knowledge can guard platform-specific code or aid in platform-specific performance tuning.
-
+### Identifying the Compiler: hip-clang or nvcc
+Often, it's useful to know whether the underlying compiler is HIP-Clang or nvcc. This knowledge can guard platform-specific code or aid in platform-specific performance tuning.

 ```
-#ifdef __HCC__
-// Compiled with hcc
+#ifdef __HIP_PLATFORM_HCC__
+// Compiled with HIP-Clang

 ```
+
 ```
-#ifdef __HIP__
+#if defined(__HCC__) || (defined(__clang__) && defined(__HIP__))
+#define __HIP_PLATFORM_HCC__
+#endif
 // Compiled with HIP-Clang

 ```
@@ -198,7 +198,7 @@ Compiler directly generates the host code (using the Clang x86 target) and passe

 nvcc makes two passes over the code: one for host code and one for device code.
 HIP-Clang will have multiple passes over the code: one for the host code, and one for each architecture on the device code.
-`__HIP_DEVICE_COMPILE__` is set to a nonzero value when the compiler (hcc, HIP-Clang or nvcc) is compiling code for a device inside a `__global__` kernel or for a device function. `__HIP_DEVICE_COMPILE__` can replace #ifdef checks on the `__CUDA_ARCH__` define.
+`__HIP_DEVICE_COMPILE__` is set to a nonzero value when the compiler (HIP-Clang or nvcc) is compiling code for a device inside a `__global__` kernel or for a device function. `__HIP_DEVICE_COMPILE__` can replace #ifdef checks on the `__CUDA_ARCH__` define.

 ```
 // #ifdef __CUDA_ARCH__
@@ -209,24 +209,21 @@ Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 1 or undefined, an


 ### Compiler Defines: Summary
-|Define  		|  hcc      |  HIP-Clang  | nvcc 		|  Other (GCC, ICC, Clang, etc.)
-|--- | --- | --- | --- |---|
+|Define  		|   HIP-Clang  | nvcc 		|  Other (GCC, ICC, Clang, etc.)
+|--- | --- | --- |---|
 |HIP-related defines:|
-|`__HIP_PLATFORM_HCC__`| Defined | Defined | Undefined |  Defined if targeting hcc platform; undefined otherwise |
-|`__HIP_PLATFORM_NVCC__`| Undefined | Undefined | Defined |  Defined if targeting nvcc platform; undefined otherwise |
-|`__HIP_DEVICE_COMPILE__`     | 1 if compiling for device; undefined if compiling for host  | 1 if compiling for device; undefined if compiling for host  |1 if compiling for device; undefined if compiling for host  | Undefined
-|`__HIPCC__`		| Defined   | Defined | Defined 		|  Undefined
-|`__HIP_ARCH_*` | 0 or 1 depending on feature support (see below) |0 or 1 depending on feature support (see below) | 0 or 1 depending on feature support (see below) | 0
+|`__HIP_PLATFORM_HCC__`| Defined | Undefined |  Defined if targeting AMD platform; undefined otherwise |
+|`__HIP_PLATFORM_NVCC__`| Undefined  | Defined |  Defined if targeting nvcc platform; undefined otherwise |
+|`__HIP_DEVICE_COMPILE__`     | 1 if compiling for device; undefined if compiling for host  |1 if compiling for device; undefined if compiling for host  | Undefined
+|`__HIPCC__`		|  Defined | Defined 		|  Undefined
+|`__HIP_ARCH_*` |0 or 1 depending on feature support (see below) | 0 or 1 depending on feature support (see below) | 0
 |nvcc-related defines:|
-|`__CUDACC__` 		| Undefined | Undefined | Defined if source code is compiled by nvcc; undefined otherwise 		|  Undefined
-|`__NVCC__` 		| Undefined | Undefined | Defined 		|  Undefined
-|`__CUDA_ARCH__`		| Undefined | Undefined | Unsigned representing compute capability (e.g., "130") if in device code; 0 if in host code  | Undefined
-|hcc-related defines:|
-|`__HCC__`  		| Defined   | Undefined | Undefined   	|  Undefined
-|`__HCC_ACCELERATOR__`  	| Nonzero if in device code; otherwise undefined | Undefined | Undefined | Undefined
+|`__CUDACC__` 		 | Defined if source code is compiled by nvcc; undefined otherwise 		|  Undefined
+|`__NVCC__` 		 | Undefined | Defined 		|  Undefined
+|`__CUDA_ARCH__`		 | Undefined | Unsigned representing compute capability (e.g., "130") if in device code; 0 if in host code  | Undefined
 |hip-clang-related defines:|
-|`__HIP__`  		| Undefined | Defined   | Undefined   	|  Undefined
-|hcc/HIP-Clang common defines:|
+|`__HIP__`  		 | Defined   | Undefined   	|  Undefined
+|HIP-Clang common defines:|
 |`__clang__`		| Defined   | Defined | Undefined 	|  Defined if using Clang; otherwise undefined

 ## Identifying Architecture Features
@@ -274,23 +271,23 @@ The table below shows the full set of architectural properties that HIP supports
 |`__HIP_ARCH_HAS_SHARED_INT32_ATOMICS__`        |    hasSharedInt32Atomics      |32-bit integer atomics for shared memory
 |`__HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__`    |    hasSharedFloatAtomicExch   |32-bit float atomic exchange for shared memory
 |`__HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__`            |     hasFloatAtomicAdd         |32-bit float atomic add in global and shared memory
-|64-bit atomics:                           |                                     |                                                     
-|`__HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__`        |    hasGlobalInt64Atomics      |64-bit integer atomics for global memory                                                 
+|64-bit atomics:                           |                                     |
+|`__HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__`        |    hasGlobalInt64Atomics      |64-bit integer atomics for global memory
 |`__HIP_ARCH_HAS_SHARED_INT64_ATOMICS__`        |    hasSharedInt64Atomics      |64-bit integer atomics for shared memory
 |Doubles:                                  |                                     |
-|`__HIP_ARCH_HAS_DOUBLES__`                    |     hasDoubles                 |Double-precision floating point                                                     
-|Warp cross-lane operations:              |                                      |                                                     
-|`__HIP_ARCH_HAS_WARP_VOTE__`                  |     hasWarpVote                |Warp vote instructions (any, all)                                                      
-|`__HIP_ARCH_HAS_WARP_BALLOT__`                |     hasWarpBallot              |Warp ballot instructions 
-|`__HIP_ARCH_HAS_WARP_SHUFFLE__`               |     hasWarpShuffle             |Warp shuffle operations (shfl\_\*)                                                     
+|`__HIP_ARCH_HAS_DOUBLES__`                    |     hasDoubles                 |Double-precision floating point
+|Warp cross-lane operations:              |                                      |
+|`__HIP_ARCH_HAS_WARP_VOTE__`                  |     hasWarpVote                |Warp vote instructions (any, all)
+|`__HIP_ARCH_HAS_WARP_BALLOT__`                |     hasWarpBallot              |Warp ballot instructions
+|`__HIP_ARCH_HAS_WARP_SHUFFLE__`               |     hasWarpShuffle             |Warp shuffle operations (shfl\_\*)
 |`__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__`          |     hasFunnelShift             |Funnel shift two input words into one
 |Sync:                                    |                                      |
 |`__HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__`        |     hasThreadFenceSystem       |threadfence\_system
-|`__HIP_ARCH_HAS_SYNC_THREAD_EXT__`            |     hasSyncThreadsExt         |syncthreads\_count, syncthreads\_and, syncthreads\_or                                                      
-|Miscellaneous:                                |                                |                                                     
-|`__HIP_ARCH_HAS_SURFACE_FUNCS__`              |   hasSurfaceFuncs              |                                                     
-|`__HIP_ARCH_HAS_3DGRID__`                     |   has3dGrid                    | Grids and groups are 3D                                                     
-|`__HIP_ARCH_HAS_DYNAMIC_PARALLEL__`           |   hasDynamicParallelism        | 
+|`__HIP_ARCH_HAS_SYNC_THREAD_EXT__`            |     hasSyncThreadsExt         |syncthreads\_count, syncthreads\_and, syncthreads\_or
+|Miscellaneous:                                |                                |
+|`__HIP_ARCH_HAS_SURFACE_FUNCS__`              |   hasSurfaceFuncs              |
+|`__HIP_ARCH_HAS_3DGRID__`                     |   has3dGrid                    | Grids and groups are 3D
+|`__HIP_ARCH_HAS_DYNAMIC_PARALLEL__`           |   hasDynamicParallelism        |


 ## Finding HIP
@@ -498,19 +495,15 @@ int main()
    std::cout<<"Passed"<<std::endl;
 }
 ```
- 
+
 ## threadfence_system
 Threadfence_system makes all device memory writes, all writes to mapped host memory, and all writes to peer memory visible to CPU and other GPU devices.
 Some implementations can provide this behavior by flushing the GPU L2 cache.
-HIP/HCC does not provide this functionality.  As a workaround, users can set the environment variable `HSA_DISABLE_CACHE=1` to 
-disable the GPU L2 cache. This will affect all accesses and for all kernels and so may have 
-a performance impact.
+HIP/HIP-Clang does not provide this functionality.  As a workaround, users can set the environment variable `HSA_DISABLE_CACHE=1` to disable the GPU L2 cache. This will affect all accesses and for all kernels and so may have a performance impact.

 ### Textures and Cache Control

-Compute programs sometimes use textures either to access dedicated texture caches or to use the texture-sampling hardware for interpolation and clamping. The former approach uses simple point samplers with linear interpolation, essentially only reading a single point. The latter approach uses the sampler hardware to interpolate and combine multiple
-point samples. AMD hardware, as well as recent competing hardware,
-has a unified texture/L1 cache, so it no longer has a dedicated texture cache. But the nvcc path often caches global loads in the L2 cache, and some programs may benefit from explicit control of the L1 cache contents.  We recommend the __ldg instruction for this purpose.
+Compute programs sometimes use textures either to access dedicated texture caches or to use the texture-sampling hardware for interpolation and clamping. The former approach uses simple point samplers with linear interpolation, essentially only reading a single point. The latter approach uses the sampler hardware to interpolate and combine multiple samples. AMD hardware, as well as recent competing hardware, has a unified texture/L1 cache, so it no longer has a dedicated texture cache. But the nvcc path often caches global loads in the L2 cache, and some programs may benefit from explicit control of the L1 cache contents.  We recommend the __ldg instruction for this purpose.

 AMD compilers currently load all data into both the L1 and L2 caches, so __ldg is treated as a no-op.

@@ -521,27 +514,51 @@ We recommend the following for functional portability:


 ## More Tips
-### HIPTRACE Mode

-On an hcc/AMD platform, set the HIP_TRACE_API environment variable to see a textural API trace. Use the following bit mask:
+### HIP Logging

- 0x1 = trace APIs
- 0x2 = trace synchronization operations
- 0x4 = trace memory allocation / deallocation
+On an AMD platform, set the AMD_LOG_LEVEL environment variable to log HIP application execution information.

-### Environment Variables
+The value of the setting controls different logging level,

-On hcc/AMD platforms, set the HIP_PRINT_ENV environment variable to 1 and run an application that calls a HIP API to see all HIP-supported environment variables and their current values:
+```
+enum LogLevel {
+LOG_NONE = 0,
+LOG_ERROR = 1,
+LOG_WARNING = 2,
+LOG_INFO = 3,
+LOG_DEBUG = 4
+};
+```

- HIP_PRINT_ENV = 1: print HIP environment variables
- HIP_TRACE_API = 1: trace each HIP API call. Print the function name and return code to stderr as the program executes.
- HIP_LAUNCH_BLOCKING = 0: make HIP APIs host-synchronous so they are blocked until any kernel launches or data-copy commands are complete (an alias is CUDA_LAUNCH_BLOCKING)
-
- KMDUMPISA = 1 : Will dump the GCN ISA for all kernels into the local directory. (This flag is provided by HCC).
+Logging mask is used to print types of functionalities during the execution of HIP application.
+It can be set as one of the following values,

+```
+enum LogMask {
+  LOG_API       = 0x00000001, //!< API call
+  LOG_CMD       = 0x00000002, //!< Kernel and Copy Commands and Barriers
+  LOG_WAIT      = 0x00000004, //!< Synchronization and waiting for commands to finish
+  LOG_AQL       = 0x00000008, //!< Decode and display AQL packets
+  LOG_QUEUE     = 0x00000010, //!< Queue commands and queue contents
+  LOG_SIG       = 0x00000020, //!< Signal creation, allocation, pool
+  LOG_LOCK      = 0x00000040, //!< Locks and thread-safety code.
+  LOG_KERN      = 0x00000080, //!< kernel creations and arguments, etc.
+  LOG_COPY      = 0x00000100, //!< Copy debug
+  LOG_COPY2     = 0x00000200, //!< Detailed copy debug
+  LOG_RESOURCE  = 0x00000400, //!< Resource allocation, performance-impacting events.
+  LOG_INIT      = 0x00000800, //!< Initialization and shutdown
+  LOG_MISC      = 0x00001000, //!< misc debug, not yet classified
+  LOG_AQL2      = 0x00002000, //!< Show raw bytes of AQL packet
+  LOG_CODE      = 0x00004000, //!< Show code creation debug
+  LOG_CMD2      = 0x00008000, //!< More detailed command info, including barrier commands
+  LOG_LOCATION  = 0x00010000, //!< Log message location
+  LOG_ALWAYS    = 0xFFFFFFFF, //!< Log always even mask flag is zero
+};
+```

 ### Debugging hipcc
-To see the detailed commands that hipcc issues, set the environment variable HIPCC_VERBOSE to 1. Doing so will print to stderr the hcc (or nvcc) commands that hipcc generates. 
+To see the detailed commands that hipcc issues, set the environment variable HIPCC_VERBOSE to 1. Doing so will print to stderr the HIP-clang (or nvcc) commands that hipcc generates.

 ```
 export HIPCC_VERBOSE=1
@@ -557,25 +574,6 @@ hipcc-cmd: /opt/hcc/bin/hcc  -hc -I/opt/hcc/include -stdlib=libc++ -I../../../..
 If you pass a ".cu" file, hcc will attempt to compile it as a CUDA language file. You must tell hcc that it's in fact a C++ file: use the "-x c++" option.


-### HIP Environment Variables
-
-On the HCC path, HIP provides a number of environment variables that control the behavior of HIP.  Some of these are useful for application development (for example HIP_VISIBLE_DEVICES, HIP_LAUNCH_BLOCKING),
-some are useful for performance tuning or experimentation (for example HIP_STAGING*), and some are useful for debugging (HIP_DB).  You can see the environment variables supported by HIP as well as
-their current values and usage with the environment var "HIP_PRINT_ENV" - set this and then run any HIP application.  For example:
-
-```
-$ HIP_PRINT_ENV=1 ./myhipapp
-HIP_PRINT_ENV                  =  1 : Print HIP environment variables.
-HIP_LAUNCH_BLOCKING            =  0 : Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING.
-HIP_DB                         =  0 : Print various debug info.  Bitmask, see hip_hcc.cpp for more information.
-HIP_TRACE_API                  =  0 : Trace each HIP API call.  Print function name and return code to stderr as program executes.
-HIP_TRACE_API_COLOR            = green : Color to use for HIP_API.  None/Red/Green/Yellow/Blue/Magenta/Cyan/White
-HIP_PROFILE_API                 =  0 : Add HIP function begin/end to ATP file generated with CodeXL
-HIP_VISIBLE_DEVICES            =  0 : Only devices whose index is present in the secquence are visible to HIP applications and they are enumerated in the order of secquence
-
-```
-
-
 ### Editor Highlighting
 See the utils/vim or utils/gedit directories to add handy highlighting to hip files.

@@ -156,13 +156,13 @@ if(HIP_COMPILER STREQUAL "clang")

  if (EXISTS ${AMD_DEVICE_LIBS_PREFIX}/amdgcn/bitcode)
    set_property(TARGET hip::device APPEND PROPERTY
-      INTERFACE_COMPILE_OPTIONS -x hip
+      INTERFACE_COMPILE_OPTIONS -xhip
    )
  else()
    # This path is to support an older build of the device library
    # TODO: To be removed in the future.
    set_property(TARGET hip::device APPEND PROPERTY
-      INTERFACE_COMPILE_OPTIONS -x hip --hip-device-lib-path=${AMD_DEVICE_LIBS_PREFIX}/lib
+      INTERFACE_COMPILE_OPTIONS -xhip --hip-device-lib-path=${AMD_DEVICE_LIBS_PREFIX}/lib
    )
  endif()

@@ -95,6 +95,13 @@ typedef struct ihipCtx_t* hipCtx_t;
 // Note many APIs also use integer deviceIds as an alternative to the device pointer:
 typedef int hipDevice_t;

+typedef enum hipDeviceP2PAttr {
+  hipDevP2PAttrPerformanceRank = 0,
+  hipDevP2PAttrAccessSupported,
+  hipDevP2PAttrNativeAtomicSupported,
+  hipDevP2PAttrHipArrayAccessSupported
+} hipDeviceP2PAttr;
+
 typedef struct ihipStream_t* hipStream_t;

 #define hipIpcMemLazyEnablePeerAccess 0
@@ -2166,6 +2173,7 @@ hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, con
 *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
 * hipMemcpyAsync
 */
+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src,
                            size_t count, hipMemcpyKind kind);

@@ -2184,6 +2192,7 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const
 *  @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
 * hipMemcpyAsync
 */
+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, size_t hOffset,
                              size_t count, hipMemcpyKind kind);

@@ -2799,6 +2808,19 @@ hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device
 */
 hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device);

+
+/**
+ * @brief Returns a value for attr of link between two devices
+ * @param [out] value
+ * @param [in] attr
+ * @param [in] srcDevice
+ * @param [in] dstDevice
+ *
+ * @returns #hipSuccess, #hipErrorInavlidDevice
+ */
+hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr,
+                                    int srcDevice, int dstDevice);
+
 /**
 * @brief Returns a PCI Bus Id string for the device, overloaded to take int device ID.
 * @param [out] pciBusId
@@ -3511,6 +3533,7 @@ hipError_t hipExtLaunchKernel(const void* function_address, dim3 numBlocks, dim3
                              void** args, size_t sharedMemBytes, hipStream_t stream,
                              hipEvent_t startEvent, hipEvent_t stopEvent, int flags);

+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipBindTexture(
    size_t* offset,
    const textureReference* tex,
@@ -3518,6 +3541,7 @@ hipError_t hipBindTexture(
    const hipChannelFormatDesc* desc,
    size_t size __dparm(UINT_MAX));

+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipBindTexture2D(
    size_t* offset,
    const textureReference* tex,
@@ -3527,6 +3551,7 @@ hipError_t hipBindTexture2D(
    size_t height,
    size_t pitch);

+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipBindTextureToArray(
    const textureReference* tex,
    hipArray_const_t array,
@@ -3537,6 +3562,7 @@ hipError_t hipBindTextureToMipmappedArray(
    hipMipmappedArray_const_t mipmappedArray,
    const hipChannelFormatDesc* desc);

+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipGetTextureAlignmentOffset(
    size_t* offset,
    const textureReference* texref);
@@ -3545,6 +3571,7 @@ hipError_t hipGetTextureReference(
    const textureReference** texref,
    const void* symbol);

+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipUnbindTexture(const textureReference* tex);

 hipError_t hipCreateTextureObject(
@@ -3814,6 +3841,7 @@ inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
 class TlsData;

 #if !__HIP_ROCclr__
+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipBindTexture(size_t* offset, textureReference* tex, const void* devPtr,
                          const hipChannelFormatDesc* desc, size_t size = UINT_MAX);
 #endif
@@ -3841,6 +3869,7 @@ hipError_t ihipBindTextureImpl(TlsData *tls, int dim, enum hipTextureReadMode re
 **/
 #if !__HIP_ROCclr__
 template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipBindTexture(size_t* offset, struct texture<T, dim, readMode>& tex, const void* devPtr,
                          const struct hipChannelFormatDesc& desc, size_t size = UINT_MAX) {
    return ihipBindTextureImpl(nullptr, dim, readMode, offset, devPtr, &desc, size, &tex);
@@ -3863,6 +3892,7 @@ hipError_t hipBindTexture(size_t* offset, struct texture<T, dim, readMode>& tex,
 **/
 #if !__HIP_ROCclr__
 template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipBindTexture(size_t* offset, struct texture<T, dim, readMode>& tex, const void* devPtr,
                          size_t size = UINT_MAX) {
    return ihipBindTextureImpl(nullptr, dim, readMode, offset, devPtr, &(tex.channelDesc), size, &tex);
@@ -3871,6 +3901,7 @@ hipError_t hipBindTexture(size_t* offset, struct texture<T, dim, readMode>& tex,

 // C API
 #if !__HIP_ROCclr__
+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipBindTexture2D(size_t* offset, textureReference* tex, const void* devPtr,
                            const hipChannelFormatDesc* desc, size_t width, size_t height,
                            size_t pitch);
@@ -3884,6 +3915,7 @@ hipError_t ihipBindTexture2DImpl(int dim, enum hipTextureReadMode readMode, size

 #if !__HIP_ROCclr__
 template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipBindTexture2D(size_t* offset, struct texture<T, dim, readMode>& tex,
                            const void* devPtr, size_t width, size_t height, size_t pitch) {
    return ihipBindTexture2DImpl(dim, readMode, offset, devPtr, &(tex.channelDesc), width, height,
@@ -3893,6 +3925,7 @@ hipError_t hipBindTexture2D(size_t* offset, struct texture<T, dim, readMode>& te

 #if !__HIP_ROCclr__
 template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipBindTexture2D(size_t* offset, struct texture<T, dim, readMode>& tex,
                            const void* devPtr, const struct hipChannelFormatDesc& desc,
                            size_t width, size_t height, size_t pitch) {
@@ -3902,6 +3935,7 @@ hipError_t hipBindTexture2D(size_t* offset, struct texture<T, dim, readMode>& te

 // C API
 #if !__HIP_ROCclr__
+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipBindTextureToArray(textureReference* tex, hipArray_const_t array,
                                 const hipChannelFormatDesc* desc);
 #endif
@@ -3915,6 +3949,7 @@ hipError_t ihipBindTextureToArrayImpl(TlsData *tls, int dim, enum hipTextureRead

 #if !__HIP_ROCclr__
 template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipBindTextureToArray(struct texture<T, dim, readMode>& tex, hipArray_const_t array) {
    return ihipBindTextureToArrayImpl(nullptr, dim, readMode, array, tex.channelDesc, &tex);
 }
@@ -3922,6 +3957,7 @@ hipError_t hipBindTextureToArray(struct texture<T, dim, readMode>& tex, hipArray

 #if !__HIP_ROCclr__
 template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipBindTextureToArray(struct texture<T, dim, readMode>& tex, hipArray_const_t array,
                                 const struct hipChannelFormatDesc& desc) {
    return ihipBindTextureToArrayImpl(nullptr, dim, readMode, array, desc, &tex);
@@ -3930,6 +3966,7 @@ hipError_t hipBindTextureToArray(struct texture<T, dim, readMode>& tex, hipArray

 #if !__HIP_ROCclr__
 template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
 inline static hipError_t hipBindTextureToArray(struct texture<T, dim, readMode> *tex,
                                               hipArray_const_t array,
                                               const struct hipChannelFormatDesc* desc) {
@@ -3999,6 +4036,7 @@ inline hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchPara
 *  @return #hipSuccess
 **/
 #if !__HIP_ROCclr__
+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipUnbindTexture(const textureReference* tex);
 #endif

@@ -4008,6 +4046,7 @@ extern hipError_t ihipUnbindTextureImpl(const hipTextureObject_t& textureObject)

 #if !__HIP_ROCclr__
 template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipUnbindTexture(struct texture<T, dim, readMode>& tex) {
    return ihipUnbindTextureImpl(tex.textureObject);
 }
@@ -4015,7 +4054,10 @@ hipError_t hipUnbindTexture(struct texture<T, dim, readMode>& tex) {

 #if !__HIP_ROCclr__
 hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array);
+
+DEPRECATED(DEPRECATED_MSG)
 hipError_t hipGetTextureAlignmentOffset(size_t* offset, const textureReference* texref);
+
 hipError_t hipGetTextureReference(const textureReference** texref, const void* symbol);

 hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResourceDesc* pResDesc,
@@ -4058,28 +4100,23 @@ hipError_t hipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject, const hipReso
 hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject);

 #if __HIP_ROCclr__
-template<class T, int dim, enum hipTextureReadMode readMode>
-static inline hipError_t hipBindTexture(
-    size_t *offset,
-    const struct texture<T, dim, readMode> &tex,
-    const void *devPtr,
-    size_t size = UINT_MAX)
-{
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+static inline hipError_t hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex,
+                                        const void* devPtr, size_t size = UINT_MAX) {
    return hipBindTexture(offset, &tex, devPtr, &tex.channelDesc, size);
 }

-template<class T, int dim, enum hipTextureReadMode readMode>
-static inline hipError_t hipBindTexture(
-    size_t *offset,
-    const struct texture<T, dim, readMode> &tex,
-    const void *devPtr,
-    const struct hipChannelFormatDesc &desc,
-    size_t size = UINT_MAX)
-{
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+static inline hipError_t
+    hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex, const void* devPtr,
+                   const struct hipChannelFormatDesc& desc, size_t size = UINT_MAX) {
    return hipBindTexture(offset, &tex, devPtr, &desc, size);
 }

 template<class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
 static inline hipError_t hipBindTexture2D(
    size_t *offset,
    const struct texture<T, dim, readMode> &tex,
@@ -4092,6 +4129,7 @@ static inline hipError_t hipBindTexture2D(
 }

 template<class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
 static inline hipError_t hipBindTexture2D(
  size_t *offset,
  const struct texture<T, dim, readMode> &tex,
@@ -4105,6 +4143,7 @@ static inline hipError_t hipBindTexture2D(
 }

 template<class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
 static inline hipError_t hipBindTextureToArray(
    const struct texture<T, dim, readMode> &tex,
    hipArray_const_t array)
@@ -4115,6 +4154,7 @@ static inline hipError_t hipBindTextureToArray(
 }

 template<class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
 static inline hipError_t hipBindTextureToArray(
    const struct texture<T, dim, readMode> &tex,
    hipArray_const_t array,
@@ -4148,6 +4188,7 @@ static inline hipError_t hipBindTextureToMipmappedArray(
 }

 template<class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
 static inline hipError_t hipUnbindTexture(
    const struct texture<T, dim, readMode> &tex)
 {
@@ -37,6 +37,18 @@ extern "C" {
 #define __dparm(x)
 #endif

+// Add Deprecated Support for CUDA Mapped HIP APIs
+#if defined(__DOXYGEN_ONLY__) || defined(HIP_ENABLE_DEPRECATED)
+#define __HIP_DEPRECATED
+#elif defined(_MSC_VER)
+#define __HIP_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __HIP_DEPRECATED __attribute__((deprecated))
+#else
+#define __HIP_DEPRECATED
+#endif
+
+
 // TODO -move to include/hip_runtime_api.h as a common implementation.
 /**
 * Memory copy types
@@ -179,6 +191,7 @@ typedef enum cudaSharedMemConfig hipSharedMemConfig;
 typedef CUfunc_cache hipFuncCache;
 typedef CUjit_option hipJitOption;
 typedef CUdevice hipDevice_t;
+typedef enum cudaDeviceP2PAttr hipDeviceP2PAttr;
 typedef CUmodule hipModule_t;
 typedef CUfunction hipFunction_t;
 typedef CUdeviceptr hipDeviceptr_t;
@@ -962,14 +975,16 @@ inline static hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_
                                                      height, hipMemcpyKindToCudaMemcpyKind(kind)));
 }

-inline static hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset,
-                                          const void* src, size_t count, hipMemcpyKind kind) {
+__HIP_DEPRECATED inline static hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset,
+                                                           size_t hOffset, const void* src,
+                                                           size_t count, hipMemcpyKind kind) {
    return hipCUDAErrorTohipError(
        cudaMemcpyToArray(dst, wOffset, hOffset, src, count, hipMemcpyKindToCudaMemcpyKind(kind)));
 }

-inline static hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset,
-                                            size_t hOffset, size_t count, hipMemcpyKind kind) {
+__HIP_DEPRECATED inline static hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray,
+                                                             size_t wOffset, size_t hOffset,
+                                                             size_t count, hipMemcpyKind kind) {
    return hipCUDAErrorTohipError(cudaMemcpyFromArray(dst, srcArray, wOffset, hOffset, count,
                                                      hipMemcpyKindToCudaMemcpyKind(kind)));
 }
@@ -1352,7 +1367,12 @@ inline static hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attribut
    struct cudaPointerAttributes cPA;
    hipError_t err = hipCUDAErrorTohipError(cudaPointerGetAttributes(&cPA, ptr));
    if (err == hipSuccess) {
-        switch (cPA.memoryType) {
+#if (CUDART_VERSION >= 11000)
+        auto memType = cPA.type;
+#else
+        unsigned memType = cPA.memoryType; // No auto because cuda 10.2 doesnt force c++11
+#endif
+        switch (memType) {
            case cudaMemoryTypeDevice:
                attributes->memoryType = hipMemoryTypeDevice;
                break;
@@ -1606,6 +1626,11 @@ inline static hipError_t hipDeviceGetName(char* name, int len, hipDevice_t devic
    return hipCUResultTohipError(cuDeviceGetName(name, len, device));
 }

+inline static hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr,
+                                                  int srcDevice, int dstDevice) {
+    return hipCUDAErrorTohipError(cudaDeviceGetP2PAttribute(value, attr, srcDevice, dstDevice));
+}
+
 inline static hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, hipDevice_t device) {
    return hipCUDAErrorTohipError(cudaDeviceGetPCIBusId(pciBusId, len, device));
 }
@@ -1689,14 +1714,17 @@ inline static hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t
    return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig));
 }

-inline static hipError_t hipBindTexture(size_t* offset, struct textureReference* tex, const void* devPtr,
-                                        const hipChannelFormatDesc* desc, size_t size __dparm(UINT_MAX)){
+__HIP_DEPRECATED inline static hipError_t hipBindTexture(size_t* offset,
+                                                         struct textureReference* tex,
+                                                         const void* devPtr,
+                                                         const hipChannelFormatDesc* desc,
+                                                         size_t size __dparm(UINT_MAX)) {
    return hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, desc, size));
 }

-inline static hipError_t hipBindTexture2D(size_t* offset, struct textureReference* tex, const void* devPtr,
-                            const hipChannelFormatDesc* desc, size_t width, size_t height,
-                            size_t pitch) {
+__HIP_DEPRECATED inline static hipError_t hipBindTexture2D(
+    size_t* offset, struct textureReference* tex, const void* devPtr,
+    const hipChannelFormatDesc* desc, size_t width, size_t height, size_t pitch) {
    return hipCUDAErrorTohipError(cudaBindTexture2D(offset, tex, devPtr, desc, width, height, pitch));
 }

@@ -1731,8 +1759,8 @@ inline static hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDe
    return hipCUDAErrorTohipError(cudaGetTextureObjectResourceDesc( pResDesc, textureObject));
 }

-inline static hipError_t hipGetTextureAlignmentOffset(size_t* offset, const struct textureReference* texref)
-{
+__HIP_DEPRECATED inline static hipError_t hipGetTextureAlignmentOffset(
+    size_t* offset, const struct textureReference* texref) {
    return hipCUDAErrorTohipError(cudaGetTextureAlignmentOffset(offset,texref));
 }

@@ -1805,32 +1833,32 @@ inline static hipError_t hipBindTexture(size_t* offset, struct texture<T, dim, r
 }

 template <class T, int dim, enum cudaTextureReadMode readMode>
-inline static hipError_t hipUnbindTexture(struct texture<T, dim, readMode>* tex) {
+__HIP_DEPRECATED inline static hipError_t hipUnbindTexture(struct texture<T, dim, readMode>* tex) {
    return hipCUDAErrorTohipError(cudaUnbindTexture(tex));
 }

 template <class T, int dim, enum cudaTextureReadMode readMode>
-inline static hipError_t hipUnbindTexture(struct texture<T, dim, readMode> &tex) {
+__HIP_DEPRECATED inline static hipError_t hipUnbindTexture(struct texture<T, dim, readMode>& tex) {
    return hipCUDAErrorTohipError(cudaUnbindTexture(tex));
 }

 template <class T, int dim, enum cudaTextureReadMode readMode>
-inline static hipError_t hipBindTextureToArray(struct texture<T, dim, readMode>& tex,
-                                               hipArray_const_t array,
-                                               const hipChannelFormatDesc& desc) {
+__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
+    struct texture<T, dim, readMode>& tex, hipArray_const_t array,
+    const hipChannelFormatDesc& desc) {
    return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array, desc));
 }

 template <class T, int dim, enum cudaTextureReadMode readMode>
-inline static hipError_t hipBindTextureToArray(struct texture<T, dim, readMode> *tex,
-                                               hipArray_const_t array,
-                                               const hipChannelFormatDesc* desc) {
+__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
+    struct texture<T, dim, readMode>* tex, hipArray_const_t array,
+    const hipChannelFormatDesc* desc) {
    return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array, desc));
 }

 template <class T, int dim, enum cudaTextureReadMode readMode>
-inline static hipError_t hipBindTextureToArray(struct texture<T, dim, readMode>& tex,
-                                               hipArray_const_t array) {
+__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
+    struct texture<T, dim, readMode>& tex, hipArray_const_t array) {
    return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array));
 }

@@ -70,12 +70,12 @@ struct pstreams {
    /// Type used for file descriptors.
    typedef int fd_type;

-    static const pmode pstdin = std::ios_base::out;   ///< Write to stdin
-    static const pmode pstdout = std::ios_base::in;   ///< Read from stdout
-    static const pmode pstderr = std::ios_base::app;  ///< Read from stderr
+    static constexpr pmode pstdin = std::ios_base::out;   ///< Write to stdin
+    static constexpr pmode pstdout = std::ios_base::in;   ///< Read from stdout
+    static constexpr pmode pstderr = std::ios_base::app;  ///< Read from stderr

    /// Create a new process group for the child process.
-    static const pmode newpg = std::ios_base::trunc;
+    static constexpr pmode newpg = std::ios_base::trunc;

   protected:
    enum { bufsz = 32 };  ///< Size of pstreambuf buffers.
@@ -5,8 +5,6 @@ if(@BUILD_SHARED_LIBS@)
    install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.so DESTINATION lib)
    install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.so.@HIP_LIB_VERSION_MAJOR@ DESTINATION lib)
    install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.so.@HIP_LIB_VERSION_STRING@ DESTINATION lib)
-    install(FILES @PROJECT_BINARY_DIR@/lib/libhip_hcc.so DESTINATION lib)
-    install(FILES @PROJECT_BINARY_DIR@/lib/libhiprtc.so DESTINATION lib)
 else()
    install(FILES @PROJECT_BINARY_DIR@/lib/libamdhip64.a DESTINATION lib)
 endif()
@@ -45,7 +43,7 @@ set(CPACK_GENERATOR "TGZ;DEB;RPM")
 set(CPACK_BINARY_DEB "ON")
 set(CPACK_DEBIAN_FILE_NAME ${CPACK_PACKAGE_FILE_NAME}_amd64.deb)
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocm-utils, hip-base (= ${CPACK_PACKAGE_VERSION}),  comgr (>= 1.1), llvm-amdgpu")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo, hip-base (= ${CPACK_PACKAGE_VERSION}),  comgr (>= 1.1), llvm-amdgpu")
 set(CPACK_DEBIAN_PACKAGE_PROVIDES "hip-hcc (= ${CPACK_PACKAGE_VERSION})")

 set(CPACK_BINARY_RPM "ON")
@@ -55,7 +53,7 @@ set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
 set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
 set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
 string(REPLACE "-" "_" HIP_BASE_VERSION ${CPACK_PACKAGE_VERSION})
-set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocm-utils, hip-base = ${HIP_BASE_VERSION},  comgr >= 1.1, llvm-amdgpu")
+set(CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocminfo, hip-base = ${HIP_BASE_VERSION},  comgr >= 1.1, llvm-amdgpu")
 set(CPACK_RPM_PACKAGE_PROVIDES "hip-hcc = ${HIP_BASE_VERSION}")
 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
 set(CPACK_SOURCE_GENERATOR "TGZ")
@@ -66,6 +66,14 @@ if(NOT DEFINED ROCclr_DIR OR NOT DEFINED LIBOCL_STATIC_DIR OR NOT DEFINED LIBROC
 	#	message(FATAL_ERROR "define ROCclr_DIR, LIBOCL_STATIC_DIR\n")

 endif()
+
+#APPEND default path for CMAKE_PREFIX_PATH
+#User provided will be searched first since defualt path is at end.
+#Custom install path can be provided at compile time as cmake parameter(-DCMAKE_PREFIX_PATH="")
+#/opt/rocm:default:For amd_comgr,hsa-runtime64
+#/opt/rocm/llvm/:default:For llvm/clang pulled in as dependency from hsa/comgr
+list( APPEND CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} "/opt/rocm" "/opt/rocm/llvm")
+
 list ( APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules" )
 set(CMAKE_MODULE_PATH${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules")

@@ -194,6 +202,7 @@ else()

 endif()

+set_target_properties(amdhip64 PROPERTIES LINK_FLAGS_RELEASE -s)
 set_target_properties(amdhip64 PROPERTIES PUBLIC_HEADER ${PROF_API_STR})
 add_library(host INTERFACE)
 target_link_libraries(host INTERFACE hip::amdhip64)
@@ -206,20 +215,12 @@ target_link_libraries(device INTERFACE host)
 # FIXME: Linux convention is to create static library with same base
 # filename.

-
 if(${BUILD_SHARED_LIBS})
-    target_link_libraries(amdhip64 PRIVATE amdrocclr_static Threads::Threads dl numa hsa-runtime64::hsa-runtime64)
+    target_link_libraries(amdhip64 PRIVATE amdrocclr_static Threads::Threads dl hsa-runtime64::hsa-runtime64)
    INSTALL(PROGRAMS $<TARGET_FILE:amdhip64> DESTINATION lib COMPONENT MAIN)
-    INSTALL(CODE "execute_process( COMMAND ${CMAKE_COMMAND} -E create_symlink libamdhip64.so lib/libhip_hcc.so  )" DESTINATION lib COMPONENT MAIN)
-
-    INSTALL(CODE "execute_process( COMMAND ${CMAKE_COMMAND} -E create_symlink libamdhip64.so lib/libhiprtc.so  )" DESTINATION lib COMPONENT MAIN)
-    INSTALL(FILES ${CMAKE_BINARY_DIR}/lib/libhip_hcc.so DESTINATION lib COMPONENT MAIN)
-
-    INSTALL(FILES ${CMAKE_BINARY_DIR}/lib/libhiprtc.so DESTINATION lib COMPONENT MAIN)
-
 else()

-    target_link_libraries(amdhip64 PRIVATE Threads::Threads dl numa hsa-runtime64::hsa-runtime64 amd_comgr)
+    target_link_libraries(amdhip64 PRIVATE Threads::Threads dl hsa-runtime64::hsa-runtime64 amd_comgr)
    # combine objects of vid and hip into amdhip64_static
    add_custom_target(
       amdhip64_static_combiner
@@ -92,8 +92,8 @@ hipError_t hipDeviceComputeCapability(int *major, int *minor, hipDevice_t device

  auto* deviceHandle = g_devices[device]->devices()[0];
  const auto& info = deviceHandle->info();
-  *major = info.gfxipVersion_ / 100;
-  *minor = info.gfxipVersion_ % 100;
+  *major = info.gfxipMajor_;
+  *minor = info.gfxipMinor_;

  HIP_RETURN(hipSuccess);
 }
@@ -175,10 +175,10 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device )
  deviceProps.maxGridSize[2] = INT32_MAX;
  deviceProps.clockRate = info.maxEngineClockFrequency_ * 1000;
  deviceProps.memoryClockRate = info.maxMemoryClockFrequency_ * 1000;
-  deviceProps.memoryBusWidth = info.globalMemChannels_ * 32;
+  deviceProps.memoryBusWidth = info.globalMemChannels_;
  deviceProps.totalConstMem = info.maxConstantBufferSize_;
-  deviceProps.major = info.gfxipVersion_ / 100;
-  deviceProps.minor = info.gfxipVersion_ % 100;
+  deviceProps.major = info.gfxipMajor_;
+  deviceProps.minor = info.gfxipMinor_;
  deviceProps.multiProcessorCount = info.maxComputeUnits_;
  deviceProps.l2CacheSize = info.l2CacheSize_;
  deviceProps.maxThreadsPerMultiProcessor = info.maxThreadsPerCU_;
@@ -208,7 +208,7 @@ hipError_t hipGetDeviceProperties ( hipDeviceProp_t* props, hipDevice_t device )
  deviceProps.maxSharedMemoryPerMultiProcessor = info.localMemSizePerCU_;
  //deviceProps.isMultiGpuBoard = info.;
  deviceProps.canMapHostMemory = 1;
-  deviceProps.gcnArch = info.gfxipVersion_;
+  deviceProps.gcnArch = info.gfxipMajor_ * 100 + info.gfxipMinor_ * 10 + info.gfxipStepping_;
  sprintf(deviceProps.gcnArchName, "gfx%d%d%x", info.gfxipMajor_, info.gfxipMinor_, info.gfxipStepping_);
  deviceProps.cooperativeLaunch = info.cooperativeGroups_;
  deviceProps.cooperativeMultiDeviceLaunch = info.cooperativeMultiDeviceGroups_;
@@ -367,13 +367,6 @@ hipError_t hipDeviceGetLimit ( size_t* pValue, hipLimit_t limit ) {
  }
 }

-/**
-hipError_t hipDeviceGetP2PAttribute ( int* value, hipDeviceP2PAttr attr, int  srcDevice, int  dstDevice ) {
-  assert(0);
-  HIP_RETURN(hipSuccess);
-}
-**/
-
 hipError_t hipDeviceGetPCIBusId ( char* pciBusId, int  len, int  device ) {

  HIP_INIT_API(hipDeviceGetPCIBusId, (void*)pciBusId, len, device);
@@ -140,6 +140,19 @@ hipError_t Event::streamWait(amd::HostQueue* hostQueue, uint flags) {
 void Event::addMarker(amd::HostQueue* queue, amd::Command* command, bool record) {
  amd::ScopedLock lock(lock_);

+  if (queue->properties().test(CL_QUEUE_PROFILING_ENABLE)) {
+    if (command == nullptr) {
+      command = queue->getLastQueuedCommand(true);
+      if (command == nullptr) {
+        command = new amd::Marker(*queue, kMarkerDisableFlush);
+        command->enqueue();
+      }
+    }
+  } else if (command == nullptr) {
+    command = new hip::ProfileMarker(*queue, false);
+    command->enqueue();
+  }
+
  if (event_ == &command->event()) return;

  if (event_ != nullptr) {
@@ -239,16 +252,9 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) {
  }

  hip::Event* e = reinterpret_cast<hip::Event*>(event);
-  amd::ScopedLock lock(e->lock());
-
  amd::HostQueue* queue = hip::getQueue(stream);
-  amd::Command* command = queue->getLastQueuedCommand(true);
-  if (command == nullptr) {
-    command = new amd::Marker(*queue, kMarkerDisableFlush);
-    command->enqueue();
-  }

-  e->addMarker(queue, command, true);
+  e->addMarker(queue, nullptr, true);
  HIP_RETURN(hipSuccess);
 }

@@ -26,12 +26,13 @@

 namespace hip {

-class TimerMarker: public amd::Marker {
+class ProfileMarker: public amd::Marker {
 public:
-  TimerMarker(amd::HostQueue& queue) : amd::Marker(queue, false) {
+  ProfileMarker(amd::HostQueue& queue, bool disableFlush)
+  : amd::Marker(queue, disableFlush) {
    profilingInfo_.enabled_ = true;
    profilingInfo_.callback_ = nullptr;
-    profilingInfo_.start_ = profilingInfo_.end_ = 0;
+    profilingInfo_.clear();
  }
 };

@@ -23,16 +23,18 @@ DeviceVar::DeviceVar(std::string name, hipModule_t hmod) : shadowVptr(nullptr),
    guarantee(false);
  }

-  if (amd_mem_obj_ == nullptr || device_ptr_ == nullptr) {
-    DevLogPrintfError("Cannot get memory for creating device Var: %s", name.c_str());
-    guarantee(false);
+  // Handle size 0 symbols
+  if (size_ != 0) {
+    if (amd_mem_obj_ == nullptr || device_ptr_ == nullptr) {
+      DevLogPrintfError("Cannot get memory for creating device Var: %s", name.c_str());
+      guarantee(false);
+    }
+    amd::MemObjMap::AddMemObj(device_ptr_, amd_mem_obj_);
  }
-
-  amd::MemObjMap::AddMemObj(device_ptr_, amd_mem_obj_);
 }

 DeviceVar::~DeviceVar() {
-  if (device_ptr_ != nullptr) {
+  if (amd_mem_obj_ != nullptr) {
    amd::MemObjMap::RemoveMemObj(device_ptr_);
    amd_mem_obj_->release();
  }
@@ -29,6 +29,7 @@ hipDeviceGetLimit
 hipDeviceGetName
 hipDeviceGetPCIBusId
 hipDeviceGetSharedMemConfig
+hipDeviceGetP2PAttribute
 hipDevicePrimaryCtxGetState
 hipDevicePrimaryCtxRelease
 hipDevicePrimaryCtxReset
@@ -175,8 +176,6 @@ __hipRegisterVar
 __hipRegisterSurface
 __hipRegisterTexture
 __hipUnregisterFatBinary
-__gnu_h2f_ieee
-__gnu_f2h_ieee
 hipConfigureCall
 hipSetupArgument
 hipLaunchByPtr
@@ -30,6 +30,7 @@ global:
    hipDeviceGetName;
    hipDeviceGetPCIBusId;
    hipDeviceGetSharedMemConfig;
+    hipDeviceGetP2PAttribute;
    hipDevicePrimaryCtxGetState;
    hipDevicePrimaryCtxRelease;
    hipDevicePrimaryCtxReset;
@@ -175,8 +176,6 @@ global:
    __hipRegisterSurface;
    __hipRegisterTexture;
    __hipUnregisterFatBinary;
-    __gnu_h2f_ieee;
-    __gnu_f2h_ieee;
    hipConfigureCall;
    hipSetupArgument;
    hipLaunchByPtr;
@@ -59,7 +59,7 @@ static_assert(static_cast<uint32_t>(hipMemRangeAttributeLastPrefetchLocation) ==
 hipError_t hipMallocManaged(void** dev_ptr, size_t size, unsigned int flags) {
  HIP_INIT_API(hipMallocManaged, dev_ptr, size, flags);

-  if ((dev_ptr == nullptr) || (flags != hipMemAttachGlobal)) {
+  if ((dev_ptr == nullptr) || (size == 0) || (flags != hipMemAttachGlobal)) {
    HIP_RETURN(hipErrorInvalidValue);
  }

@@ -71,7 +71,7 @@ hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device,
                               hipStream_t stream) {
  HIP_INIT_API(hipMemPrefetchAsync, dev_ptr, count, device, stream);

-  if ((dev_ptr == nullptr) || (count == 0) || (stream == nullptr)) {
+  if ((dev_ptr == nullptr) || (count == 0)) {
    HIP_RETURN(hipErrorInvalidValue);
  }
  amd::HostQueue* queue = nullptr;
@@ -213,4 +213,4 @@ static hipError_t ihipMallocManaged(void** ptr, size_t size) {
  ClPrint(amd::LOG_INFO, amd::LOG_API, "%-5d: [%zx] ihipMallocManaged ptr=0x%zx",  getpid(),
    std::this_thread::get_id(), *ptr);
  return hipSuccess;
-}
+}
@@ -37,6 +37,15 @@
 #include <unistd.h>
 #endif

+#define KNRM "\x1B[0m"
+#define KRED "\x1B[31m"
+#define KGRN "\x1B[32m"
+#define KYEL "\x1B[33m"
+#define KBLU "\x1B[34m"
+#define KMAG "\x1B[35m"
+#define KCYN "\x1B[36m"
+#define KWHT "\x1B[37m"
+
 /*! IHIP IPC MEMORY Structure */
 #define IHIP_IPC_MEM_HANDLE_SIZE   32
 #define IHIP_IPC_MEM_RESERVED_SIZE LP64_SWITCH(28,24)
@@ -58,8 +67,8 @@ typedef struct ihipIpcMemHandle_st {
  }

 #define HIP_API_PRINT(...)                                 \
-  ClPrint(amd::LOG_INFO, amd::LOG_API, "%-5d: [%zx] %s ( %s )", getpid(), std::this_thread::get_id(),             \
-          __func__, ToString( __VA_ARGS__ ).c_str());
+  uint64_t startTimeUs=0 ; HIPPrintDuration(amd::LOG_INFO, amd::LOG_API, &startTimeUs, "%-5d: [%zx] %s%s ( %s )%s",  getpid(), std::this_thread::get_id(), KGRN,    \
+          __func__, ToString( __VA_ARGS__ ).c_str(),KNRM);

 #define HIP_ERROR_PRINT(err, ...)                             \
  ClPrint(amd::LOG_INFO, amd::LOG_API, "%-5d: [%zx] %s: Returned %s : %s", getpid(), std::this_thread::get_id(),  \
@@ -75,6 +84,12 @@ typedef struct ihipIpcMemHandle_st {
  HIP_INIT()                                                 \
  HIP_CB_SPAWNER_OBJECT(cid);

+#define HIP_RETURN_DURATION(ret, ...)                      \
+  hip::g_lastError = ret;                         \
+  HIPPrintDuration(amd::LOG_INFO, amd::LOG_API, &startTimeUs, "%-5d: [%zx] %s: Returned %s : %s",  getpid(), std::this_thread::get_id(),  \
+          __func__, hipGetErrorName(hip::g_lastError), ToString( __VA_ARGS__ ).c_str()); \
+  return hip::g_lastError;
+
 #define HIP_RETURN(ret, ...)                      \
  hip::g_lastError = ret;                         \
  HIP_ERROR_PRINT(hip::g_lastError, __VA_ARGS__)  \
@@ -122,6 +122,9 @@ hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
  *ptr = amd::SvmBuffer::malloc(*amdContext, flags, sizeBytes, amdContext->devices()[0]->info().memBaseAddrAlign_,
              useHostDevice ? curDevContext->svmDevices()[0] : nullptr);
  if (*ptr == nullptr) {
+    size_t free = 0, total =0;
+    hipMemGetInfo(&free, &total);
+    LogPrintfError("Allocation failed : Device memory : required :%u | free :%u | total :%u \n", sizeBytes, free, total);
    return hipErrorOutOfMemory;
  }

@@ -178,9 +181,12 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
              *srcMemory->asBuffer(), sOffset, sizeBytes, dst);
    isAsync = false;
  } else if ((srcMemory != nullptr) && (dstMemory != nullptr)) {
-    // Check if the queue device doesn't match the device on any memory object. Hence
-    // it's a P2P transfer, because the app has requested access to another GPU
-    if (srcMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) {
+    // Check if the queue device doesn't match the device on any memory object.
+    // And any of them are not host allocation.
+    // Hence it's a P2P transfer, because the app has requested access to another GPU
+    if ((srcMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) &&
+        ((srcMemory->getContext().devices().size() == 1) &&
+         (dstMemory->getContext().devices().size() == 1))) {
      command = new amd::CopyMemoryP2PCommand(queue, CL_COMMAND_COPY_BUFFER, waitList,
          *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes);
      if (command == nullptr) {
@@ -193,7 +199,16 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
        return hipErrorInvalidValue;
      }
    } else {
-      command = new amd::CopyMemoryCommand(queue, CL_COMMAND_COPY_BUFFER, waitList,
+      amd::HostQueue* pQueue = &queue;
+      if (queueDevice != srcMemory->getContext().devices()[0]) {
+        pQueue = hip::getNullStream(srcMemory->getContext());
+        amd::Command* cmd = queue.getLastQueuedCommand(true);
+        if (cmd != nullptr) {
+          waitList.push_back(cmd);
+        }
+      }
+
+      command = new amd::CopyMemoryCommand(*pQueue, CL_COMMAND_COPY_BUFFER, waitList,
          *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes);
    }
  }
@@ -228,7 +243,7 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag
 hipError_t hipMalloc(void** ptr, size_t sizeBytes) {
  HIP_INIT_API(hipMalloc, ptr, sizeBytes);

-  HIP_RETURN(ihipMalloc(ptr, sizeBytes, 0), *ptr);
+  HIP_RETURN_DURATION(ihipMalloc(ptr, sizeBytes, 0), *ptr);
 }

 hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) {
@@ -260,7 +275,7 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) {
    ihipFlags |= CL_MEM_FOLLOW_USER_NUMA_POLICY;
  }

-  HIP_RETURN(ihipMalloc(ptr, sizeBytes, ihipFlags), *ptr);
+  HIP_RETURN_DURATION(ihipMalloc(ptr, sizeBytes, ihipFlags), *ptr);
 }

 hipError_t hipFree(void* ptr) {
@@ -273,7 +288,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind
  HIP_INIT_API(hipMemcpy, dst, src, sizeBytes, kind);

  amd::HostQueue* queue = hip::getNullStream();
-  HIP_RETURN(ihipMemcpy(dst, src, sizeBytes, kind, *queue));
+  HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *queue));
 }

 hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes,
@@ -282,7 +297,7 @@ hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes,

  amd::HostQueue* queue = hip::getQueue(stream);

-  HIP_RETURN(ihipMemcpy(dst, src, sizeBytes, kind, *queue, false));
+  HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *queue, false));
 }

 hipError_t hipMemPtrGetInfo(void *ptr, size_t *size) {
@@ -706,7 +721,7 @@ hipError_t hipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags)
    amd::MemObjMap::AddMemObj(hostPtr, mem);
    HIP_RETURN(hipSuccess);
  } else {
-    HIP_RETURN(ihipMalloc(&hostPtr, sizeBytes, flags), hostPtr);
+    HIP_RETURN_DURATION(ihipMalloc(&hostPtr, sizeBytes, flags), hostPtr);
  }
 }

@@ -725,9 +740,14 @@ hipError_t hipHostUnregister(void* hostPtr) {
    amd::Memory* mem = getMemoryObject(hostPtr, offset);

    if(mem) {
-      for (const auto& device: hip::getCurrentDevice()->devices()) {
-        const device::Memory* devMem = mem->getDeviceMemory(*device);
-        amd::MemObjMap::RemoveMemObj(reinterpret_cast<void*>(devMem->virtualAddress()));
+      for (const auto& device: g_devices) {
+        const device::Memory* devMem = mem->getDeviceMemory(*device->devices()[0]);
+        if (devMem != nullptr) {
+          void* vAddr = reinterpret_cast<void*>(devMem->virtualAddress());
+          if (amd::MemObjMap::FindMemObj(vAddr)) {
+            amd::MemObjMap::RemoveMemObj(vAddr);
+          }
+        }
      }
      amd::MemObjMap::RemoveMemObj(hostPtr);
      mem->release();
@@ -764,7 +784,7 @@ hipError_t hipMemcpyToSymbol(const void* symbol, const void* src, size_t sizeByt
  device_ptr = reinterpret_cast<address>(device_ptr) + offset;

  /* Copy memory from source to destination address */
-  HIP_RETURN(hipMemcpy(device_ptr, src, sizeBytes, kind));
+  HIP_RETURN_DURATION(hipMemcpy(device_ptr, src, sizeBytes, kind));
 }

 hipError_t hipMemcpyFromSymbol(void* dst, const void* symbol, size_t sizeBytes,
@@ -786,7 +806,7 @@ hipError_t hipMemcpyFromSymbol(void* dst, const void* symbol, size_t sizeBytes,
  device_ptr = reinterpret_cast<address>(device_ptr) + offset;

  /* Copy memory from source to destination address */
-  HIP_RETURN(hipMemcpy(dst, device_ptr, sizeBytes, kind));
+  HIP_RETURN_DURATION(hipMemcpy(dst, device_ptr, sizeBytes, kind));
 }

 hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src, size_t sizeBytes,
@@ -808,7 +828,7 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src, size_t si
  device_ptr = reinterpret_cast<address>(device_ptr) + offset;

  /* Copy memory from source to destination address */
-  HIP_RETURN(hipMemcpyAsync(device_ptr, src, sizeBytes, kind, stream));
+  HIP_RETURN_DURATION(hipMemcpyAsync(device_ptr, src, sizeBytes, kind, stream));
 }

 hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbol, size_t sizeBytes,
@@ -830,7 +850,7 @@ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbol, size_t sizeBy
  device_ptr = reinterpret_cast<address>(device_ptr) + offset;

  /* Copy memory from source to destination address */
-  HIP_RETURN(hipMemcpyAsync(dst, device_ptr, sizeBytes, kind, stream));
+  HIP_RETURN_DURATION(hipMemcpyAsync(dst, device_ptr, sizeBytes, kind, stream));
 }

 hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice,
@@ -838,7 +858,7 @@ hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice,
                         size_t ByteCount) {
  HIP_INIT_API(hipMemcpyHtoD, dstDevice, srcHost, ByteCount);

-  HIP_RETURN(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *hip::getQueue(nullptr)));
+  HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *hip::getQueue(nullptr)));
 }

 hipError_t hipMemcpyDtoH(void* dstHost,
@@ -846,7 +866,7 @@ hipError_t hipMemcpyDtoH(void* dstHost,
                         size_t ByteCount) {
  HIP_INIT_API(hipMemcpyDtoH, dstHost, srcDevice, ByteCount);

-  HIP_RETURN(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *hip::getQueue(nullptr)));
+  HIP_RETURN_DURATION(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *hip::getQueue(nullptr)));
 }

 hipError_t hipMemcpyDtoD(hipDeviceptr_t dstDevice,
@@ -854,7 +874,7 @@ hipError_t hipMemcpyDtoD(hipDeviceptr_t dstDevice,
                         size_t ByteCount) {
  HIP_INIT_API(hipMemcpyDtoD, dstDevice, srcDevice, ByteCount);

-  HIP_RETURN(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *hip::getQueue(nullptr)));
+  HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *hip::getQueue(nullptr)));
 }

 hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes,
@@ -863,7 +883,7 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes,

  amd::HostQueue* queue = hip::getQueue(stream);

-  HIP_RETURN(ihipMemcpy(dst, src, sizeBytes, kind, *queue, true));
+  HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *queue, true));
 }

 hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice,
@@ -872,7 +892,7 @@ hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice,
                              hipStream_t stream) {
  HIP_INIT_API(hipMemcpyHtoDAsync, dstDevice, srcHost, ByteCount, stream);

-  HIP_RETURN(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *hip::getQueue(stream), true));
+  HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *hip::getQueue(stream), true));
 }

 hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dstDevice,
@@ -881,7 +901,7 @@ hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dstDevice,
                              hipStream_t stream) {
  HIP_INIT_API(hipMemcpyDtoDAsync, dstDevice, srcDevice, ByteCount, stream);

-  HIP_RETURN(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *hip::getQueue(stream), true));
+  HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *hip::getQueue(stream), true));
 }

 hipError_t hipMemcpyDtoHAsync(void* dstHost,
@@ -890,7 +910,7 @@ hipError_t hipMemcpyDtoHAsync(void* dstHost,
                              hipStream_t stream) {
  HIP_INIT_API(hipMemcpyDtoHAsync, dstHost, srcDevice, ByteCount, stream);

-  HIP_RETURN(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *hip::getQueue(stream), true));
+  HIP_RETURN_DURATION(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *hip::getQueue(stream), true));
 }

 hipError_t ihipMemcpyAtoD(hipArray* srcArray,
@@ -1399,10 +1419,34 @@ hipError_t ihipMemcpyParam3D(const HIP_MEMCPY3D* pCopy,
  hipMemoryType srcMemoryType = pCopy->srcMemoryType;
  if (srcMemoryType == hipMemoryTypeUnified) {
    srcMemoryType = amd::MemObjMap::FindMemObj(pCopy->srcDevice) ? hipMemoryTypeDevice : hipMemoryTypeHost;
+    if (srcMemoryType == hipMemoryTypeHost) {
+      // {src/dst}Host may be unitialized. Copy over {src/dst}Device into it if we detect system memory.
+      const_cast<HIP_MEMCPY3D*>(pCopy)->srcHost = pCopy->srcDevice;
+    }
  }
  hipMemoryType dstMemoryType = pCopy->dstMemoryType;
  if (dstMemoryType == hipMemoryTypeUnified) {
    dstMemoryType = amd::MemObjMap::FindMemObj(pCopy->dstDevice) ? hipMemoryTypeDevice : hipMemoryTypeHost;
+    if (srcMemoryType == hipMemoryTypeHost) {
+      const_cast<HIP_MEMCPY3D*>(pCopy)->dstHost = pCopy->dstDevice;
+    }
+  }
+
+  // If {src/dst}MemoryType is hipMemoryTypeHost, check if the memory was prepinned.
+  // In that case upgrade the copy type to hipMemoryTypeDevice to avoid extra pinning.
+  if (srcMemoryType == hipMemoryTypeHost) {
+    amd::Memory* mem = amd::MemObjMap::FindMemObj(pCopy->srcHost);
+    srcMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost;
+    if (srcMemoryType == hipMemoryTypeDevice) {
+      const_cast<HIP_MEMCPY3D*>(pCopy)->srcDevice = const_cast<void*>(pCopy->srcHost);
+    }
+  }
+  if (dstMemoryType == hipMemoryTypeHost) {
+    amd::Memory* mem = amd::MemObjMap::FindMemObj(pCopy->dstHost);
+    dstMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost;
+    if (dstMemoryType == hipMemoryTypeDevice) {
+      const_cast<HIP_MEMCPY3D*>(pCopy)->dstDevice = const_cast<void*>(pCopy->dstDevice);
+    }
  }

  amd::Coord3D srcOrigin = {pCopy->srcXInBytes, pCopy->srcY, pCopy->srcZ};
@@ -1480,21 +1524,21 @@ hipError_t ihipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch
 hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) {
  HIP_INIT_API(hipMemcpyParam2D, pCopy);

-  HIP_RETURN(ihipMemcpyParam2D(pCopy, nullptr));
+  HIP_RETURN_DURATION(ihipMemcpyParam2D(pCopy, nullptr));
 }

 hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
                       size_t height, hipMemcpyKind kind) {
  HIP_INIT_API(hipMemcpy2D, dst, dpitch, src, spitch, width, height, kind);

-  HIP_RETURN(ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind, nullptr));
+  HIP_RETURN_DURATION(ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind, nullptr));
 }

 hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
                            size_t height, hipMemcpyKind kind, hipStream_t stream) {
  HIP_INIT_API(hipMemcpy2DAsync, dst, dpitch, src, spitch, width, height, kind, stream);

-  HIP_RETURN(ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind, stream, true));
+  HIP_RETURN_DURATION(ihipMemcpy2D(dst, dpitch, src, spitch, width, height, kind, stream, true));
 }

 hipError_t ihipMemcpy2DToArray(hipArray_t dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream, bool isAsync = false) {
@@ -1525,7 +1569,7 @@ hipError_t ihipMemcpy2DToArray(hipArray_t dst, size_t wOffset, size_t hOffset, c
 hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) {
  HIP_INIT_API(hipMemcpy2DToArray, dst, wOffset, hOffset, src, spitch, width, height, kind);

-  HIP_RETURN(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind, nullptr));
+  HIP_RETURN_DURATION(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind, nullptr));
 }

 hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t count, hipMemcpyKind kind) {
@@ -1540,7 +1584,7 @@ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const

  const size_t height = (count / dst->width) / hip::getElementSize(dst);

-  HIP_RETURN(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, 0 /* spitch */, witdthInBytes, height, kind, nullptr));
+  HIP_RETURN_DURATION(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, 0 /* spitch */, witdthInBytes, height, kind, nullptr));
 }

 hipError_t ihipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream, bool isAsync = false) {
@@ -1580,7 +1624,7 @@ hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t src, size_t wOffsetSrc

  const size_t height = (count / src->width) / hip::getElementSize(src);

-  HIP_RETURN(ihipMemcpy2DFromArray(dst, 0 /* dpitch */, src, wOffsetSrc, hOffset, witdthInBytes, height, kind, nullptr));
+  HIP_RETURN_DURATION(ihipMemcpy2DFromArray(dst, 0 /* dpitch */, src, wOffsetSrc, hOffset, witdthInBytes, height, kind, nullptr));
 }

 hipError_t hipMemcpyHtoA(hipArray* dstArray,
@@ -1589,7 +1633,7 @@ hipError_t hipMemcpyHtoA(hipArray* dstArray,
                         size_t ByteCount) {
  HIP_INIT_API(hipMemcpyHtoA, dstArray, dstOffset, srcHost, ByteCount);

-  HIP_RETURN(ihipMemcpyHtoA(srcHost, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr));
+  HIP_RETURN_DURATION(ihipMemcpyHtoA(srcHost, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr));
 }

 hipError_t hipMemcpyAtoH(void* dstHost,
@@ -1598,7 +1642,7 @@ hipError_t hipMemcpyAtoH(void* dstHost,
                         size_t ByteCount) {
  HIP_INIT_API(hipMemcpyAtoH, dstHost, srcArray, srcOffset, ByteCount);

-  HIP_RETURN(ihipMemcpyAtoH(srcArray, dstHost, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr));
+  HIP_RETURN_DURATION(ihipMemcpyAtoH(srcArray, dstHost, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr));
 }

 hipError_t ihipMemcpy3D(const hipMemcpy3DParms* p,
@@ -1625,25 +1669,25 @@ hipError_t ihipMemcpy3D(const hipMemcpy3DParms* p,
 hipError_t hipMemcpy3D(const hipMemcpy3DParms* p) {
  HIP_INIT_API(hipMemcpy3D, p);

-  HIP_RETURN(ihipMemcpy3D(p, nullptr));
+  HIP_RETURN_DURATION(ihipMemcpy3D(p, nullptr));
 }

 hipError_t hipMemcpy3DAsync(const hipMemcpy3DParms* p, hipStream_t stream) {
  HIP_INIT_API(hipMemcpy3DAsync, p, stream);

-  HIP_RETURN(ihipMemcpy3D(p, stream, true));
+  HIP_RETURN_DURATION(ihipMemcpy3D(p, stream, true));
 }

 hipError_t hipDrvMemcpy3D(const HIP_MEMCPY3D* pCopy) {
  HIP_INIT_API(hipDrvMemcpy3D, pCopy);

-  HIP_RETURN(ihipMemcpyParam3D(pCopy, nullptr));
+  HIP_RETURN_DURATION(ihipMemcpyParam3D(pCopy, nullptr));
 }

 hipError_t hipDrvMemcpy3DAsync(const HIP_MEMCPY3D* pCopy, hipStream_t stream) {
  HIP_INIT_API(hipDrvMemcpy3DAsync, pCopy, stream);

-  HIP_RETURN(ihipMemcpyParam3D(pCopy, stream, true));
+  HIP_RETURN_DURATION(ihipMemcpyParam3D(pCopy, stream, true));
 }

 hipError_t packFillMemoryCommand(amd::Memory* memory, size_t offset, int64_t value, size_t valueSize,
@@ -1892,7 +1936,7 @@ hipError_t hipMemAllocPitch(hipDeviceptr_t* dptr, size_t* pitch, size_t widthInB
 hipError_t hipMemAllocHost(void** ptr, size_t size) {
  HIP_INIT_API(hipMemAllocHost, ptr, size);

-  HIP_RETURN(hipHostMalloc(ptr, size, 0));
+  HIP_RETURN_DURATION(hipHostMalloc(ptr, size, 0));
 }

 hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* dev_ptr) {
@@ -2074,25 +2118,25 @@ hipError_t ihipMemcpy2DArrayToArray(hipArray_t dst, size_t wOffsetDst, size_t hO
 hipError_t hipMemcpy2DArrayToArray(hipArray_t dst, size_t wOffsetDst, size_t hOffsetDst, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, hipMemcpyKind kind) {
  HIP_INIT_API(hipMemcpy2DArrayToArray, dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind);

-  HIP_RETURN(ihipMemcpy2DArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind, nullptr));
+  HIP_RETURN_DURATION(ihipMemcpy2DArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind, nullptr));
 }

 hipError_t hipMemcpyArrayToArray(hipArray_t dst, size_t wOffsetDst, size_t hOffsetDst, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, hipMemcpyKind kind) {
  HIP_INIT_API(hipMemcpyArrayToArray, dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind);

-  HIP_RETURN(ihipMemcpy2DArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind, nullptr));
+  HIP_RETURN_DURATION(ihipMemcpy2DArrayToArray(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind, nullptr));
 }

 hipError_t hipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffsetSrc, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind) {
  HIP_INIT_API(hipMemcpy2DFromArray, dst, dpitch, src, wOffsetSrc, hOffset, width, height, kind);

-  HIP_RETURN(ihipMemcpy2DFromArray(dst, dpitch, src, wOffsetSrc, hOffset, width, height, kind, nullptr));
+  HIP_RETURN_DURATION(ihipMemcpy2DFromArray(dst, dpitch, src, wOffsetSrc, hOffset, width, height, kind, nullptr));
 }

 hipError_t hipMemcpy2DFromArrayAsync(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) {
  HIP_INIT_API(hipMemcpy2DFromArrayAsync, dst, dpitch, src, wOffsetSrc, hOffsetSrc, width, height, kind, stream);

-  HIP_RETURN(ihipMemcpy2DFromArray(dst, dpitch, src, wOffsetSrc, hOffsetSrc, width, height, kind, stream, true));
+  HIP_RETURN_DURATION(ihipMemcpy2DFromArray(dst, dpitch, src, wOffsetSrc, hOffsetSrc, width, height, kind, stream, true));
 }

 hipError_t hipMemcpyFromArrayAsync(void* dst, hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, hipMemcpyKind kind, hipStream_t stream) {
@@ -2107,13 +2151,13 @@ hipError_t hipMemcpyFromArrayAsync(void* dst, hipArray_const_t src, size_t wOffs

  const size_t height = (count / src->width) / hip::getElementSize(src);

-  HIP_RETURN(ihipMemcpy2DFromArray(dst, 0 /* dpitch */, src, wOffsetSrc, hOffsetSrc, widthInBytes, height, kind, stream, true));
+  HIP_RETURN_DURATION(ihipMemcpy2DFromArray(dst, 0 /* dpitch */, src, wOffsetSrc, hOffsetSrc, widthInBytes, height, kind, stream, true));
 }

 hipError_t hipMemcpy2DToArrayAsync(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) {
  HIP_INIT_API(hipMemcpy2DToArrayAsync, dst, wOffset, hOffset, src, spitch, width, height, kind);

-  HIP_RETURN(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind, stream, true));
+  HIP_RETURN_DURATION(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, kind, stream, true));
 }

 hipError_t hipMemcpyToArrayAsync(hipArray_t dst, size_t wOffset, size_t hOffset, const void* src, size_t count, hipMemcpyKind kind, hipStream_t stream) {
@@ -2128,7 +2172,7 @@ hipError_t hipMemcpyToArrayAsync(hipArray_t dst, size_t wOffset, size_t hOffset,

  const size_t height = (count / dst->width) / hip::getElementSize(dst);

-  HIP_RETURN(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, 0 /* spitch */, widthInBytes, height, kind, stream, true));
+  HIP_RETURN_DURATION(ihipMemcpy2DToArray(dst, wOffset, hOffset, src, 0 /* spitch */, widthInBytes, height, kind, stream, true));
 }

 hipError_t hipMemcpyAtoA(hipArray* dstArray,
@@ -2138,7 +2182,7 @@ hipError_t hipMemcpyAtoA(hipArray* dstArray,
                         size_t ByteCount) {
  HIP_INIT_API(hipMemcpyAtoA, dstArray, dstOffset, srcArray, srcOffset, ByteCount);

-  HIP_RETURN(ihipMemcpyAtoA(srcArray, dstArray, {srcOffset, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, nullptr));
+  HIP_RETURN_DURATION(ihipMemcpyAtoA(srcArray, dstArray, {srcOffset, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, nullptr));
 }

 hipError_t hipMemcpyAtoD(hipDeviceptr_t dstDevice,
@@ -2147,7 +2191,7 @@ hipError_t hipMemcpyAtoD(hipDeviceptr_t dstDevice,
                         size_t ByteCount) {
  HIP_INIT_API(hipMemcpyAtoD, dstDevice, srcArray, srcOffset, ByteCount);

-  HIP_RETURN(ihipMemcpyAtoD(srcArray, dstDevice, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr));
+  HIP_RETURN_DURATION(ihipMemcpyAtoD(srcArray, dstDevice, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr));
 }

 hipError_t hipMemcpyAtoHAsync(void* dstHost,
@@ -2157,7 +2201,7 @@ hipError_t hipMemcpyAtoHAsync(void* dstHost,
                              hipStream_t stream) {
  HIP_INIT_API(hipMemcpyAtoHAsync, dstHost, srcArray, srcOffset, ByteCount, stream);

-  HIP_RETURN(ihipMemcpyAtoH(srcArray, dstHost, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, stream, true));
+  HIP_RETURN_DURATION(ihipMemcpyAtoH(srcArray, dstHost, {srcOffset, 0, 0}, {0, 0, 0}, {ByteCount, 1, 1}, 0, 0, stream, true));
 }

 hipError_t hipMemcpyDtoA(hipArray* dstArray,
@@ -2166,7 +2210,7 @@ hipError_t hipMemcpyDtoA(hipArray* dstArray,
                        size_t ByteCount) {
  HIP_INIT_API(hipMemcpyDtoA, dstArray, dstOffset, srcDevice, ByteCount);

-  HIP_RETURN(ihipMemcpyDtoA(srcDevice, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr));
+  HIP_RETURN_DURATION(ihipMemcpyDtoA(srcDevice, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, nullptr));
 }

 hipError_t hipMemcpyHtoAAsync(hipArray* dstArray,
@@ -2176,7 +2220,7 @@ hipError_t hipMemcpyHtoAAsync(hipArray* dstArray,
                              hipStream_t stream) {
  HIP_INIT_API(hipMemcpyHtoAAsync, dstArray, dstOffset, srcHost, ByteCount, stream);

-  HIP_RETURN(ihipMemcpyHtoA(srcHost, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, stream, true));
+  HIP_RETURN_DURATION(ihipMemcpyHtoA(srcHost, dstArray, {0, 0, 0}, {dstOffset, 0, 0}, {ByteCount, 1, 1}, 0, 0, stream, true));
 }

 hipError_t hipMipmappedArrayCreate(hipMipmappedArray_t* pHandle,
@@ -2233,7 +2277,7 @@ hipError_t hipMallocHost(void** ptr,
    HIP_RETURN(hipErrorInvalidValue);
  }

-  HIP_RETURN(ihipMalloc(ptr, size, CL_MEM_SVM_FINE_GRAIN_BUFFER), *ptr);
+  HIP_RETURN_DURATION(ihipMalloc(ptr, size, CL_MEM_SVM_FINE_GRAIN_BUFFER), *ptr);
 }

 hipError_t hipFreeHost(void *ptr) {
@@ -193,16 +193,17 @@ hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func)
  HIP_RETURN(hipSuccess);
 }

-hipError_t ihipModuleLaunchKernel(hipFunction_t f,
-                                 uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
+hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
+                                 uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
                                 uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
                                 uint32_t sharedMemBytes, hipStream_t hStream,
                                 void **kernelParams, void **extra,
                                 hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
                                 uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0,
                                 uint64_t prevGridSum = 0, uint64_t allGridSum = 0, uint32_t firstDevice = 0) {
-  HIP_INIT_API(ihipModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
-    sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags, params);
+  HIP_INIT_API(ihipModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
+    blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra, startEvent,
+    stopEvent, flags, params);

  hip::DeviceFunc* function = hip::DeviceFunc::asFunction(f);
  amd::Kernel* kernel = function->kernel();
@@ -229,7 +230,8 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
    int block_size = blockDimX * blockDimY * blockDimZ;
    hip_impl::ihipOccupancyMaxActiveBlocksPerMultiprocessor(
      &num_blocks, &max_blocks_per_grid, &best_block_size, device, f, block_size, sharedMemBytes, true);
-    if (((gridDimX * gridDimY * gridDimZ) / block_size) > unsigned(max_blocks_per_grid)) {
+    if (((globalWorkSizeX * globalWorkSizeY * globalWorkSizeZ) / block_size) >
+       unsigned(max_blocks_per_grid)) {
      return hipErrorCooperativeLaunchTooLarge;
    }
  }
@@ -243,11 +245,11 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
  }

  size_t globalWorkOffset[3] = {0};
-  size_t globalWorkSize[3] = { gridDimX, gridDimY, gridDimZ };
+  size_t globalWorkSize[3] = { globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ };
  size_t localWorkSize[3] = { blockDimX, blockDimY, blockDimZ };
  amd::NDRangeContainer ndrange(3, globalWorkOffset, globalWorkSize, localWorkSize);
  amd::Command::EventWaitList waitList;
-
+  bool profileNDRange = false;
  address kernargs = nullptr;

  // 'extra' is a struct that contains the following info: {
@@ -271,13 +273,16 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
                               desc.type_ == T_POINTER/*svmBound*/);
    } else {
      assert(extra == nullptr);
-      kernel->parameters().set(i, desc.size_, kernelParams[i], desc.type_ == T_POINTER/*svmBound*/);
+      kernel->parameters().set(i, desc.size_, kernelParams[i],
+                               desc.type_ == T_POINTER/*svmBound*/);
    }
  }

+  profileNDRange = (startEvent != nullptr && stopEvent != nullptr);
+
  amd::NDRangeKernelCommand* command = new amd::NDRangeKernelCommand(
    *queue, waitList, *kernel, ndrange, sharedMemBytes,
-    params, gridId, numGrids, prevGridSum, allGridSum, firstDevice);
+    params, gridId, numGrids, prevGridSum, allGridSum, firstDevice, profileNDRange);
  if (!command) {
    return hipErrorOutOfMemory;
  }
@@ -290,11 +295,11 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,

  command->enqueue();

-  if(startEvent != nullptr) {
+  if (startEvent != nullptr) {
    eStart->addMarker(queue, command, false);
    command->retain();
  }
-  if(stopEvent != nullptr) {
+  if (stopEvent != nullptr) {
    eStop->addMarker(queue, command, false);
    command->retain();
  }
@@ -313,8 +318,17 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f,
               blockDimX, blockDimY, blockDimZ,
               sharedMemBytes, hStream,
               kernelParams, extra);
-
-  HIP_RETURN(ihipModuleLaunchKernel(f, gridDimX * blockDimX, gridDimY * blockDimY, gridDimZ * blockDimZ,
+  size_t globalWorkSizeX = gridDimX * blockDimX;
+  size_t globalWorkSizeY = gridDimY * blockDimY;
+  size_t globalWorkSizeZ = gridDimZ * blockDimZ;
+  if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
+      globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
+      globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
+    HIP_RETURN(hipErrorInvalidConfiguration);
+  }
+  HIP_RETURN(ihipModuleLaunchKernel(f, static_cast<uint32_t>(globalWorkSizeX),
+                                static_cast<uint32_t>(globalWorkSizeY),
+                                static_cast<uint32_t>(globalWorkSizeZ),
                                blockDimX, blockDimY, blockDimZ,
                                sharedMemBytes, hStream, kernelParams, extra, nullptr, nullptr));
 }
@@ -337,37 +351,37 @@ hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,



-hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX,
-                                    uint32_t gridDimY, uint32_t gridDimZ,
+hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
+                                    uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
                                    uint32_t blockDimX, uint32_t blockDimY,
                                    uint32_t blockDimZ, size_t sharedMemBytes,
                                    hipStream_t hStream, void** kernelParams, void** extra,
                                    hipEvent_t startEvent,
                                    hipEvent_t stopEvent)
 {
-  HIP_INIT_API(hipHccModuleLaunchKernel, f, gridDimX, gridDimY, gridDimZ,
+  HIP_INIT_API(hipHccModuleLaunchKernel, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
               blockDimX, blockDimY, blockDimZ,
               sharedMemBytes, hStream,
               kernelParams, extra, startEvent, stopEvent);

-  HIP_RETURN(ihipModuleLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
+  HIP_RETURN(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, blockDimZ,
                                sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent));
 }

-hipError_t hipModuleLaunchKernelExt(hipFunction_t f, uint32_t gridDimX,
-                                    uint32_t gridDimY, uint32_t gridDimZ,
+hipError_t hipModuleLaunchKernelExt(hipFunction_t f, uint32_t globalWorkSizeX,
+                                    uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
                                    uint32_t blockDimX, uint32_t blockDimY,
                                    uint32_t blockDimZ, size_t sharedMemBytes,
                                    hipStream_t hStream, void** kernelParams, void** extra,
                                    hipEvent_t startEvent,
                                    hipEvent_t stopEvent)
 {
-  HIP_INIT_API(hipModuleLaunchKernelExt, f, gridDimX, gridDimY, gridDimZ,
+  HIP_INIT_API(hipModuleLaunchKernelExt, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
               blockDimX, blockDimY, blockDimZ,
               sharedMemBytes, hStream,
               kernelParams, extra, startEvent, stopEvent);

-  HIP_RETURN(ihipModuleLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
+  HIP_RETURN(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, blockDimX, blockDimY, blockDimZ,
                                sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent));
 }

@@ -406,8 +420,17 @@ hipError_t hipLaunchCooperativeKernel(const void* f,
  int deviceId = ihipGetDevice();
  hipFunction_t func = nullptr;
  HIP_RETURN_ONFAIL(PlatformState::instance().getStatFunc(&func, f, deviceId));
-
-  HIP_RETURN(ihipModuleLaunchKernel(func, gridDim.x * blockDim.x, gridDim.y * blockDim.y, gridDim.z * blockDim.z,
+  size_t globalWorkSizeX = gridDim.x * blockDim.x;
+  size_t globalWorkSizeY = gridDim.y * blockDim.y;
+  size_t globalWorkSizeZ = gridDim.z * blockDim.z;
+  if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
+      globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
+      globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
+    HIP_RETURN(hipErrorInvalidConfiguration);
+  }
+  HIP_RETURN(ihipModuleLaunchKernel(func, static_cast<uint32_t>(globalWorkSizeX),
+                                static_cast<uint32_t>(globalWorkSizeY),
+                                static_cast<uint32_t>(globalWorkSizeZ),
                                blockDim.x, blockDim.y, blockDim.z,
                                sharedMemBytes, hStream, kernelParams, nullptr, nullptr, nullptr, 0,
                                amd::NDRangeKernelCommand::CooperativeGroups));
@@ -452,7 +475,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
  uint64_t prevGridSize = 0;
  uint32_t firstDevice = 0;

-  // Sync the execution streams on all devices 
+  // Sync the execution streams on all devices
  if ((flags & hipCooperativeLaunchMultiDeviceNoPreSync) == 0) {
    for (int i = 0; i < numDevices; ++i) {
      amd::HostQueue* queue =
@@ -481,11 +504,16 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
      result = hipErrorInvalidDeviceFunction;
      HIP_RETURN(result);
    }
-
-    result = ihipModuleLaunchKernel(func,
-      launch.gridDim.x * launch.blockDim.x,
-      launch.gridDim.y * launch.blockDim.y,
-      launch.gridDim.z * launch.blockDim.z,
+    size_t globalWorkSizeX = launch.gridDim.x * launch.blockDim.x;
+    size_t globalWorkSizeY = launch.gridDim.y * launch.blockDim.y;
+    size_t globalWorkSizeZ = launch.gridDim.z * launch.blockDim.z;
+    if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
+        globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
+        globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
+      HIP_RETURN(hipErrorInvalidConfiguration);
+    }
+    result = ihipModuleLaunchKernel(func, static_cast<uint32_t>(globalWorkSizeX),
+      static_cast<uint32_t>(globalWorkSizeY), static_cast<uint32_t>(globalWorkSizeZ),
      launch.blockDim.x, launch.blockDim.y, launch.blockDim.z,
      launch.sharedMem, launch.stream, launch.args, nullptr, nullptr, nullptr,
      flags, extFlags, i, numDevices, prevGridSize, allGridSize, firstDevice);
@@ -495,7 +523,7 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
    prevGridSize += launch.gridDim.x * launch.gridDim.y * launch.gridDim.z;
  }

-  // Sync the execution streams on all devices 
+  // Sync the execution streams on all devices
  if ((flags & hipCooperativeLaunchMultiDeviceNoPostSync) == 0) {
    for (int i = 0; i < numDevices; ++i) {
      amd::HostQueue* queue =
@@ -52,24 +52,61 @@ hipError_t canAccessPeer(int* canAccessPeer, int deviceId, int peerDeviceId){
  amd::Device* device = nullptr;
  amd::Device* peer_device = nullptr;
  if (canAccessPeer == nullptr) {
-    HIP_RETURN(hipErrorInvalidValue);
+    return hipErrorInvalidValue;
  }
  /* Peer cannot be self */
  if (deviceId == peerDeviceId) {
    *canAccessPeer = 0;
-    HIP_RETURN(hipSuccess);
+    return hipSuccess;
  }
  /* Cannot exceed the max number of devices */
  if (static_cast<size_t>(deviceId) >= g_devices.size()
       || static_cast<size_t>(peerDeviceId) >= g_devices.size()) {
-    HIP_RETURN(hipErrorInvalidDevice);
+    return hipErrorInvalidDevice;
  }
  device = g_devices[deviceId]->devices()[0];
  peer_device = g_devices[peerDeviceId]->devices()[0];
  *canAccessPeer = static_cast<int>(std::find(device->p2pDevices_.begin(),
                                              device->p2pDevices_.end(), as_cl(peer_device))
                                              != device->p2pDevices_.end());
-  HIP_RETURN(hipSuccess);
+  return hipSuccess;
+}
+
+hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr,
+                                    int srcDevice, int dstDevice) {
+  HIP_INIT_API(hipDeviceGetP2PAttribute, value, attr, srcDevice, dstDevice);
+
+  hipError_t hip_error = hipSuccess;
+
+  if (value == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
+  if (srcDevice >= static_cast<int>(g_devices.size())
+      || dstDevice >= static_cast<int>(g_devices.size())) {
+    HIP_RETURN(hipErrorInvalidDevice);
+  }
+
+  switch (attr) {
+    case hipDevP2PAttrPerformanceRank :
+      assert(0 && "Unimplemented");
+      break;
+    case hipDevP2PAttrAccessSupported :
+      hip_error = canAccessPeer(value, srcDevice, dstDevice);
+      break;
+    case hipDevP2PAttrNativeAtomicSupported :
+      assert(0 && "Unimplemented");
+      break;
+    case hipDevP2PAttrHipArrayAccessSupported :
+      assert(0 && "Unimplemented");
+      break;
+    default :
+      DevLogPrintfError("Invalid attribute attr: %d ", attr);
+      hip_error = hipErrorInvalidValue;
+      break;
+  }
+
+  HIP_RETURN(hip_error);
 }

 hipError_t hipDeviceCanAccessPeer(int* canAccess, int deviceId, int peerDeviceId) {
@@ -271,7 +271,7 @@ hipError_t hipGetSymbolAddress(void** devPtr, const void* symbol) {

  HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(symbol, ihipGetDevice(), devPtr, &sym_size));

-  HIP_RETURN(hipSuccess);
+  HIP_RETURN(hipSuccess, *devPtr);
 }

 hipError_t hipGetSymbolSize(size_t* sizePtr, const void* symbol) {
@@ -280,7 +280,7 @@ hipError_t hipGetSymbolSize(size_t* sizePtr, const void* symbol) {
  hipDeviceptr_t device_ptr = nullptr;
  HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(symbol, ihipGetDevice(), &device_ptr, sizePtr));

-  HIP_RETURN(hipSuccess);
+  HIP_RETURN(hipSuccess, *sizePtr);
 }

 hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memory** amd_mem_obj,
@@ -351,10 +351,10 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor(
  size_t GprWaves = VgprWaves;
  if (wrkGrpInfo->usedSGPRs_ > 0) {
    size_t maxSGPRs;
-    if (device.info().gfxipVersion_ < 800) {
+    if (device.info().gfxipMajor_ < 8) {
      maxSGPRs = 512;
    }
-    else if (device.info().gfxipVersion_ < 1000) {
+    else if (device.info().gfxipMajor_ < 10) {
      maxSGPRs = 800;
    }
    else {
@@ -467,7 +467,7 @@ hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int*
  HIP_RETURN(ret);
 }

-hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, 
+hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
                                             hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk)
 {
  HIP_INIT_API(hipModuleOccupancyMaxActiveBlocksPerMultiprocessor, f, blockSize, dynSharedMemPerBlk);
@@ -486,7 +486,7 @@ hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
 }

 hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,
-                                                              hipFunction_t f, int blockSize, 
+                                                              hipFunction_t f, int blockSize,
                                                              size_t dynSharedMemPerBlk, unsigned int flags)
 {
  HIP_INIT_API(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, f, blockSize, dynSharedMemPerBlk, flags);
@@ -561,202 +561,6 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,

 namespace hip_impl {

-struct dl_phdr_info {
-  ELFIO::Elf64_Addr        dlpi_addr;
-  const char       *dlpi_name;
-  const ELFIO::Elf64_Phdr *dlpi_phdr;
-  ELFIO::Elf64_Half        dlpi_phnum;
-};
-
-extern "C" int dl_iterate_phdr(
-  int (*callback) (struct dl_phdr_info *info, size_t size, void *data), void *data
-);
-
-struct Symbol {
-  std::string name;
-  ELFIO::Elf64_Addr value = 0;
-  ELFIO::Elf_Xword size = 0;
-  ELFIO::Elf_Half sect_idx = 0;
-  uint8_t bind = 0;
-  uint8_t type = 0;
-  uint8_t other = 0;
-};
-
-inline Symbol read_symbol(const ELFIO::symbol_section_accessor& section, unsigned int idx) {
-  assert(idx < section.get_symbols_num());
-
-  Symbol r;
-  section.get_symbol(idx, r.name, r.value, r.size, r.bind, r.type, r.sect_idx, r.other);
-
-  return r;
-}
-
-template <typename P>
-inline ELFIO::section* find_section_if(ELFIO::elfio& reader, P p) {
-    const auto it = find_if(reader.sections.begin(), reader.sections.end(), std::move(p));
-
-    return it != reader.sections.end() ? *it : nullptr;
-}
-
-std::vector<std::pair<uintptr_t, std::string>> function_names_for(const ELFIO::elfio& reader,
-                                                                  ELFIO::section* symtab) {
-  std::vector<std::pair<uintptr_t, std::string>> r;
-  ELFIO::symbol_section_accessor symbols{reader, symtab};
-
-  for (auto i = 0u; i != symbols.get_symbols_num(); ++i) {
-    auto tmp = read_symbol(symbols, i);
-
-    if (tmp.type == STT_FUNC && tmp.sect_idx != SHN_UNDEF && !tmp.name.empty()) {
-      r.emplace_back(tmp.value, tmp.name);
-    }
-  }
-
-  return r;
-}
-
-const std::vector<std::pair<uintptr_t, std::string>>& function_names_for_process() {
-  static constexpr const char self[] = "/proc/self/exe";
-
-  static std::vector<std::pair<uintptr_t, std::string>> r;
-  static std::once_flag f;
-
-  std::call_once(f, []() {
-    ELFIO::elfio reader;
-
-    if (reader.load(self)) {
-      const auto it = find_section_if(
-          reader, [](const ELFIO::section* x) { return x->get_type() == SHT_SYMTAB; });
-
-      if (it) r = function_names_for(reader, it);
-    }
-  });
-
-  return r;
-}
-
-
-const std::unordered_map<uintptr_t, std::string>& function_names()
-{
-  static std::unordered_map<uintptr_t, std::string> r{
-    function_names_for_process().cbegin(),
-    function_names_for_process().cend()};
-  static std::once_flag f;
-
-  std::call_once(f, []() {
-    dl_iterate_phdr([](dl_phdr_info* info, size_t, void*) {
-      ELFIO::elfio reader;
-
-      if (reader.load(info->dlpi_name)) {
-        const auto it = find_section_if(
-            reader, [](const ELFIO::section* x) { return x->get_type() == SHT_SYMTAB; });
-
-        if (it) {
-          auto n = function_names_for(reader, it);
-
-          for (auto&& f : n) f.first += info->dlpi_addr;
-
-          r.insert(make_move_iterator(n.begin()), make_move_iterator(n.end()));
-        }
-      }
-      return 0;
-    },
-    nullptr);
-  });
-
-  return r;
-}
-
-std::vector<char> bundles_for_process() {
-  static constexpr const char self[] = "/proc/self/exe";
-  static constexpr const char kernel_section[] = ".kernel";
-  std::vector<char> r;
-
-  ELFIO::elfio reader;
-
-  if (reader.load(self)) {
-    auto it = find_section_if(
-        reader, [](const ELFIO::section* x) { return x->get_name() == kernel_section; });
-
-    if (it) r.insert(r.end(), it->get_data(), it->get_data() + it->get_size());
-  }
-
-  return r;
-}
-
-const std::vector<hipModule_t>& modules() {
-    static std::vector<hipModule_t> r;
-    static std::once_flag f;
-
-    std::call_once(f, []() {
-      static std::vector<std::vector<char>> bundles{bundles_for_process()};
-
-      dl_iterate_phdr(
-          [](dl_phdr_info* info, std::size_t, void*) {
-        ELFIO::elfio tmp;
-        if (tmp.load(info->dlpi_name)) {
-          const auto it = find_section_if(
-              tmp, [](const ELFIO::section* x) { return x->get_name() == ".kernel"; });
-
-          if (it) bundles.emplace_back(it->get_data(), it->get_data() + it->get_size());
-        }
-        return 0;
-      },
-      nullptr);
-
-      for (auto&& bundle : bundles) {
-        if (bundle.empty()) {
-          continue;
-        }
-        std::string magic(&bundle[0], sizeof(CLANG_OFFLOAD_BUNDLER_MAGIC_STR) - 1);
-        if (magic.compare(CLANG_OFFLOAD_BUNDLER_MAGIC_STR))
-          continue;
-
-        const auto obheader = reinterpret_cast<const hip::CodeObject::__ClangOffloadBundleHeader*>(&bundle[0]);
-        const auto* desc = &obheader->desc[0];
-        for (uint64_t i = 0; i < obheader->numBundles; ++i,
-             desc = reinterpret_cast<const hip::CodeObject::__ClangOffloadBundleDesc*>(
-                 reinterpret_cast<uintptr_t>(&desc->triple[0]) + desc->tripleSize)) {
-
-          std::string triple(desc->triple, sizeof(HCC_AMDGCN_AMDHSA_TRIPLE) - 1);
-          if (triple.compare(HCC_AMDGCN_AMDHSA_TRIPLE))
-            continue;
-
-          std::string target(desc->triple + sizeof(HCC_AMDGCN_AMDHSA_TRIPLE),
-                             desc->tripleSize - sizeof(HCC_AMDGCN_AMDHSA_TRIPLE));
-
-          if (isCompatibleCodeObject(target, hip::getCurrentDevice()->devices()[0]->info().name_)) {
-            hipModule_t module;
-            if (hipSuccess == hipModuleLoadData(&module, reinterpret_cast<const void*>(
-                reinterpret_cast<uintptr_t>(obheader) + desc->offset)))
-              r.push_back(module);
-              break;
-          }
-        }
-      }
-    });
-
-    return r;
-}
-
-const std::unordered_map<uintptr_t, hipFunction_t>& functions()
-{
-  static std::unordered_map<uintptr_t, hipFunction_t> r;
-  static std::once_flag f;
-
-  std::call_once(f, []() {
-    for (auto&& function : function_names()) {
-      for (auto&& module : modules()) {
-        hipFunction_t f;
-        if (hipSuccess == hipModuleGetFunction(&f, module, function.second.c_str())) {
-          r[function.first] = f;
-        }
-      }
-    }
-  });
-
-  return r;
-}
-
 void hipLaunchKernelGGLImpl(
  uintptr_t function_address,
  const dim3& numBlocks,
@@ -767,11 +571,19 @@ void hipLaunchKernelGGLImpl(
 {
  HIP_INIT();

-  const auto it = functions().find(function_address);
-  if (it == functions().cend())
-    assert(0);
+  hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
+  int deviceId = (s != nullptr)? s->DeviceId() : ihipGetDevice();
+  if (deviceId == -1) {
+    DevLogPrintfError("Wrong Device Id: %d \n", deviceId);
+  }

-  hipModuleLaunchKernel(it->second,
+  hipFunction_t func = nullptr;
+  hipError_t hip_error = PlatformState::instance().getStatFunc(&func, reinterpret_cast<void*>(function_address), deviceId);
+  if ((hip_error != hipSuccess) || (func == nullptr)) {
+    DevLogPrintfError("Cannot find the static function: 0x%x", function_address);
+  }
+
+  hipModuleLaunchKernel(func,
    numBlocks.x, numBlocks.y, numBlocks.z,
    dimBlocks.x, dimBlocks.y, dimBlocks.z,
    sharedMemBytes, stream, nullptr, kernarg);
@@ -815,63 +627,24 @@ hipError_t ihipLaunchKernel(const void* hostFunction,
  hipFunction_t func =  nullptr;
  hipError_t hip_error = PlatformState::instance().getStatFunc(&func, hostFunction, deviceId);
  if ((hip_error != hipSuccess) || (func == nullptr)) {
-#ifdef ATI_OS_LINUX
-    const auto it = hip_impl::functions().find(reinterpret_cast<uintptr_t>(hostFunction));
-    if (it == hip_impl::functions().cend()) {
-      DevLogPrintfError("Cannot find function: 0x%x \n", hostFunction);
-      HIP_RETURN(hipErrorInvalidDeviceFunction);
-    }
-    func = it->second;
-#else
    HIP_RETURN(hipErrorInvalidDeviceFunction);
-#endif
  }
-  HIP_RETURN(ihipModuleLaunchKernel(func, (gridDim.x * blockDim.x), (gridDim.y * blockDim.y),
-                                    (gridDim.z * blockDim.z), blockDim.x, blockDim.y, blockDim.z,
+  size_t globalWorkSizeX = gridDim.x * blockDim.x;
+  size_t globalWorkSizeY = gridDim.y * blockDim.y;
+  size_t globalWorkSizeZ = gridDim.z * blockDim.z;
+  if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
+      globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
+      globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
+    HIP_RETURN(hipErrorInvalidConfiguration);
+  }
+  HIP_RETURN(ihipModuleLaunchKernel(func, static_cast<uint32_t>(globalWorkSizeX),
+                                    static_cast<uint32_t>(globalWorkSizeY),
+                                    static_cast<uint32_t>(globalWorkSizeZ),
+                                    blockDim.x, blockDim.y, blockDim.z,
                                    sharedMemBytes, stream, args, nullptr, startEvent, stopEvent,
                                    flags));
 }

-// conversion routines between float and half precision
-static inline std::uint32_t f32_as_u32(float f) { union { float f; std::uint32_t u; } v; v.f = f; return v.u; }
-static inline float u32_as_f32(std::uint32_t u) { union { float f; std::uint32_t u; } v; v.u = u; return v.f; }
-static inline int clamp_int(int i, int l, int h) { return std::min(std::max(i, l), h); }
-
-// half float, the f16 is in the low 16 bits of the input argument
-static inline float __convert_half_to_float(std::uint32_t a) noexcept {
-  std::uint32_t u = ((a << 13) + 0x70000000U) & 0x8fffe000U;
-  std::uint32_t v = f32_as_u32(u32_as_f32(u) * u32_as_f32(0x77800000U)/*0x1.0p+112f*/) + 0x38000000U;
-  u = (a & 0x7fff) != 0 ? v : u;
-  return u32_as_f32(u) * u32_as_f32(0x07800000U)/*0x1.0p-112f*/;
-}
-
-// float half with nearest even rounding
-// The lower 16 bits of the result is the bit pattern for the f16
-static inline std::uint32_t __convert_float_to_half(float a) noexcept {
-  std::uint32_t u = f32_as_u32(a);
-  int e = static_cast<int>((u >> 23) & 0xff) - 127 + 15;
-  std::uint32_t m = ((u >> 11) & 0xffe) | ((u & 0xfff) != 0);
-  std::uint32_t i = 0x7c00 | (m != 0 ? 0x0200 : 0);
-  std::uint32_t n = ((std::uint32_t)e << 12) | m;
-  std::uint32_t s = (u >> 16) & 0x8000;
-  int b = clamp_int(1-e, 0, 13);
-  std::uint32_t d = (0x1000 | m) >> b;
-  d |= (d << b) != (0x1000 | m);
-  std::uint32_t v = e < 1 ? d : n;
-  v = (v >> 2) + (((v & 0x7) == 3) | ((v & 0x7) > 5));
-  v = e > 30 ? 0x7c00 : v;
-  v = e == 143 ? i : v;
-  return s | v;
-}
-
-extern "C" float __gnu_h2f_ieee(unsigned short h){
-  return __convert_half_to_float((std::uint32_t) h);
-}
-
-extern "C" unsigned short __gnu_f2h_ieee(float f){
-  return (unsigned short)__convert_float_to_half(f);
-}
-
 void PlatformState::init()
 {
  amd::ScopedLock lock(lock_);
@@ -1074,4 +847,3 @@ void PlatformState::popExec(ihipExec_t& exec) {
  exec = std::move(execStack_.top());
  execStack_.pop();
 }
-
@@ -22,6 +22,9 @@
 #include "hip_internal.hpp"
 #include "hip_event.hpp"
 #include "thread/monitor.hpp"
+#include "hip_prof_api.h"
+
+extern api_callbacks_table_t callbacks_table;

 static amd::Monitor streamSetLock{"Guards global stream set"};
 static std::unordered_set<hip::Stream*> streamSet;
@@ -50,7 +53,12 @@ Stream::Stream(hip::Device* dev, Priority p,

 // ================================================================================================
 bool Stream::Create() {
-  cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
+  // Enable queue profiling if a profiler is attached which sets the callback_table flag
+  // or if we force it with env var. This would enable time stamp collection for every
+  // command submitted to the stream(queue).
+  cl_command_queue_properties properties = (callbacks_table.is_enabled() ||
+                                            HIP_FORCE_QUEUE_PROFILING) ?
+                                             CL_QUEUE_PROFILING_ENABLE : 0;
  amd::CommandQueue::Priority p;
  switch (priority_) {
    case Priority::High:
@@ -64,8 +72,9 @@ bool Stream::Create() {
      p = amd::CommandQueue::Priority::Normal;
      break;
  }
-  amd::HostQueue* queue = new amd::HostQueue(*device_->asContext(), *device_->devices()[0], properties,
-                             amd::CommandQueue::RealTimeDisabled, p, cuMask_);
+  amd::HostQueue* queue = new amd::HostQueue(*device_->asContext(), *device_->devices()[0],
+                                             properties, amd::CommandQueue::RealTimeDisabled,
+                                             p, cuMask_);

  // Create a host queue
  bool result = (queue != nullptr) ? queue->create() : false;
@@ -202,6 +211,10 @@ static hipError_t ihipStreamCreate(hipStream_t* stream,
 hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags) {
  HIP_INIT_API(hipStreamCreateWithFlags, stream, flags);

+  if (stream == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
  HIP_RETURN(ihipStreamCreate(stream, flags, hip::Stream::Priority::Normal), *stream);
 }

@@ -209,6 +222,10 @@ hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags) {
 hipError_t hipStreamCreate(hipStream_t *stream) {
  HIP_INIT_API(hipStreamCreate, stream);

+  if (stream == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
  HIP_RETURN(ihipStreamCreate(stream, hipStreamDefault, hip::Stream::Priority::Normal), *stream);
 }

@@ -216,6 +233,10 @@ hipError_t hipStreamCreate(hipStream_t *stream) {
 hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority) {
  HIP_INIT_API(hipStreamCreateWithPriority, stream, flags, priority);

+  if (stream == nullptr) {
+    HIP_RETURN(hipErrorInvalidValue);
+  }
+
  hip::Stream::Priority streamPriority;
  if (priority <= hip::Stream::Priority::High) {
    streamPriority = hip::Stream::Priority::High;
@@ -269,8 +269,10 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject,
  case hipResourceTypePitch2D: {
    const cl_channel_order channelOrder = hip::getCLChannelOrder(hip::getNumChannels(pResDesc->res.pitch2D.desc), pTexDesc->sRGB);
    const cl_channel_type channelType = hip::getCLChannelType(hip::getArrayFormat(pResDesc->res.pitch2D.desc), pTexDesc->readMode);
+    const amd::Image::Format imageFormat({channelOrder, channelType});
    const cl_mem_object_type imageType = hip::getCLMemObjectType(pResDesc->resType);
-    const size_t imageSizeInBytes = pResDesc->res.pitch2D.pitchInBytes * pResDesc->res.pitch2D.height;
+    const size_t imageSizeInBytes = pResDesc->res.pitch2D.width * imageFormat.getElementSize() +
+                                    pResDesc->res.pitch2D.pitchInBytes * (pResDesc->res.pitch2D.height - 1);
    amd::Memory* buffer = getMemoryObjectWithOffset(pResDesc->res.pitch2D.devPtr, imageSizeInBytes);
    image = ihipImageCreate(channelOrder,
                            channelType,
@@ -7,7 +7,7 @@
 #include "test_common.h"

 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp timer.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvcc
 * TEST: %t
 * HIT_END
 */
@@ -7,7 +7,7 @@
 #include "test_common.h"

 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp timer.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvcc
 * TEST: %t
 * HIT_END
 */
@@ -7,7 +7,7 @@
 #include "test_common.h"

 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp timer.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvcc
 * TEST: %t
 * HIT_END
 */
@@ -0,0 +1,136 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <iostream>
+#include <chrono>
+#include "test_common.h"
+
+using namespace std;
+
+#define arraySize 16
+
+typedef struct d_uint16 {
+  uint data[arraySize];
+} d_uint16;
+
+__global__ void read_kernel(d_uint16 *src, ulong N, uint *dst) {
+
+  size_t idx = (blockIdx.x * blockDim.x + threadIdx.x);
+  size_t stride = blockDim.x * gridDim.x ;
+
+  uint tmp = 0;
+  for (size_t i = idx; i < N; i += stride) {
+    for (size_t j = 0; j < arraySize; j++) {
+      tmp += src[i].data[j];
+    }
+  }
+
+  atomicAdd(dst, tmp);
+}
+
+int main(int argc, char* argv[]) {
+  d_uint16 *dSrc;
+  d_uint16 *hSrc;
+  uint *dDst;
+  uint *hDst;
+  hipStream_t stream;
+  ulong N = 4 * 1024 * 1024;
+  uint nBytes = N * sizeof(d_uint16);
+
+  int nGpu = 0;
+  HIPCHECK(hipGetDeviceCount(&nGpu));
+  if (nGpu < 1) {
+      cout << "info: didn't find any GPU! skipping the test!\n";
+      passed();
+      return 0;
+  }
+
+  static int device = 0;
+  HIPCHECK(hipSetDevice(device));
+  hipDeviceProp_t props;
+  HIPCHECK(hipGetDeviceProperties(&props, device));
+  cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name <<
+      " with " << props.multiProcessorCount << " CUs" << endl;
+
+  const unsigned threadsPerBlock = 64;
+  const unsigned blocks = props.multiProcessorCount * 4;
+
+  uint inputData = 0x1;
+  int nIter = 1000;
+
+  hSrc =  new d_uint16[nBytes];
+  HIPCHECK(hSrc == 0 ? hipErrorOutOfMemory : hipSuccess);
+  hDst =  new uint;
+  hDst[0] = 0;
+  HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess);
+  for (size_t i = 0; i < N; i++) {
+    for (int j = 0; j < arraySize; j++) {
+      hSrc[i].data[j] = inputData;
+    }
+  }
+
+  HIPCHECK(hipMalloc(&dSrc, nBytes));
+  HIPCHECK(hipMalloc(&dDst, sizeof(uint)));
+
+  HIPCHECK(hipStreamCreate(&stream));
+
+  HIPCHECK(hipMemcpy(dSrc, hSrc, nBytes, hipMemcpyHostToDevice));
+  HIPCHECK(hipMemcpy(dDst, hDst, sizeof(uint), hipMemcpyHostToDevice));
+
+  hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst);
+  HIPCHECK(hipMemcpy(hDst, dDst, sizeof(uint), hipMemcpyDeviceToHost));
+  hipDeviceSynchronize();
+
+  if (hDst[0] != (nBytes / sizeof(uint))) {
+    cout << "info: Data validation failed for warm up run!" << endl;
+    cout << "info: expected " << nBytes / sizeof(uint) << " got " << hDst[0] << endl;
+    HIPCHECK(hipErrorUnknown);
+  }
+
+  // measure performance based on host time
+  auto all_start = chrono::steady_clock::now();
+
+  for(int i = 0; i < nIter; i++) {
+    hipLaunchKernelGGL(read_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dSrc, N, dDst);
+  }
+  hipDeviceSynchronize();
+
+  auto all_end = chrono::steady_clock::now();
+  chrono::duration<double> all_kernel_time = all_end - all_start;
+
+  // read speed in GB/s
+  double perf = ((double)nBytes * nIter * (double)(1e-09)) / all_kernel_time.count();
+
+  cout << "info: average read speed of " << perf << " GB/s " << "achieved for memory size of " <<
+      nBytes / (1024 * 1024) << " MB" << endl;
+
+  delete [] hSrc;
+  delete hDst;
+  hipFree(dSrc);
+  hipFree(dDst);
+  HIPCHECK(hipStreamDestroy(stream));
+
+  passed();
+}
@@ -0,0 +1,126 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <iostream>
+#include <chrono>
+#include "test_common.h"
+
+using namespace std;
+
+#define arraySize 16
+
+typedef struct d_uint16 {
+  uint data[arraySize];
+} d_uint16;
+
+__global__ void write_kernel(d_uint16 *dst, ulong N, d_uint16 pval) {
+    size_t idx = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x;
+    for (size_t i = idx; i < N; i += stride) {
+      dst[i] = pval;
+    }
+};
+
+int main(int argc, char* argv[]) {
+  d_uint16 *dDst;
+  d_uint16 *hDst;
+  hipStream_t stream;
+  ulong N = 4 * 1024 * 1024;
+  uint nBytes = N * sizeof(d_uint16);
+  d_uint16 pval;
+
+  for (int i = 0; i < arraySize; i++) {
+    pval.data[i] = 0xabababab;
+  }
+
+  int nGpu = 0;
+  HIPCHECK(hipGetDeviceCount(&nGpu));
+  if (nGpu < 1) {
+      cout << "info: didn't find any GPU! skipping the test!\n";
+      passed();
+      return 0;
+  }
+
+  static int device = 0;
+  HIPCHECK(hipSetDevice(device));
+  hipDeviceProp_t props;
+  HIPCHECK(hipGetDeviceProperties(&props, device));
+  cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name <<
+      " with " << props.multiProcessorCount << " CUs" << endl;
+
+  size_t threadsPerBlock = 64;
+  size_t blocks = props.multiProcessorCount * 4;
+
+  uint inputData = 0xabababab;
+  int nIter = 1000;
+
+  hDst =  new d_uint16[nBytes];
+  HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess);
+  for (size_t i = 0; i < N; i++) {
+    for (size_t j = 0; j < arraySize; j++) {
+      hDst[i].data[j] = 0;
+    }
+  }
+
+  HIPCHECK(hipMalloc(&dDst, nBytes));
+
+  HIPCHECK(hipStreamCreate(&stream));
+
+  hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval);
+  HIPCHECK(hipMemcpy(hDst, dDst, nBytes , hipMemcpyDeviceToHost));
+  hipDeviceSynchronize();
+
+  for (uint i = 0; i < N; i++) {
+    for (uint j = 0; j < arraySize; j++) {
+      if (hDst[i].data[j] != inputData) {
+        cout << "info: Data validation failed for warm up run! " << endl;
+        cout << "at index i: " << i << " element j: " << j << endl;
+        cout << hex << "expected 0x" << inputData << " but got 0x" << hDst[i].data[j] << endl;
+        HIPCHECK(hipErrorUnknown);
+      }
+    }
+  }
+
+  auto all_start = chrono::steady_clock::now();
+  for(int i = 0; i < nIter; i++) {
+    hipLaunchKernelGGL(write_kernel, dim3(blocks), dim3(threadsPerBlock), 0, stream, dDst, N, pval);
+  }
+  hipDeviceSynchronize();
+  auto all_end = chrono::steady_clock::now();
+  chrono::duration<double> all_kernel_time = all_end - all_start;
+
+  // read speed in GB/s
+  double perf = ((double)nBytes * nIter * (double)(1e-09)) / all_kernel_time.count();
+
+  cout << "info: average write speed of " << perf << " GB/s " << "achieved for memory size of " <<
+      nBytes / (1024 * 1024) << " MB" << endl;
+
+
+  delete [] hDst;
+  hipFree(dDst);
+  HIPCHECK(hipStreamDestroy(stream));
+
+  passed();
+}
@@ -34,12 +34,13 @@ THE SOFTWARE.
 #include <array>
 #include "hip/hip_runtime.h"
 /* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD_CMD: hipPerfHostNumaAlloc %hc -I%S/../../src %S/%s %S/../../src/test_common.cpp -lnuma -o %T/%t EXCLUDE_HIP_PLATFORM nvcc
 * TEST: %t
 * HIT_END
 */

-// To run it correctly, we must not export HIP_VISIBLE_DEVICES
+// To run it correctly, we must not export HIP_VISIBLE_DEVICES.
+// And we must explicitly link libnuma because of numa api move_pages().
 #define NUM_PAGES 4
 char *h = nullptr;
 char *d_h = nullptr;
@@ -127,6 +128,7 @@ bool test(int cpuId, int gpuId, int numaMode, unsigned int hostMallocflags) {
  printf("\n");

  HIPCHECK(hipHostFree((void* )h));
+  hipHostUnregister(m);
  free(m);

  if (cpuId >= 0 && (numaMode == MPOL_BIND || numaMode == MPOL_PREFERRED)) {
@@ -149,8 +151,7 @@ bool runTest(const int &cpuCount, const int &gpuCount,

    for (int i = 0; i < cpuCount; i++) {
      for (int j = 0; j < gpuCount; j++) {
-        if (!test(i, j, mode[m],
-                  hipHostMallocDefault | hipHostMallocNumaUser)) {
+        if (!test(i, j, mode[m], hostMallocflags)) {
          return false;
        }
      }
@@ -0,0 +1,250 @@
+/*
+ Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ */
+
+/* HIT_START
+ * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * TEST: %t
+ * HIT_END
+ */
+
+#include <iostream>
+#include <chrono>
+#include "test_common.h"
+
+using namespace std;
+
+#define sharedMemSize1 2048
+#define sharedMemSize2 256
+
+__global__ void sharedMemReadSpeed1(float *outBuf, ulong N) {
+
+  size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
+  size_t lid = threadIdx.x;
+  __shared__ float local[sharedMemSize1];
+
+  float val1 = 0;
+  float val2 = 0;
+  float val3 = 0;
+  float val4 = 0;
+
+  for (int i = 0; i < (sharedMemSize1 / 64); i++) {
+    local[lid + i * 64] = lid;
+  }
+
+  __syncthreads();
+
+  val1 += local[lid];
+  val2 += local[lid + 64];
+  val3 += local[lid + 128];
+  val4 += local[lid + 192];
+  val1 += local[lid + 256];
+  val2 += local[lid + 320];
+  val3 += local[lid + 384];
+  val4 += local[lid + 448];
+  val1 += local[lid + 512];
+  val2 += local[lid + 576];
+  val3 += local[lid + 640];
+  val4 += local[lid + 704];
+  val1 += local[lid + 768];
+  val2 += local[lid + 832];
+  val3 += local[lid + 896];
+  val4 += local[lid + 960];
+  val1 += local[lid + 1024];
+  val2 += local[lid + 1088];
+  val3 += local[lid + 1152];
+  val4 += local[lid + 1216];
+  val1 += local[lid + 1280];
+  val2 += local[lid + 1344];
+  val3 += local[lid + 1408];
+  val4 += local[lid + 1472];
+  val1 += local[lid + 1536];
+  val2 += local[lid + 1600];
+  val3 += local[lid + 1664];
+  val4 += local[lid + 1728];
+  val1 += local[lid + 1792];
+  val2 += local[lid + 1856];
+  val3 += local[lid + 1920];
+  val4 += local[lid + 1984];
+
+  if (gid < N) {
+    outBuf[gid] = val1 + val2 + val3 + val4;
+  }
+};
+
+__global__ void sharedMemReadSpeed2(float *outBuf, ulong N) {
+  size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
+  size_t lid = threadIdx.x;
+  __shared__ float local[sharedMemSize2];
+
+  float val0 = 0.0f;
+  float val1 = 0.0f;
+
+  for (int i = 0; i < (sharedMemSize2 / 64); i++) {
+    local[lid + i * 64] = lid;
+  }
+
+  __syncthreads();
+
+#pragma nounroll
+  for (uint i = 0; i < 32; i++) {
+    val0 += local[8 * i + 0];
+    val1 += local[8 * i + 1];
+    val0 += local[8 * i + 2];
+    val1 += local[8 * i + 3];
+    val0 += local[8 * i + 4];
+    val1 += local[8 * i + 5];
+    val0 += local[8 * i + 6];
+    val1 += local[8 * i + 7];
+  }
+
+  if (gid < N) {
+    outBuf[gid] = val0 + val1;
+  }
+};
+
+int main(int argc, char *argv[]) {
+  float *dDst;
+  float *hDst;
+  hipStream_t stream;
+  constexpr uint numSizes = 4;
+  constexpr uint Sizes[numSizes] = {262144, 1048576, 4194304, 16777216};
+  uint numReads1 = 32;
+  uint numReads2 = 256;
+  uint sharedMemSizeBytes1 = sharedMemSize1 * sizeof(float);
+  uint sharedMemSizeBytes2 = sharedMemSize2 * sizeof(float);
+  int nIter = 1000;
+  const unsigned threadsPerBlock = 64;
+
+  int nGpu = 0;
+  HIPCHECK(hipGetDeviceCount(&nGpu));
+  if (nGpu < 1) {
+    cout << "info: didn't find any GPU! skipping the test!\n";
+    passed();
+    return 0;
+  }
+
+  static int device = 0;
+  HIPCHECK(hipSetDevice(device));
+  hipDeviceProp_t props;
+  HIPCHECK(hipGetDeviceProperties(&props, device));
+  cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
+      << " with " << props.multiProcessorCount << " CUs" << endl;
+
+  HIPCHECK(hipStreamCreate(&stream));
+
+  for (int nTest = 0; nTest < numSizes; nTest++) {
+    uint nBytes = Sizes[nTest % numSizes];
+    ulong N = nBytes / sizeof(float);
+    const unsigned blocks = N / threadsPerBlock;
+
+    hDst = new float[nBytes];
+    HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess);
+    memset(hDst, 0, nBytes);
+
+    HIPCHECK(hipMalloc(&dDst, nBytes));
+    HIPCHECK(hipMemcpy(dDst, hDst, nBytes, hipMemcpyHostToDevice));
+
+    hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks), dim3(threadsPerBlock),
+        0, stream, dDst, N);
+    HIPCHECK(hipMemcpy(hDst, dDst, nBytes, hipMemcpyDeviceToHost));
+    hipDeviceSynchronize();
+
+    int tmp = 0;
+    for (int i = 0; i < N; i++) {
+      if (i % threadsPerBlock == 0) {
+        tmp = 0;
+      }
+      if (hDst[i] != tmp) {
+        cout << "info: Data validation failed for warm up run!" << endl;
+        cout << "info: expected " << tmp << " got " << hDst[i] << endl;
+        HIPCHECK (hipErrorUnknown);
+      }
+      tmp += threadsPerBlock / 2;
+    }
+
+    auto all_start = chrono::steady_clock::now();
+    for (int i = 0; i < nIter; i++) {
+      hipLaunchKernelGGL(sharedMemReadSpeed1, dim3(blocks),
+          dim3(threadsPerBlock), 0, stream, dDst, N);
+    }
+    hipDeviceSynchronize();
+
+    auto all_end = chrono::steady_clock::now();
+    chrono::duration<double> all_kernel_time = all_end - all_start;
+
+    // read speed in GB/s
+    double perf = ((double) blocks * threadsPerBlock
+        * (numReads1 * sizeof(float) + sharedMemSizeBytes1 / 64) * nIter
+        * (double) (1e-09)) / all_kernel_time.count();
+
+    cout << "info: read speed = " << setw(8) << perf << " GB/s for "
+        << sharedMemSizeBytes1 / 1024 << " KB shared memory"
+            " with " << setw(8) << blocks * threadsPerBlock << " threads, "
+        << setw(4) << numReads1 << " reads in sharedMemReadSpeed1 kernel" << endl;
+
+    delete[] hDst;
+    hipFree(dDst);
+  }
+
+
+  for (int nTest = 0; nTest < numSizes; nTest++) {
+    uint nBytes = Sizes[nTest % numSizes];
+    ulong N = nBytes / sizeof(float);
+    const unsigned blocks = N / threadsPerBlock;
+
+    hDst = new float[nBytes];
+    HIPCHECK(hDst == 0 ? hipErrorOutOfMemory : hipSuccess);
+    memset(hDst, 0, nBytes);
+
+    HIPCHECK(hipMalloc(&dDst, nBytes));
+    HIPCHECK(hipMemcpy(dDst, hDst, nBytes, hipMemcpyHostToDevice));
+
+    hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks), dim3(threadsPerBlock),
+        0, stream, dDst, N);
+    HIPCHECK(hipMemcpy(hDst, dDst, nBytes, hipMemcpyDeviceToHost));
+    hipDeviceSynchronize();
+
+    auto all_start = chrono::steady_clock::now();
+    for (int i = 0; i < nIter; i++) {
+      hipLaunchKernelGGL(sharedMemReadSpeed2, dim3(blocks),
+          dim3(threadsPerBlock), 0, stream, dDst, N);
+    }
+    hipDeviceSynchronize();
+
+    auto all_end = chrono::steady_clock::now();
+    chrono::duration<double> all_kernel_time = all_end - all_start;
+
+    // read speed in GB/s
+    double perf = ((double) blocks * threadsPerBlock
+        * (numReads2 * sizeof(float) + sharedMemSizeBytes2 / 64) * nIter
+        * (double) (1e-09)) / all_kernel_time.count();
+
+    cout << "info: read speed = " << setw(8) << perf << " GB/s for "
+        << sharedMemSizeBytes2 / 1024 << " KB shared memory"
+            " with " << setw(8) << blocks * threadsPerBlock << " threads, "
+        << setw(4) << numReads2 << " reads in sharedMemReadSpeed2 kernel" << endl;
+
+    delete[] hDst;
+    hipFree(dDst);
+  }
+
+  HIPCHECK(hipStreamDestroy(stream));
+
+  passed();
+}
@@ -22,7 +22,7 @@ THE SOFTWARE.


 /* HIT_START
- * BUILD: %t %s ../test_common.cpp
+ * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
 * TEST: %t
 * HIT_END
 */
@@ -19,12 +19,13 @@ THE SOFTWARE.


 /* HIT_START
- * BUILD: %t %s ../test_common.cpp NVCC_OPTIONS -std=c++11
+ * BUILD: %t %s ../test_common.cpp NVCC_OPTIONS -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
 * TEST: %t
 * HIT_END
 */

 #include "test_common.h"
+#include <hip/hip_runtime.h>
 #include "hip/hip_fp16.h"

 #define test_passed(test_name) \
@@ -18,7 +18,7 @@
 * */

 /* HIT_START
- * BUILD_CMD: hipMalloc %cxx -D__HIP_PLATFORM_HCC__ -I%hip-path/include -I/opt/rocm/include %S/%s -Wl,--rpath=%hip-path/lib %hip-path/lib/libhip_hcc.so -o %T/%t -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
+ * BUILD_CMD: hipMalloc %cxx -D__HIP_PLATFORM_HCC__ -I%hip-path/include -I/opt/rocm/include %S/%s -Wl,--rpath=%hip-path/lib %hip-path/lib/libamdhip64.so -o %T/%t -std=c++11 EXCLUDE_HIP_PLATFORM nvcc
 * TEST: %t EXCLUDE_HIP_PLATFORM nvcc
 * HIT_END
 */
@@ -21,7 +21,7 @@
 /* HIT_START
 * BUILD_CMD: gpu.o %hc -I%hip-path/include -g -c %S/gpu.cpp -o %T/gpu.o EXCLUDE_HIP_PLATFORM nvcc rocclr
 * BUILD_CMD: launchkernel.o %hc -D__HIP_PLATFORM_HCC__ -g -I%hip-path/include -c %S/LaunchKernel.c -o %T/launchkernel.o EXCLUDE_HIP_PLATFORM nvcc rocclr
- * BUILD_CMD: LaunchKernel %hc %T/launchkernel.o %T/gpu.o -g -Wl,--rpath=%hip-path/lib %hip-path/lib/libhip_hcc.so -o %T/%t DEPENDS gpu.o launchkernel.o EXCLUDE_HIP_PLATFORM nvcc rocclr
+ * BUILD_CMD: LaunchKernel %hc %T/launchkernel.o %T/gpu.o -g -Wl,--rpath=%hip-path/lib %hip-path/lib/libamdhip64.so -o %T/%t DEPENDS gpu.o launchkernel.o EXCLUDE_HIP_PLATFORM nvcc rocclr
 * TEST: %t EXCLUDE_HIP_PLATFORM nvcc rocclr
 * HIT_END
 */
@@ -19,7 +19,7 @@

 /* HIT_START
 * BUILD_CMD: hipMalloc %cc -D__HIP_PLATFORM_NVCC__ -I%hip-path/include -I/usr/local/cuda/include %S/%s -o %T/hipMalloc_nv -L/usr/local/cuda/lib64 -lcudart EXCLUDE_HIP_PLATFORM hcc rocclr
- * BUILD_CMD: hipMalloc %cc -D__HIP_PLATFORM_HCC__ -I%hip-path/include %S/%s -Wl,--rpath=%hip-path/lib %hip-path/lib/libhip_hcc.so -o %T/hipMalloc_hcc EXCLUDE_HIP_PLATFORM nvcc rocclr
+ * BUILD_CMD: hipMalloc %cc -D__HIP_PLATFORM_HCC__ -I%hip-path/include %S/%s -Wl,--rpath=%hip-path/lib %hip-path/lib/libamdhip64.so -o %T/hipMalloc_hcc EXCLUDE_HIP_PLATFORM nvcc rocclr
 * TEST: hipMalloc_nv EXCLUDE_HIP_PLATFORM hcc rocclr
 * TEST: hipMalloc_hcc EXCLUDE_HIP_PLATFORM nvcc rocclr
 * HIT_END
@@ -18,7 +18,7 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s ../test_common.cpp
+ * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
 * TEST: %t
 * HIT_END
 */
@@ -43,6 +43,7 @@ void single_process() {

  // Negative, Make sure we return error when an offset of original ptr is passed
  ipc_offset_dptr = ipc_dptr + (OFFSET * sizeof(int));
+  // HIP API return value differs from CUDA's return type
  assert(hipErrorInvalidDevicePointer == hipIpcGetMemHandle(&ipc_offset_handle, ipc_offset_dptr));

  // Get handle for the device_ptr
@@ -26,7 +26,7 @@
 /* HIT_START
 * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
 * TEST_NAMED: %t  hipDeviceGetPCIBusId-vs-hipDeviceGetAttribute --tests 0x1
- * TEST_NAMED: %t  hipDeviceGetPCIBusId-vs-lspci --tests 0x2
+ * TEST_NAMED: %t  hipDeviceGetPCIBusId-vs-lspci --tests 0x2 EXCLUDE_HIP_PLATFORM nvcc
 * HIT_END
 */

@@ -106,8 +106,13 @@ bool compareHipDeviceGetPCIBusIdWithLspci() {
  getPciBusId(deviceCount, hipDeviceList);

  // Get lspci device list and compare with hip device list
+#if defined(__CUDA_ARCH__)
+  char const *command = "lspci -D | grep controller | grep NVIDIA | "
+                        "cut -d ' ' -f 1";
+#else
  char const *command = "lspci -D | grep controller | grep AMD/ATI | "
                        "cut -d ' ' -f 1";
+#endif
  fpipe = popen(command, "r");

  if (fpipe == nullptr) {
@@ -467,6 +467,7 @@ void HipMemcpyWithStreamMultiThreadtests::TestkindDefaultForDtoD(void) {
                      &A_h[0], &B_h[0], &C_h[0], N, false);

  for (int i=1; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
    HIPCHECK(hipMalloc(&A_d[i], Nbytes));
    HIPCHECK(hipMalloc(&B_d[i], Nbytes));
    HIPCHECK(hipMalloc(&C_d[i], Nbytes));
@@ -476,6 +477,7 @@ void HipMemcpyWithStreamMultiThreadtests::TestkindDefaultForDtoD(void) {

  hipStream_t stream[numDevices];
  for (int i=0; i < numDevices; ++i) {
+    HIPCHECK(hipSetDevice(i));
    HIPCHECK(hipStreamCreate(&stream[i]));
  }

@@ -166,11 +166,11 @@ bool testhipMemset2AsyncOps() {
  hipStream_t s;
  hipStreamCreate(&s);
  hipMemsetAsync(p2, 0, 32*32*4, s);
-  hipMemsetD32Async(p3, 0x3fe00000, 32*32, s );
+  hipMemsetD32Async((hipDeviceptr_t)p3, 0x3fe00000, 32*32, s );
  hipStreamSynchronize(s);
  for (int i = 0; i < 256; ++i) {
    hipMemsetAsync(p2, 0, 32*32*4, s);
-    hipMemsetD32Async(p3, 0x3fe00000, 32*32, s );
+    hipMemsetD32Async((hipDeviceptr_t)p3, 0x3fe00000, 32*32, s );
  }
  hipStreamSynchronize(s);
  hipDeviceSynchronize();
@@ -60,8 +60,6 @@ void run(const std::vector<char>& buffer, int deviceNo) {
  hipSetDevice(deviceNo);
  hipModule_t Module;
  hipFunction_t Function;
-  HIPCHECK(hipModuleLoadData(&Module, &buffer[0]));
-  HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));

  float *A, *B, *Ad, *Bd;
  A = new float[LEN];
@@ -78,6 +76,9 @@ void run(const std::vector<char>& buffer, int deviceNo) {
  HIPCHECK(hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice));
  HIPCHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice));

+  HIPCHECK(hipModuleLoadData(&Module, &buffer[0]));
+  HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
+
  hipStream_t stream;
  HIPCHECK(hipStreamCreate(&stream));

@@ -56,8 +56,6 @@ std::vector<char> load_file() {
 void run(const std::vector<char>& buffer) {
  hipModule_t Module;
  hipFunction_t Function;
-  HIPCHECK(hipModuleLoadData(&Module, &buffer[0]));
-  HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));

  float *A, *B, *Ad, *Bd;
  A = new float[LEN];
@@ -74,6 +72,9 @@ void run(const std::vector<char>& buffer) {
  HIPCHECK(hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice));
  HIPCHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice));

+  HIPCHECK(hipModuleLoadData(&Module, &buffer[0]));
+  HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name));
+
  hipStream_t stream;
  HIPCHECK(hipStreamCreate(&stream));

@@ -21,7 +21,7 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD_CMD: tex2d_kernel.code %hc --genco %S/tex2d_kernel.cpp -o tex2d_kernel.code EXCLUDE_HIP_PLATFORM rocclr
+ * BUILD_CMD: tex2d_kernel.code %hc --genco %S/tex2d_kernel.cpp -o tex2d_kernel.code EXCLUDE_HIP_PLATFORM rocclr nvcc
 * HIT_END
 */

@@ -81,6 +81,13 @@ static void HIPRT_CB Callback1(hipStream_t stream, hipError_t status,
  sleep(SECONDS_TO_WAIT);
 }

+bool rangedCompare(long a, long b) {
+  auto diff = b - a;
+  if (diff < 0) diff *= -1;
+  if (diff < 500) return true;
+  return false;
+}
+

 int main(int argc, char* argv[]) {
  float *A_d, *C_d;
@@ -139,7 +146,8 @@ int main(int argc, char* argv[]) {
  // completes the execution. Therefore the hipStreamSynchronize() in the
  // main thread should hardly take any time to complete.

-  if (duration.count() < SECONDS_TO_WAIT * TO_MICROSECONDS) {
+  if ((duration.count() < (SECONDS_TO_WAIT * TO_MICROSECONDS)) ||
+      (rangedCompare(duration.count(), SECONDS_TO_WAIT * TO_MICROSECONDS))) {
    passed();
  } else {
    failed("hipStreamSynchronize is waiting untill Callback() completes.");
@@ -18,7 +18,7 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s ../../test_common.cpp
+ * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM all
 * TEST: %t
 * HIT_END
 */
@@ -45,7 +45,7 @@ int main(int argc, char *argv[]) {

    // Check if priorities are indeed supported
    if ((priority_low + priority_high) != 0) {
-      failed("Priorities are not supported");
+      passed(); // exit the test since priorities are not supported
    }

    // Checking Priority of default stream