diff --git a/projects/hip/docs/.sphinx/_toc.yml.in b/projects/hip/docs/.sphinx/_toc.yml.in
index d44d7e721e..e0cb4610fc 100644
--- a/projects/hip/docs/.sphinx/_toc.yml.in
+++ b/projects/hip/docs/.sphinx/_toc.yml.in
@@ -5,6 +5,8 @@ subtrees:
     - file: user_guide/programming_manual
     - file: user_guide/hip_rtc
     - file: user_guide/faq
+    - file: user_guide/hip_porting_guide    
+    - file: user_guide/hip_porting_driver_api
 - caption: How to Guides
   entries:
   - file: how_to_guides/install.md
diff --git a/projects/hip/docs/user_guide/hip_porting_driver_api.md b/projects/hip/docs/user_guide/hip_porting_driver_api.md
new file mode 100644
index 0000000000..08df9aaa8a
--- /dev/null
+++ b/projects/hip/docs/user_guide/hip_porting_driver_api.md
@@ -0,0 +1,287 @@
+# Porting CUDA Driver API
+
+## Introduction to the CUDA Driver and Runtime APIs
+CUDA provides a separate CUDA Driver and Runtime APIs. The two APIs have significant overlap in functionality:
+- Both APIs support events, streams, memory management, memory copy, and error handling.
+- Both APIs deliver similar performance.
+- Driver APIs calls begin with the prefix `cu` while Runtime APIs begin with the prefix `cuda`. For example, the Driver API API contains `cuEventCreate` while the Runtime API contains `cudaEventCreate`, with similar functionality.
+- The Driver API defines a different but largely overlapping error code space than the Runtime API, and uses a different coding convention. For example, Driver API defines `CUDA_ERROR_INVALID_VALUE` while the Runtime API defines `cudaErrorInvalidValue`
+
+
+The Driver API offers two additional pieces of functionality not provided by the Runtime API: cuModule and cuCtx APIs.
+
+### cuModule API
+The Module section of the Driver API provides additional control over how and when accelerator code objects are loaded.
+For example, the driver API allows code objects to be loaded from files or memory pointers.
+Symbols for kernels or global data can be extracted from the loaded code objects.
+In contrast, the Runtime API automatically loads and (if necessary) compiles all of the kernels from an executable binary when run.
+In this mode, NVCC must be used to compile kernel code so the automatic loading can function correctly.
+
+Both Driver and Runtime APIs define a function for launching kernels (called `cuLaunchKernel` or `cudaLaunchKernel`.
+The kernel arguments and the execution configuration (grid dimensions, group dimensions, dynamic shared memory, and stream) are passed as arguments to the launch function.
+The Runtime additionally provides the `<<< >>>` syntax for launching kernels, which resembles a special function call and is easier to use than explicit launch API (in particular with respect to handling of kernel arguments).
+However, this syntax is not standard C++ and is available only when NVCC is used to compile the host code.
+
+The Module features are useful in an environment which generates the code objects directly, such as a new accelerator language front-end.
+Here, NVCC is not used. Instead, the environment may have a different kernel language or different compilation flow.
+Other environments have many kernels and do not want them to be all loaded automatically.
+The Module functions can be used to load the generated code objects and launch kernels.
+As we will see below, HIP defines a Module API which provides similar explicit control over code object management.
+
+### cuCtx API
+The Driver API defines "Context" and "Devices" as separate entities.
+Contexts contain a single device, and a device can theoretically have multiple contexts.
+Each context contains a set of streams and events specific to the context.
+Historically contexts also defined a unique address space for the GPU, though this may no longer be the case in Unified Memory platforms (since the CPU and all the devices in the same process share a single unified address space).
+The Context APIs also provide a mechanism to switch between devices, which allowed a single CPU thread to send commands to different GPUs.
+HIP as well as a recent versions of CUDA Runtime provide other mechanisms to accomplish this feat - for example using streams or `cudaSetDevice`.
+
+The CUDA Runtime API unifies the Context API with the Device API. This simplifies the APIs and has little loss of functionality since each Context can contain a single device, and the benefits of multiple contexts has been replaced with other interfaces.
+HIP provides a context API to facilitate easy porting from existing Driver codes.
+In HIP, the Ctx functions largely provide an alternate syntax for changing the active device.
+
+Most new applications will prefer to use `hipSetDevice` or the stream APIs , therefore HIP has marked hipCtx APIs as **deprecated**. Support for these APIs may not be available in future releases. For more details on deprecated APIs please refer [HIP deprecated APIs](https://github.com/ROCm-Developer-Tools/HIP/tree/master/docs/markdown/hip_deprecated_api_list.md).
+
+## HIP Module and Ctx APIs
+
+Rather than present two separate APIs, HIP extends the HIP API with new APIs for Modules and Ctx control.
+
+### hipModule API
+
+Like the CUDA Driver API, the Module API provides additional control over how code is loaded, including options to load code from files or from in-memory pointers.
+NVCC and HIP-Clang target different architectures and use different code object formats: NVCC is `cubin` or `ptx` files, while the HIP-Clang path is the `hsaco` format.
+The external compilers which generate these code objects are responsible for generating and loading the correct code object for each platform.
+Notably, there is not a fat binary format that can contain code for both NVCC and HIP-Clang platforms. The following table summarizes the formats used on each platform:
+
+| Format        | APIs                             | NVCC               | HIP-CLANG    |
+| ---           | ---                              | ---                | ---          |
+| Code Object   | hipModuleLoad, hipModuleLoadData | .cubin or PTX text | .hsaco       |
+| Fat Binary    | hipModuleLoadFatBin              | .fatbin            | .hip_fatbin  |
+
+`hipcc` uses HIP-Clang or NVCC to compile host codes. Both of these may embed code objects into the final executable, and these code objects will be automatically loaded when the application starts.
+The hipModule API can be used to load additional code objects, and in this way provides an extended capability to the automatically loaded code objects.
+HIP-Clang allows both of these capabilities to be used together, if desired. Of course it is possible to create a program with no kernels and thus no automatic loading.
+
+
+### hipCtx API
+HIP provides a `Ctx` API as a thin layer over the existing Device functions. This Ctx API can be used to set the current context, or to query properties of the device associated with the context.
+The current context is implicitly used by other APIs such as `hipStreamCreate`.
+
+### hipify translation of CUDA Driver API
+The HIPIFY tools convert CUDA Driver APIs for streams, events, modules, devices, memory management, context, profiler to the equivalent HIP driver calls. For example, `cuEventCreate` will be translated to `hipEventCreate`.
+HIPIFY tools also convert error codes from the Driver namespace and coding convention to the equivalent HIP error code. Thus, HIP unifies the APIs for these common functions.
+
+The memory copy API requires additional explanation. The CUDA driver includes the memory direction in the name of the API (ie `cuMemcpyH2D`) while the CUDA driver API provides a single memory copy API with a parameter that specifies the direction and additionally supports a "default" direction where the runtime determines the direction automatically.
+HIP provides APIs with both styles: for example, `hipMemcpyH2D` as well as `hipMemcpy`.
+The first flavor may be faster in some cases since they avoid host overhead to detect the different memory directions.
+
+HIP defines a single error space, and uses camel-case for all errors (i.e. `hipErrorInvalidValue`).
+
+#### Address Spaces
+HIP-Clang defines a process-wide address space where the CPU and all devices allocate addresses from a single unified pool.
+Thus addresses may be shared between contexts, and unlike the original CUDA definition a new context does not create a new address space for the device.
+
+#### Using hipModuleLaunchKernel
+`hipModuleLaunchKernel` is `cuLaunchKernel` in HIP world. It takes the same arguments as `cuLaunchKernel`.
+
+#### Additional Information
+- HIP-Clang creates a primary context when the HIP API is called. So in a pure driver API code, HIP-Clang will create a primary context while HIP/NVCC will have empty context stack.
+HIP-Clang will push primary context to context stack when it is empty. This can have subtle differences on applications which mix the runtime and driver APIs.
+
+### hip-clang Implementation Notes
+#### .hip_fatbin
+hip-clang links device code from different translation units together. For each device target, a code object is generated. Code objects for different device targets are bundled by clang-offload-bundler as one fatbinary, which is embeded as a global symbol `__hip_fatbin` in the .hip_fatbin section of the ELF file of the executable or shared object.
+
+#### Initialization and Termination Functions
+hip-clang generates initializatiion and termination functions for each translation unit for host code compilation. The initialization functions call `__hipRegisterFatBinary` to register the fatbinary embeded in the ELF file. They also call `__hipRegisterFunction` and `__hipRegisterVar` to register kernel functions and device side global variables. The termination functions call `__hipUnregisterFatBinary`.
+hip-clang emits a global variable `__hip_gpubin_handle` of void** type with linkonce linkage and inital value 0 for each host translation unit. Each initialization function checks `__hip_gpubin_handle` and register the fatbinary only if `__hip_gpubin_handle` is 0 and saves the return value of `__hip_gpubin_handle` to `__hip_gpubin_handle`. This is to guarantee that the fatbinary is only registered once. Similar check is done in the termination functions.
+
+#### Kernel Launching
+hip-clang supports kernel launching by CUDA `<<<>>>` syntax, hipLaunchKernelGGL. The latter one is macro which expand to CUDA `<<<>>>` syntax.
+
+When the executable or shared library is loaded by the dynamic linker, the initilization functions are called. In the initialization functions, when `__hipRegisterFatBinary` is called, the code objects containing all kernels are loaded; when `__hipRegisterFunction` is called, the stub functions are associated with the corresponding kernels in code objects.
+
+hip-clang implements two sets of kernel launching APIs.
+
+By default, in the host code, for the `<<<>>>` statement, hip-clang first emits call of hipConfigureCall to set up the threads and grids, then emits call of the stub function with the given arguments. In the stub function, hipSetupArgument is called for each kernel argument, then hipLaunchByPtr is called with a function pointer to the stub function. In hipLaunchByPtr, the real kernel associated with the stub function is launched.
+
+### NVCC Implementation Notes
+
+#### Interoperation between HIP and CUDA Driver
+CUDA applications may want to mix CUDA driver code with HIP code (see example below). This table shows the type equivalence to enable this interaction.
+
+|**HIP Type**   |**CU Driver Type**|**CUDA Runtime Type**|
+| ----          | ----             | ----                |
+| hipModule_t   |  CUmodule        |                     |
+| hipFunction_t |  CUfunction      |                     |
+| hipCtx_t      |  CUcontext       |                     |
+| hipDevice_t   |  CUdevice        |                     |
+| hipStream_t   |  CUstream        | cudaStream_t        |
+| hipEvent_t    |  CUevent         | cudaEvent_t         |
+| hipArray      |  CUarray         | cudaArray           |
+
+#### Compilation Options
+The `hipModule_t` interface does not support `cuModuleLoadDataEx` function, which is used to control PTX compilation options.
+HIP-Clang does not use PTX and does not support these compilation options.
+In fact, HIP-Clang code objects always contain fully compiled ISA and do not require additional compilation as a part of the load step.
+The corresponding HIP function `hipModuleLoadDataEx` behaves as `hipModuleLoadData` on HIP-Clang path (compilation options are not used) and as `cuModuleLoadDataEx` on NVCC path.
+For example (CUDA):
+```
+CUmodule module;
+void *imagePtr = ...;  // Somehow populate data pointer with code object
+
+const int numOptions = 1;
+CUJit_option options[numOptions];
+void * optionValues[numOptions];
+
+options[0] = CU_JIT_MAX_REGISTERS;
+unsigned maxRegs = 15;
+optionValues[0] = (void*)(&maxRegs);
+
+cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues);
+
+CUfunction k;
+cuModuleGetFunction(&k, module, "myKernel");
+```
+HIP:
+```
+hipModule_t module;
+void *imagePtr = ...;  // Somehow populate data pointer with code object
+
+const int numOptions = 1;
+hipJitOption options[numOptions];
+void * optionValues[numOptions];
+
+options[0] = hipJitOptionMaxRegisters;
+unsigned maxRegs = 15;
+optionValues[0] = (void*)(&maxRegs);
+
+// hipModuleLoadData(module, imagePtr) will be called on HIP-Clang path, JIT options will not be used, and
+// cupModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues) will be called on NVCC path
+hipModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues);
+
+hipFunction_t k;
+hipModuleGetFunction(&k, module, "myKernel");
+```
+
+The below sample shows how to use `hipModuleGetFunction`.
+
+```
+#include<hip_runtime.h>
+#include<hip_runtime_api.h>
+#include<iostream>
+#include<fstream>
+#include<vector>
+
+#define LEN 64
+#define SIZE LEN<<2
+
+#ifdef __HIP_PLATFORM_AMD__
+#define fileName "vcpy_isa.co"
+#endif
+
+#ifdef __HIP_PLATFORM_NVIDIA__
+#define fileName "vcpy_isa.ptx"
+#endif
+
+#define kernel_name "hello_world"
+
+int main(){
+    float *A, *B;
+    hipDeviceptr_t Ad, Bd;
+    A = new float[LEN];
+    B = new float[LEN];
+
+    for(uint32_t i=0;i<LEN;i++){
+        A[i] = i*1.0f;
+        B[i] = 0.0f;
+        std::cout<<A[i] << " "<<B[i]<<std::endl;
+    }
+
+
+#ifdef __HIP_PLATFORM_NVIDIA__
+          hipInit(0);
+          hipDevice_t device;
+          hipCtx_t context;
+          hipDeviceGet(&device, 0);
+          hipCtxCreate(&context, 0, device);
+#endif
+
+    hipMalloc((void**)&Ad, SIZE);
+    hipMalloc((void**)&Bd, SIZE);
+
+    hipMemcpyHtoD(Ad, A, SIZE);
+    hipMemcpyHtoD(Bd, B, SIZE);
+    hipModule_t Module;
+    hipFunction_t Function;
+    hipModuleLoad(&Module, fileName);
+    hipModuleGetFunction(&Function, Module, kernel_name);
+
+    std::vector<void*>argBuffer(2);
+    memcpy(&argBuffer[0], &Ad, sizeof(void*));
+    memcpy(&argBuffer[1], &Bd, sizeof(void*));
+
+    size_t size = argBuffer.size()*sizeof(void*);
+
+    void *config[] = {
+      HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
+      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+      HIP_LAUNCH_PARAM_END
+    };
+
+    hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config);
+
+    hipMemcpyDtoH(B, Bd, SIZE);
+    for(uint32_t i=0;i<LEN;i++){
+        std::cout<<A[i]<<" - "<<B[i]<<std::endl;
+    }
+
+#ifdef __HIP_PLATFORM_NVIDIA__
+          hipCtxDetach(context);
+#endif
+
+    return 0;
+}
+```
+
+## HIP Module and Texture Driver API
+
+HIP supports texture driver APIs however texture reference should be declared in host scope. Following code explains the use of texture reference for __HIP_PLATFORM_AMD__ platform.
+
+```
+// Code to generate code object
+
+#include "hip/hip_runtime.h"
+extern texture<float, 2, hipReadModeElementType> tex;
+
+__global__ void tex2dKernel(hipLaunchParm lp, float* outputData,
+                             int width,
+                             int height)
+{
+    int x = blockIdx.x*blockDim.x + threadIdx.x;
+    int y = blockIdx.y*blockDim.y + threadIdx.y;
+    outputData[y*width + x] = tex2D(tex, x, y);
+}
+
+```
+```
+// Host code:
+
+texture<float, 2, hipReadModeElementType> tex;
+
+void myFunc ()
+{
+    // ...
+
+    textureReference* texref;
+    hipModuleGetTexRef(&texref, Module1, "tex");
+    hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap);
+    hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap);
+    hipTexRefSetFilterMode(texref, hipFilterModePoint);
+    hipTexRefSetFlags(texref, 0);
+    hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1);
+    hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT);
+
+   // ...
+}
+```
diff --git a/projects/hip/docs/user_guide/hip_porting_guide.md b/projects/hip/docs/user_guide/hip_porting_guide.md
new file mode 100644
index 0000000000..be0f793d35
--- /dev/null
+++ b/projects/hip/docs/user_guide/hip_porting_guide.md
@@ -0,0 +1,550 @@
+# HIP Porting Guide
+In addition to providing a portable C++ programming environment for GPUs, HIP is designed to ease
+the porting of existing CUDA code into the HIP environment.  This section describes the available tools
+and provides practical suggestions on how to port CUDA code and work through common issues.
+
+## Porting a New CUDA Project
+
+### General Tips
+- Starting the port on a CUDA machine is often the easiest approach, since you can incrementally port pieces of the code to HIP while leaving the rest in CUDA. (Recall that on CUDA machines HIP is just a thin layer over CUDA, so the two code types can interoperate on nvcc platforms.) Also, the HIP port can be compared with the original CUDA code for function and performance.
+- Once the CUDA code is ported to HIP and is running on the CUDA machine, compile the HIP code using the HIP compiler on an AMD machine.
+- HIP ports can replace CUDA versions: HIP can deliver the same performance as a native CUDA implementation, with the benefit of portability to both Nvidia and AMD architectures as well as a path to future C++ standard support. You can handle platform-specific features through conditional compilation or by adding them to the open-source HIP infrastructure.
+- Use **[hipconvertinplace-perl.sh](https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/bin/hipconvertinplace-perl.sh)** to hipify all code files in the CUDA source directory.
+
+### Scanning existing CUDA code to scope the porting effort
+The **[hipexamine-perl.sh](https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/bin/hipexamine-perl.sh)** tool will scan a source directory to determine which files contain CUDA code and how much of that code can be automatically hipified.
+```
+> cd examples/rodinia_3.0/cuda/kmeans
+> $HIP_DIR/bin/hipexamine-perl.sh.
+info: hipify ./kmeans.h =====>
+info: hipify ./unistd.h =====>
+info: hipify ./kmeans.c =====>
+info: hipify ./kmeans_cuda_kernel.cu =====>
+  info: converted 40 CUDA->HIP refs( dev:0 mem:0 kern:0 builtin:37 math:0 stream:0 event:0 err:0 def:0 tex:3 other:0 ) warn:0 LOC:185
+info: hipify ./getopt.h =====>
+info: hipify ./kmeans_cuda.cu =====>
+  info: converted 49 CUDA->HIP refs( dev:3 mem:32 kern:2 builtin:0 math:0 stream:0 event:0 err:0 def:0 tex:12 other:0 ) warn:0 LOC:311
+info: hipify ./rmse.c =====>
+info: hipify ./cluster.c =====>
+info: hipify ./getopt.c =====>
+info: hipify ./kmeans_clustering.c =====>
+info: TOTAL-converted 89 CUDA->HIP refs( dev:3 mem:32 kern:2 builtin:37 math:0 stream:0 event:0 err:0 def:0 tex:15 other:0 ) warn:0 LOC:3607
+  kernels (1 total) :   kmeansPoint(1)
+```
+
+hipexamine-perl scans each code file (cpp, c, h, hpp, etc.) found in the specified directory:
+
+   * Files with no CUDA code (ie kmeans.h) print one line summary just listing the source file name.
+   * Files with CUDA code print a summary of what was found - for example the kmeans_cuda_kernel.cu file:
+```
+info: hipify ./kmeans_cuda_kernel.cu =====>
+  info: converted 40 CUDA->HIP refs( dev:0 mem:0 kern:0 builtin:37 math:0 stream:0 event:0
+```
+* Interesting information in kmeans_cuda_kernel.cu :
+  * How many CUDA calls were converted to HIP (40)
+  * Breakdown of the CUDA functionality used (dev:0 mem:0 etc). This file uses many CUDA builtins (37) and texture functions (3).
+   * Warning for code that looks like CUDA API but was not converted (0 in this file).
+   * Count Lines-of-Code (LOC) - 185 for this file.
+
+* hipexamine-perl also presents a summary at the end of the process for the statistics collected across all files. This has similar format to the per-file reporting, and also includes a list of all kernels which have been called.  An example from above:
+
+```shell
+info: TOTAL-converted 89 CUDA->HIP refs( dev:3 mem:32 kern:2 builtin:37 math:0 stream:0 event:0 err:0 def:0 tex:15 other:0 ) warn:0 LOC:3607
+  kernels (1 total) :   kmeansPoint(1)
+```
+
+### Converting a project "in-place"
+
+```shell
+> hipify-perl --inplace
+```
+
+For each input file FILE, this script will:
+  - If "FILE.prehip file does not exist, copy the original code to a new file with extension ".prehip". Then hipify the code file.
+  - If "FILE.prehip" file exists, hipify FILE.prehip and save to FILE.
+
+This is useful for testing improvements to the hipify toolset.
+
+
+The [hipconvertinplace-perl.sh](https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/bin/hipconvertinplace-perl.sh) script will perform inplace conversion for all code files in the specified directory.
+This can be quite handy when dealing with an existing CUDA code base since the script preserves the existing directory structure
+and filenames - and includes work.  After converting in-place, you can review the code to add additional parameters to
+directory names.
+
+
+```shell
+> hipconvertinplace-perl.sh MY_SRC_DIR
+```
+
+### Library Equivalents
+
+| CUDA Library | ROCm Library | Comment |
+|------- | ---------   | -----   |
+| cuBLAS        |    rocBLAS     | Basic Linear Algebra Subroutines
+| cuFFT        |    rocFFT     | Fast Fourier Transfer Library
+| cuSPARSE     |    rocSPARSE   | Sparse BLAS  + SPMV
+| cuSolver     |    rocSOLVER   | Lapack library
+| AMG-X    |    rocALUTION   | Sparse iterative solvers and preconditioners with Geometric and Algebraic MultiGrid
+| Thrust    |    rocThrust | C++ parallel algorithms library
+| CUB     |    rocPRIM | Low Level Optimized Parallel Primitives
+| cuDNN    |    MIOpen | Deep learning Solver Library
+| cuRAND    |    rocRAND | Random Number Generator Library
+| EIGEN    |    EIGEN – HIP port | C++ template library for linear algebra: matrices, vectors, numerical solvers,
+| NCCL    |    RCCL  | Communications Primitives Library based on the MPI equivalents
+
+
+
+## Distinguishing Compiler Modes
+
+
+### Identifying HIP Target Platform
+All HIP projects target either AMD or NVIDIA platform. The platform affects which headers are included and which libraries are used for linking.
+
+- `HIP_PLATFORM_AMD` is defined if the HIP platform targets AMD.
+Note, `HIP_PLATFORM_HCC` was previously defined if the HIP platform targeted AMD, it is deprecated.
+
+- `HIP_PLATFORM_NVDIA` is defined if the HIP platform targets NVIDIA.
+Note, `HIP_PLATFORM_NVCC` was previously defined if the HIP platform targeted NVIDIA, it is deprecated.
+
+### Identifying the Compiler: hip-clang or nvcc
+Often, it's useful to know whether the underlying compiler is HIP-Clang or nvcc. This knowledge can guard platform-specific code or aid in platform-specific performance tuning.
+
+```
+#ifdef __HIP_PLATFORM_AMD__
+// Compiled with HIP-Clang
+#endif
+```
+
+```
+#ifdef __HIP_PLATFORM_NVIDIA__
+// Compiled with nvcc
+//  Could be compiling with CUDA language extensions enabled (for example, a ".cu file)
+//  Could be in pass-through mode to an underlying host compile OR (for example, a .cpp file)
+
+```
+
+```
+#ifdef __CUDACC__
+// Compiled with nvcc (CUDA language extensions enabled)
+```
+
+Compiler directly generates the host code (using the Clang x86 target) and passes the code to another host compiler. Thus, they have no equivalent of the \__CUDA_ACC define.
+
+
+### Identifying Current Compilation Pass: Host or Device
+
+nvcc makes two passes over the code: one for host code and one for device code.
+HIP-Clang will have multiple passes over the code: one for the host code, and one for each architecture on the device code.
+`__HIP_DEVICE_COMPILE__` is set to a nonzero value when the compiler (HIP-Clang or nvcc) is compiling code for a device inside a `__global__` kernel or for a device function. `__HIP_DEVICE_COMPILE__` can replace #ifdef checks on the `__CUDA_ARCH__` define.
+
+```
+// #ifdef __CUDA_ARCH__
+#if __HIP_DEVICE_COMPILE__
+```
+
+Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 1 or undefined, and it doesn't represent the feature capability of the target device.
+
+### Compiler Defines: Summary
+|Define  		|   HIP-Clang  | nvcc 		|  Other (GCC, ICC, Clang, etc.)
+|--- | --- | --- |---|
+|HIP-related defines:|
+|`__HIP_PLATFORM_AMD__`| Defined | Undefined |  Defined if targeting AMD platform; undefined otherwise |
+|`__HIP_PLATFORM_NVIDIA__`| Undefined  | Defined |  Defined if targeting NVIDIA platform; undefined otherwise |
+|`__HIP_DEVICE_COMPILE__`     | 1 if compiling for device; undefined if compiling for host  |1 if compiling for device; undefined if compiling for host  | Undefined
+|`__HIPCC__`		|  Defined | Defined 		|  Undefined
+|`__HIP_ARCH_*` |0 or 1 depending on feature support (see below) | 0 or 1 depending on feature support (see below) | 0
+|nvcc-related defines:|
+|`__CUDACC__` 		 | Defined if source code is compiled by nvcc; undefined otherwise 		|  Undefined
+|`__NVCC__` 		 | Undefined | Defined 		|  Undefined
+|`__CUDA_ARCH__`		 | Undefined | Unsigned representing compute capability (e.g., "130") if in device code; 0 if in host code  | Undefined
+|hip-clang-related defines:|
+|`__HIP__`  		 | Defined   | Undefined   	|  Undefined
+|HIP-Clang common defines:|
+|`__clang__`		| Defined   | Defined | Undefined 	|  Defined if using Clang; otherwise undefined
+
+## Identifying Architecture Features
+
+### HIP_ARCH Defines
+
+Some CUDA code tests `__CUDA_ARCH__` for a specific value to determine whether the machine supports a certain architectural feature. For instance,
+
+```
+#if (__CUDA_ARCH__ >= 130)
+// doubles are supported
+```
+This type of code requires special attention, since AMD and CUDA devices have different architectural capabilities. Moreover, you can't determine the presence of a feature using a simple comparison against an architecture's version number. HIP provides a set of defines and device properties to query whether a specific architectural feature is supported.
+
+The `__HIP_ARCH_*` defines can replace comparisons of `__CUDA_ARCH__` values:
+```
+//#if (__CUDA_ARCH__ >= 130)   // non-portable
+if __HIP_ARCH_HAS_DOUBLES__ {  // portable HIP feature query
+   // doubles are supported
+}
+```
+
+For host code, the `__HIP_ARCH__*` defines are set to 0. You should only use the __HIP_ARCH__ fields in device code.
+
+### Device-Architecture Properties
+
+Host code should query the architecture feature flags in the device properties that hipGetDeviceProperties returns, rather than testing the "major" and "minor" fields directly:
+
+```
+hipGetDeviceProperties(&deviceProp, device);
+//if ((deviceProp.major == 1 && deviceProp.minor < 2))  // non-portable
+if (deviceProp.arch.hasSharedInt32Atomics) {            // portable HIP feature query
+    // has shared int32 atomic operations ...
+}
+```
+
+### Table of Architecture Properties
+The table below shows the full set of architectural properties that HIP supports.
+
+|Define (use only in device code) | Device Property (run-time query) | Comment |
+|------- | ---------   | -----   |
+|32-bit atomics:||
+|`__HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__`        |    hasGlobalInt32Atomics      |32-bit integer atomics for global memory
+|`__HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__`    |    hasGlobalFloatAtomicExch   |32-bit float atomic exchange for global memory
+|`__HIP_ARCH_HAS_SHARED_INT32_ATOMICS__`        |    hasSharedInt32Atomics      |32-bit integer atomics for shared memory
+|`__HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__`    |    hasSharedFloatAtomicExch   |32-bit float atomic exchange for shared memory
+|`__HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__`            |     hasFloatAtomicAdd         |32-bit float atomic add in global and shared memory
+|64-bit atomics:                           |                                     |
+|`__HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__`        |    hasGlobalInt64Atomics      |64-bit integer atomics for global memory
+|`__HIP_ARCH_HAS_SHARED_INT64_ATOMICS__`        |    hasSharedInt64Atomics      |64-bit integer atomics for shared memory
+|Doubles:                                  |                                     |
+|`__HIP_ARCH_HAS_DOUBLES__`                    |     hasDoubles                 |Double-precision floating point
+|Warp cross-lane operations:              |                                      |
+|`__HIP_ARCH_HAS_WARP_VOTE__`                  |     hasWarpVote                |Warp vote instructions (any, all)
+|`__HIP_ARCH_HAS_WARP_BALLOT__`                |     hasWarpBallot              |Warp ballot instructions
+|`__HIP_ARCH_HAS_WARP_SHUFFLE__`               |     hasWarpShuffle             |Warp shuffle operations (shfl\_\*)
+|`__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__`          |     hasFunnelShift             |Funnel shift two input words into one
+|Sync:                                    |                                      |
+|`__HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__`        |     hasThreadFenceSystem       |threadfence\_system
+|`__HIP_ARCH_HAS_SYNC_THREAD_EXT__`            |     hasSyncThreadsExt         |syncthreads\_count, syncthreads\_and, syncthreads\_or
+|Miscellaneous:                                |                                |
+|`__HIP_ARCH_HAS_SURFACE_FUNCS__`              |   hasSurfaceFuncs              |
+|`__HIP_ARCH_HAS_3DGRID__`                     |   has3dGrid                    | Grids and groups are 3D
+|`__HIP_ARCH_HAS_DYNAMIC_PARALLEL__`           |   hasDynamicParallelism        |
+
+
+## Finding HIP
+
+Makefiles can use the following syntax to conditionally provide a default HIP_PATH if one does not exist:
+
+```
+HIP_PATH ?= $(shell hipconfig --path)
+```
+
+## Identifying HIP Runtime
+
+HIP can depend on rocclr, or cuda as runtime
+
+- AMD platform
+On AMD platform, HIP uses Radeon Open Compute Common Language Runtime, called ROCclr.
+ROCclr is a virtual device interface that HIP runtimes interact with different backends which allows runtimes to work on Linux , as well as Windows without much efforts.
+
+- NVIDIA platform
+On Nvidia platform, HIP is just a thin layer on top of CUDA.
+On non-AMD platform, HIP runtime determines if cuda is available and can be used. If available, HIP_PLATFORM is set to nvidia and underneath CUDA path is used.
+
+
+## hipLaunchKernelGGL
+
+hipLaunchKernelGGL is a macro that can serve as an alternative way to launch kernel, which accepts parameters of launch configurations (grid dims, group dims, stream, dynamic shared size) followed by a variable number of kernel arguments.
+It can replace <<< >>>, if the user so desires.
+
+## Compiler Options
+
+hipcc is a portable compiler driver that will call nvcc or HIP-Clang (depending on the target system) and attach all required include and library options. It passes options through to the target compiler. Tools that call hipcc must ensure the compiler options are appropriate for the target compiler.
+The `hipconfig` script may helpful in identifying the target platform, compiler and runtime. It can also help set options appropriately.
+
+### Compiler options supported on AMD platforms
+
+Here are the main compiler options supported on AMD platforms by HIP-Clang.
+
+| Option                            | Description |
+| ------                            | ----------- |
+| --amdgpu-target=<gpu_arch>        | [DEPRECATED] This option is being replaced by `--offload-arch=<target>`. Generate code for the given GPU target.  Supported targets are gfx701, gfx801, gfx802, gfx803, gfx900, gfx906, gfx908, gfx1010, gfx1011, gfx1012, gfx1030, gfx1031.  This option could appear multiple times on the same command line to generate a fat binary for multiple targets. |
+| --fgpu-rdc                        | Generate relocatable device code, which allows kernels or device functions calling device functions in different translation units. |
+| -ggdb                             | Equivalent to `-g` plus tuning for GDB.  This is recommended when using ROCm's GDB to debug GPU code. |
+| --gpu-max-threads-per-block=<num> | Generate code to support up to the specified number of threads per block.  |
+| -O<n>                             | Specify the optimization level. |
+| -offload-arch=<target>            | Specify the AMD GPU [target ID](https://clang.llvm.org/docs/ClangOffloadBundlerFileFormat.html#target-id). |
+| -save-temps                       | Save the compiler generated intermediate files. |
+| -v                                | Show the compilation steps. |
+
+## Linking Issues
+
+### Linking With hipcc
+
+hipcc adds the necessary libraries for HIP as well as for the accelerator compiler (nvcc or AMD compiler). We recommend linking with hipcc since it automatically links the binary to the necessary HIP runtime libraries.  It also has knowledge on how to link and to manage the GPU objects.
+
+### -lm Option
+
+hipcc adds -lm by default to the link command.
+
+
+## Linking Code With Other Compilers
+
+CUDA code often uses nvcc for accelerator code (defining and launching kernels, typically defined in .cu or .cuh files).
+It also uses a standard compiler (g++) for the rest of the application. nvcc is a preprocessor that employs a standard host compiler (gcc) to generate the host code.
+Code compiled using this tool can employ only the intersection of language features supported by both nvcc and the host compiler.
+In some cases, you must take care to ensure the data types and alignment of the host compiler are identical to those of the device compiler. Only some host compilers are supported---for example, recent nvcc versions lack Clang host-compiler capability.
+
+HIP-Clang generates both device and host code using the same Clang-based compiler. The code uses the same API as gcc, which allows code generated by different gcc-compatible compilers to be linked together. For example, code compiled using HIP-Clang can link with code compiled using "standard" compilers (such as gcc, ICC and Clang). Take care to ensure all compilers use the same standard C++ header and library formats.
+
+
+### libc++ and libstdc++
+
+hipcc links to libstdc++ by default. This provides better compatibility between g++ and HIP.
+
+If you pass "--stdlib=libc++" to hipcc, hipcc will use the libc++ library.  Generally, libc++ provides a broader set of C++ features while libstdc++ is the standard for more compilers (notably including g++).
+
+When cross-linking C++ code, any C++ functions that use types from the C++ standard library (including std::string, std::vector and other containers) must use the same standard-library implementation. They include the following:
+
+- Functions or kernels defined in HIP-Clang that are called from a standard compiler
+- Functions defined in a standard compiler that are called from HIP-Clanng.
+
+Applications with these interfaces should use the default libstdc++ linking.
+
+Applications which are compiled entirely with hipcc, and which benefit from advanced C++ features not supported in libstdc++, and which do not require portability to nvcc, may choose to use libc++.
+
+
+### HIP Headers (hip_runtime.h, hip_runtime_api.h)
+
+The hip_runtime.h and hip_runtime_api.h files define the types, functions and enumerations needed to compile a HIP program:
+
+- hip_runtime_api.h: defines all the HIP runtime APIs (e.g., hipMalloc) and the types required to call them. A source file that is only calling HIP APIs but neither defines nor launches any kernels can include hip_runtime_api.h. hip_runtime_api.h uses no custom hc language features and can be compiled using a standard C++ compiler.
+- hip_runtime.h: included in hip_runtime_api.h. It additionally provides the types and defines required to create and launch kernels. hip_runtime.h can be compiled using a standard C++ compiler but will expose a subset of the available functions.
+
+CUDA has slightly different contents for these two files. In some cases you may need to convert hipified code to include the richer hip_runtime.h instead of hip_runtime_api.h.
+
+### Using a Standard C++ Compiler
+You can compile hip\_runtime\_api.h using a standard C or C++ compiler (e.g., gcc or ICC). The HIP include paths and defines (`__HIP_PLATFORM_AMD__` or `__HIP_PLATFORM_NVIDIA__`) must pass to the standard compiler; hipconfig then returns the necessary options:
+```
+> hipconfig --cxx_config
+ -D__HIP_PLATFORM_AMD__ -I/home/user1/hip/include
+```
+
+You can capture the hipconfig output and passed it to the standard compiler; below is a sample makefile syntax:
+
+```
+CPPFLAGS += $(shell $(HIP_PATH)/bin/hipconfig --cpp_config)
+```
+
+nvcc includes some headers by default.  However, HIP does not include default headers, and instead all required files must be explicitly included.
+Specifically, files that call HIP run-time APIs or define HIP kernels must explicitly include the appropriate HIP headers.
+If the compilation process reports that it cannot find necessary APIs (for example, "error: identifier hipSetDevice is undefined"),
+ensure that the file includes hip_runtime.h (or hip_runtime_api.h, if appropriate).
+The hipify-perl script automatically converts "cuda_runtime.h" to "hip_runtime.h," and it converts "cuda_runtime_api.h" to "hip_runtime_api.h", but it may miss nested headers or macros.
+
+#### cuda.h
+
+The HIP-Clang path provides an empty cuda.h file. Some existing CUDA programs include this file but don't require any of the functions.
+
+### Choosing HIP File Extensions
+
+Many existing CUDA projects use the ".cu" and ".cuh" file extensions to indicate code that should be run through the nvcc compiler.
+For quick HIP ports, leaving these file extensions unchanged is often easier, as it minimizes the work required to change file names in the directory and #include statements in the files.
+
+For new projects or ports which can be re-factored, we recommend the use of the extension ".hip.cpp" for source files, and
+".hip.h" or ".hip.hpp" for header files.
+This indicates that the code is standard C++ code, but also provides a unique indication for make tools to
+run hipcc when appropriate.
+
+## Workarounds
+
+### warpSize
+Code should not assume a warp size of 32 or 64.  See [Warp Cross-Lane Functions](hip_kernel_language.md#warp-cross-lane-functions) for information on how to write portable wave-aware code.
+
+### Kernel launch with group size > 256
+Kernel code should use ``` __attribute__((amdgpu_flat_work_group_size(<min>,<max>)))```.
+
+For example:
+```
+__global__ void dot(double *a,double *b,const int n) __attribute__((amdgpu_flat_work_group_size(1, 512)))
+```
+
+## memcpyToSymbol
+
+HIP support for hipMemcpyToSymbol is complete.  This feature allows a kernel
+to define a device-side data symbol which can be accessed on the host side.  The symbol
+can be in __constant or device space.
+
+Note that the symbol name needs to be encased in the HIP_SYMBOL macro, as shown in the code example below. This also applies to hipMemcpyFromSymbol, hipGetSymbolAddress, and hipGetSymbolSize.
+
+For example:
+
+Device Code:
+```
+#include<hip/hip_runtime.h>
+#include<hip/hip_runtime_api.h>
+#include<iostream>
+
+#define HIP_ASSERT(status) \
+    assert(status == hipSuccess)
+
+#define LEN 512
+#define SIZE 2048
+
+__constant__ int Value[LEN];
+
+__global__ void Get(hipLaunchParm lp, int *Ad)
+{
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    Ad[tid] = Value[tid];
+}
+
+int main()
+{
+    int *A, *B, *Ad;
+    A = new int[LEN];
+    B = new int[LEN];
+    for(unsigned i=0;i<LEN;i++)
+    {
+        A[i] = -1*i;
+        B[i] = 0;
+    }
+
+    HIP_ASSERT(hipMalloc((void**)&Ad, SIZE));
+
+    HIP_ASSERT(hipMemcpyToSymbol(HIP_SYMBOL(Value), A, SIZE, 0, hipMemcpyHostToDevice));
+    hipLaunchKernelGGL(Get, dim3(1,1,1), dim3(LEN,1,1), 0, 0, Ad);
+    HIP_ASSERT(hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost));
+
+    for(unsigned i=0;i<LEN;i++)
+    {
+        assert(A[i] == B[i]);
+    }
+    std::cout<<"Passed"<<std::endl;
+}
+```
+
+## CU_POINTER_ATTRIBUTE_MEMORY_TYPE
+
+To get pointer's memory type in HIP/HIP-Clang, developers should use hipPointerGetAttributes API. First parameter of the API is hipPointerAttribute_t which has 'type' as member variable. 'type' indicates input pointer is allocated on device or host.
+
+For example:
+```
+double * ptr;
+hipMalloc(reinterpret_cast<void**>(&ptr), sizeof(double));
+hipPointerAttribute_t attr;
+hipPointerGetAttributes(&attr, ptr); /*attr.type will have value as hipMemoryTypeDevice*/
+
+double* ptrHost;
+hipHostMalloc(&ptrHost, sizeof(double));
+hipPointerAttribute_t attr;
+hipPointerGetAttributes(&attr, ptrHost); /*attr.type will have value as hipMemoryTypeHost*/
+```
+Please note, hipMemoryType enum values are different from cudaMemoryType enum values.
+
+For example, on AMD platform, hipMemoryType is defined in hip_runtime_api.h,
+```
+typedef enum hipMemoryType {
+    hipMemoryTypeHost = 0,    ///< Memory is physically located on host
+    hipMemoryTypeDevice = 1,  ///< Memory is physically located on device. (see deviceId for specific device)
+    hipMemoryTypeArray = 2,   ///< Array memory, physically located on device. (see deviceId for specific device)
+    hipMemoryTypeUnified = 3, ///< Not used currently
+    hipMemoryTypeManaged = 4  ///< Managed memory, automaticallly managed by the unified memory system
+} hipMemoryType;
+```
+Looking into CUDA toolkit, it defines cudaMemoryType as following,
+```
+enum cudaMemoryType
+{
+  cudaMemoryTypeUnregistered = 0, // Unregistered memory.
+  cudaMemoryTypeHost = 1, // Host memory.
+  cudaMemoryTypeDevice = 2, // Device memory.
+  cudaMemoryTypeManaged = 3, // Managed memory
+}
+```
+In this case, memory type translation for hipPointerGetAttributes needs to be handled properly on nvidia platform to get the correct memory type in CUDA, which is done in the file nvidia_hip_runtime_api.h.
+
+So in any HIP applications which use HIP APIs involving memory types, developers should use #ifdef in order to assign the correct enum values depending on Nvidia or AMD platform.
+
+As an example, please see the code from the link,
+github.com/ROCm-Developer-Tools/HIP/blob/develop/tests/catch/unit/memory/hipMemcpyParam2D.cc#L77-L96.
+
+With the #ifdef condition, HIP APIs work as expected on both AMD and NVIDIA platforms.
+
+## threadfence_system
+Threadfence_system makes all device memory writes, all writes to mapped host memory, and all writes to peer memory visible to CPU and other GPU devices.
+Some implementations can provide this behavior by flushing the GPU L2 cache.
+HIP/HIP-Clang does not provide this functionality.  As a workaround, users can set the environment variable `HSA_DISABLE_CACHE=1` to disable the GPU L2 cache. This will affect all accesses and for all kernels and so may have a performance impact.
+
+### Textures and Cache Control
+
+Compute programs sometimes use textures either to access dedicated texture caches or to use the texture-sampling hardware for interpolation and clamping. The former approach uses simple point samplers with linear interpolation, essentially only reading a single point. The latter approach uses the sampler hardware to interpolate and combine multiple samples. AMD hardware, as well as recent competing hardware, has a unified texture/L1 cache, so it no longer has a dedicated texture cache. But the nvcc path often caches global loads in the L2 cache, and some programs may benefit from explicit control of the L1 cache contents.  We recommend the __ldg instruction for this purpose.
+
+AMD compilers currently load all data into both the L1 and L2 caches, so __ldg is treated as a no-op.
+
+We recommend the following for functional portability:
+
+- For programs that use textures only to benefit from improved caching, use the __ldg instruction
+- Programs that use texture object and reference APIs, work well on HIP
+
+
+## More Tips
+
+### HIP Logging
+
+On an AMD platform, set the AMD_LOG_LEVEL environment variable to log HIP application execution information.
+
+The value of the setting controls different logging level,
+
+```
+enum LogLevel {
+LOG_NONE = 0,
+LOG_ERROR = 1,
+LOG_WARNING = 2,
+LOG_INFO = 3,
+LOG_DEBUG = 4
+};
+```
+
+Logging mask is used to print types of functionalities during the execution of HIP application.
+It can be set as one of the following values,
+
+```
+enum LogMask {
+  LOG_API       = 0x00000001, //!< API call
+  LOG_CMD       = 0x00000002, //!< Kernel and Copy Commands and Barriers
+  LOG_WAIT      = 0x00000004, //!< Synchronization and waiting for commands to finish
+  LOG_AQL       = 0x00000008, //!< Decode and display AQL packets
+  LOG_QUEUE     = 0x00000010, //!< Queue commands and queue contents
+  LOG_SIG       = 0x00000020, //!< Signal creation, allocation, pool
+  LOG_LOCK      = 0x00000040, //!< Locks and thread-safety code.
+  LOG_KERN      = 0x00000080, //!< kernel creations and arguments, etc.
+  LOG_COPY      = 0x00000100, //!< Copy debug
+  LOG_COPY2     = 0x00000200, //!< Detailed copy debug
+  LOG_RESOURCE  = 0x00000400, //!< Resource allocation, performance-impacting events.
+  LOG_INIT      = 0x00000800, //!< Initialization and shutdown
+  LOG_MISC      = 0x00001000, //!< misc debug, not yet classified
+  LOG_AQL2      = 0x00002000, //!< Show raw bytes of AQL packet
+  LOG_CODE      = 0x00004000, //!< Show code creation debug
+  LOG_CMD2      = 0x00008000, //!< More detailed command info, including barrier commands
+  LOG_LOCATION  = 0x00010000, //!< Log message location
+  LOG_ALWAYS    = 0xFFFFFFFF, //!< Log always even mask flag is zero
+};
+```
+
+### Debugging hipcc
+To see the detailed commands that hipcc issues, set the environment variable HIPCC_VERBOSE to 1. Doing so will print to stderr the HIP-clang (or nvcc) commands that hipcc generates.
+
+```
+export HIPCC_VERBOSE=1
+make
+...
+hipcc-cmd: /opt/hcc/bin/hcc  -hc -I/opt/hcc/include -stdlib=libc++ -I../../../../hc/include -I../../../../include/amd_detail/cuda -I../../../../include -x c++ -I../../common -O3 -c backprop_cuda.cu
+```
+
+### What Does This Error Mean?
+
+#### /usr/include/c++/v1/memory:5172:15: error: call to implicitly deleted default constructor of 'std::__1::bad_weak_ptr' throw bad_weak_ptr();
+
+If you pass a ".cu" file, hcc will attempt to compile it as a CUDA language file. You must tell hcc that it's in fact a C++ file: use the "-x c++" option.
+
+
+### Editor Highlighting
+See the utils/vim or utils/gedit directories to add handy highlighting to hip files.
+
+
diff --git a/projects/hip/tests/src/kernel/hipShflTests.cpp b/projects/hip/tests/src/kernel/hipShflTests.cpp
index e1467aaa17..4b37489fad 100644
--- a/projects/hip/tests/src/kernel/hipShflTests.cpp
+++ b/projects/hip/tests/src/kernel/hipShflTests.cpp
@@ -28,6 +28,7 @@ THE SOFTWARE.
 #include <iostream>
 #include <hip/hip_runtime.h>
 #include "test_common.h"
+#include <hip/hip_fp16.h>
 
 #define WIDTH 4
 
@@ -60,12 +61,53 @@ void matrixTransposeCPUReference(T* output, T* input, const unsigned int width)
 void getFactor(int& fact) { fact = 101; }
 void getFactor(unsigned int& fact) { fact = static_cast<unsigned int>(INT32_MAX)+1; }
 void getFactor(float& fact) { fact = 2.5; }
+void getFactor(__half& fact) { fact = 2.5; }
 void getFactor(double& fact) { fact = 2.5; }
 void getFactor(long& fact) { fact = 202; }
 void getFactor(unsigned long& fact) { fact = static_cast<unsigned long>(__LONG_MAX__)+1; }
 void getFactor(long long& fact) { fact = 303; }
 void getFactor(unsigned long long& fact) { fact = static_cast<unsigned long long>(__LONG_LONG_MAX__)+1; }
 
+template <typename T> int compare(T* TransposeMatrix, T* cpuTransposeMatrix) {
+    int errors = 0;
+    for (int i = 0; i < NUM; i++) {
+        if (TransposeMatrix[i] != cpuTransposeMatrix[i]) {
+            errors++;
+        }
+    }
+    return errors;
+}
+
+template <> int compare<__half>(__half* TransposeMatrix, __half* cpuTransposeMatrix) {
+    int errors = 0;
+    for (int i = 0; i < NUM; i++) {
+        if (__half2float(TransposeMatrix[i]) != __half2float(cpuTransposeMatrix[i])) {
+            errors++;
+        }
+    }
+    return errors;
+}
+
+template <typename T>
+void init(T* Matrix) {
+    // initialize the input data
+    T factor;
+    getFactor(factor);
+    for (int i = 0; i < NUM; i++) {
+        Matrix[i] = (T)i + factor;
+    }
+}
+
+template <>
+void init(__half* Matrix) {
+    // initialize the input data
+    __half factor;
+    getFactor(factor);
+    for (int i = 0; i < NUM; i++) {
+        Matrix[i] = i + __half2float(factor);
+    }
+}
+
 template<typename T>
 void runTest() {
     T* Matrix;
@@ -78,19 +120,13 @@ void runTest() {
     hipDeviceProp_t devProp;
     hipGetDeviceProperties(&devProp, 0);
 
-    int i;
     int errors;
 
     Matrix = (T*)malloc(NUM * sizeof(T));
     TransposeMatrix = (T*)malloc(NUM * sizeof(T));
     cpuTransposeMatrix = (T*)malloc(NUM * sizeof(T));
 
-    // initialize the input data
-    T factor;
-    getFactor(factor);
-    for (i = 0; i < NUM; i++) {
-        Matrix[i] = (T)i + factor;
-    }
+    init(Matrix);
 
     // allocate the memory on the device side
     hipMalloc((void**)&gpuMatrix, NUM * sizeof(T));
@@ -110,14 +146,8 @@ void runTest() {
     matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
 
     // verify the results
-    errors = 0;
+    errors = compare(TransposeMatrix, cpuTransposeMatrix);
     double eps = 1.0E-6;
-    for (i = 0; i < NUM; i++) {
-        if (TransposeMatrix[i] != cpuTransposeMatrix[i]) {
-            errors++;
-        }
-    }
-
     // free the resources on device side
     hipFree(gpuMatrix);
     hipFree(gpuTransposeMatrix);
@@ -137,6 +167,7 @@ int main() {
     runTest<float>();
     runTest<double>();
     runTest<long>();
+    runTest<__half>();
     runTest<long long>();
     runTest<unsigned int>();
     runTest<unsigned long>();
diff --git a/projects/hip/tests/src/kernel/hipShflUpDownTest.cpp b/projects/hip/tests/src/kernel/hipShflUpDownTest.cpp
index be1253d03c..86aa5d6565 100644
--- a/projects/hip/tests/src/kernel/hipShflUpDownTest.cpp
+++ b/projects/hip/tests/src/kernel/hipShflUpDownTest.cpp
@@ -20,7 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 /* HIT_START
- * BUILD: %t %s ../test_common.cpp
+ * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
  * TEST: %t
  * HIT_END
  */
@@ -28,6 +28,8 @@ THE SOFTWARE.
 #include <iostream>
 #include <hip/hip_runtime.h>
 #include "test_common.h"
+#include <hip/hip_fp16.h>
+const int size = 32;
 
 template <typename T>
 __global__ void shflDownSum(T* a, int size) {
@@ -59,27 +61,59 @@ void getFactor(int& fact) { fact = 101; }
 void getFactor(unsigned int& fact) { fact = static_cast<unsigned int>(INT32_MAX)+1; }
 void getFactor(float& fact) { fact = 2.5; }
 void getFactor(double& fact) { fact = 2.5; }
+void getFactor(__half& fact) { fact = 2.5; }
 void getFactor(long& fact) { fact = 202; }
 void getFactor(unsigned long& fact) { fact = static_cast<unsigned long>(__LONG_MAX__)+1; }
 void getFactor(long long& fact) { fact = 303; }
 void getFactor(unsigned long long& fact) { fact = static_cast<unsigned long long>(__LONG_LONG_MAX__)+1; }
 
-template <typename T>
-void runTestShflUp() {
-    const int size = 32;
-    T a[size];
+template <typename T> T sum(T* a) {
     T cpuSum = 0;
-    T factor; getFactor(factor);
+    T factor;
+    getFactor(factor);
     for (int i = 0; i < size; i++) {
         a[i] = i + factor;
         cpuSum += a[i];
     }
+    return cpuSum;
+}
+
+template <> __half sum(__half* a) {
+    __half cpuSum = 0;
+    __half factor;
+    getFactor(factor);
+    for (int i = 0; i < size; i++) {
+        a[i] = i + __half2float(factor);
+        cpuSum = __half2float(cpuSum) + __half2float(a[i]);
+    }
+    return cpuSum;
+}
+
+template <typename T> bool compare(T gpuSum, T cpuSum) {
+    if (gpuSum != cpuSum) {
+        return true;
+    }
+    return false;
+}
+
+template <> bool compare(__half gpuSum, __half cpuSum) {
+    if (__half2float(gpuSum) != __half2float(cpuSum)) {
+        return true;
+    }
+    return false;
+}
+
+template <typename T>
+void runTestShflUp() {
+    const int size = 32;
+    T a[size];
+    T cpuSum = sum(a);
     T* d_a;
     hipMalloc(&d_a, sizeof(T) * size);
     hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault);
     hipLaunchKernelGGL(shflUpSum<T>, 1, size, 0, 0, d_a, size);
     hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault);
-    if (a[size - 1] != cpuSum) {
+    if (compare(a[size - 1], cpuSum)) {
         hipFree(d_a);
         failed("Shfl Up Sum did not match.");
     }
@@ -88,20 +122,14 @@ void runTestShflUp() {
 
 template <typename T>
 void runTestShflDown() {
-    const int size = 32;
     T a[size];
-    T cpuSum = 0;
-    T factor; getFactor(factor);
-    for (int i = 0; i < size; i++) {
-        a[i] = i + factor;
-        cpuSum += a[i];
-    }
+    T cpuSum = sum(a);
     T* d_a;
     hipMalloc(&d_a, sizeof(T) * size);
     hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault);
     hipLaunchKernelGGL(shflDownSum<T>, 1, size, 0, 0, d_a, size);
     hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault);
-    if (a[0] != cpuSum) {
+    if (compare(a[0], cpuSum)) {
         hipFree(d_a);
         failed("Shfl Down Sum did not match.");
     }
@@ -110,20 +138,14 @@ void runTestShflDown() {
 
 template <typename T>
 void runTestShflXor() {
-    const int size = 32;
     T a[size];
-    T cpuSum = 0;
-    T factor; getFactor(factor);
-    for (int i = 0; i < size; i++) {
-        a[i] = i + factor;
-        cpuSum += a[i];
-    }
+    T cpuSum = sum(a);
     T* d_a;
     hipMalloc(&d_a, sizeof(T) * size);
     hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault);
     hipLaunchKernelGGL(shflXorSum<T>, 1, size, 0, 0, d_a, size);
     hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault);
-    if (a[0] != cpuSum) {
+    if (compare(a[0], cpuSum)) {
         hipFree(d_a);
         failed("Shfl Xor Sum did not match.");
     }
@@ -134,6 +156,7 @@ int main() {
     runTestShflUp<float>();
     runTestShflUp<double>();
     runTestShflUp<long>();
+    runTestShflUp<__half>();
     runTestShflUp<long long>();
     runTestShflUp<unsigned int>();
     runTestShflUp<unsigned long>();
@@ -143,6 +166,7 @@ int main() {
     runTestShflDown<float>();
     runTestShflDown<double>();
     runTestShflDown<long>();
+    runTestShflDown<__half>();
     runTestShflDown<long long>();
     runTestShflDown<unsigned int>();
     runTestShflDown<unsigned long>();
@@ -152,6 +176,7 @@ int main() {
     runTestShflXor<float>();
     runTestShflXor<double>();
     runTestShflXor<long>();
+    runTestShflXor<__half>();
     runTestShflXor<long long>();
     runTestShflXor<unsigned int>();
     runTestShflXor<unsigned long>();