Merge branch 'amd-develop' into amd-master

Change-Id: I8921e67e352e35e4c496e78a797fb309279ab7d0 [ROCm/clr commit: 62870fdc39]
2017-03-14 15:57:38 +05:30
Commit b054a30dec
@@ -189,7 +189,7 @@ if(HIP_PLATFORM STREQUAL "hcc")

    execute_process(COMMAND ${HCC_HOME}/bin/hcc-config --ldflags OUTPUT_VARIABLE HCC_LD_FLAGS)
    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${HCC_LD_FLAGS} -Wl,-Bsymbolic")
-    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --amdgpu-target=gfx701 --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803")
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --amdgpu-target=gfx701 --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803 --amdgpu-target=gfx900")
    add_library(hip_hcc SHARED ${SOURCE_FILES_RUNTIME})
    target_link_libraries(hip_hcc PRIVATE hc_am)
    add_library(hip_hcc_static STATIC ${SOURCE_FILES_RUNTIME})
@@ -23,8 +23,8 @@ use File::Basename;
 # HSA_PATH  : Path to HSA dir (default /opt/rocm/hsa).  Used on AMD platforms only.

 if(scalar @ARGV == 0){
-print "No Arguments passed, exiting ...\n";
-exit(-1);
+    print "No Arguments passed, exiting ...\n";
+    exit(-1);
 }

 #---
@@ -74,6 +74,7 @@ $target_gfx701 = 0;
 $target_gfx801 = 0;
 $target_gfx802 = 0;
 $target_gfx803 = 0;
+$target_gfx900 = 0;

 if ($HIP_PLATFORM eq "hcc") {
    $HSA_PATH=$ENV{'HSA_PATH'} // "/opt/rocm/hsa";
@@ -189,18 +190,18 @@ if ($verbose & 0x4) {
 # Handle code object generation
 my $ISACMD="";
 if($HIP_PLATFORM eq "hcc"){
-  $ISACMD .= "set ROCM_PATH=$ROCM_PATH && set ROCM_TARGET=$ROCM_TARGET && $HIP_PATH/bin/hccgenco.sh ";
-  if($ARGV[0] eq "--genco"){
-    foreach $isaarg (@ARGV[1..$#ARGV]){
-      $ISACMD .= " ";
-      $ISACMD .= $isaarg;
+    $ISACMD .= "set ROCM_PATH=$ROCM_PATH && set ROCM_TARGET=$ROCM_TARGET && $HIP_PATH/bin/hccgenco.sh ";
+    if($ARGV[0] eq "--genco"){
+        foreach $isaarg (@ARGV[1..$#ARGV]){
+            $ISACMD .= " ";
+            $ISACMD .= $isaarg;
+        }
+        if ($verbose & 0x1) {
+            print "hipcc-cmd: ", $ISACMD, "\n";
+        }
+        system($ISACMD) and die();
+        exit(0);
    }
-    if ($verbose & 0x1) {
-      print "hipcc-cmd: ", $ISACMD, "\n";
-    }
-    system($ISACMD) and die();
-    exit(0);
-  }
 }

 if(($HIP_PLATFORM eq "hcc")){
@@ -210,18 +211,18 @@ if(($HIP_PLATFORM eq "hcc")){
 }

 if($HIP_PLATFORM eq "nvcc"){
-  $ISACMD .= "$HIP_PATH/bin/hipcc -ptx ";
-  if($ARGV[0] eq "--genco"){
-    foreach $isaarg (@ARGV[1..$#ARGV]){
-      $ISACMD .= " ";
-      $ISACMD .= $isaarg;
+    $ISACMD .= "$HIP_PATH/bin/hipcc -ptx ";
+    if($ARGV[0] eq "--genco"){
+        foreach $isaarg (@ARGV[1..$#ARGV]){
+            $ISACMD .= " ";
+            $ISACMD .= $isaarg;
+        }
+        if ($verbose & 0x1) {
+            print "hipcc-cmd: ", $ISACMD, "\n";
+        }
+        system($ISACMD) and die();
+        exit(0);
    }
-    if ($verbose & 0x1) {
-      print "hipcc-cmd: ", $ISACMD, "\n";
-    }
-    system($ISACMD) and die();
-    exit(0);
-  }
 }

 my $toolArgs = "";  # arguments to pass to the hcc or nvcc tool
@@ -247,20 +248,25 @@ foreach $arg (@ARGV)
    }
    if($arg eq '--amdgpu-target=gfx701')
    {
-       $target_gfx701 = 1;
+        $target_gfx701 = 1;
    }
    if($arg eq '--amdgpu-target=gfx801')
    {
-       $target_gfx801 = 1;
+        $target_gfx801 = 1;
    }
    if($arg eq '--amdgpu-target=gfx802')
    {
-       $target_gfx802 = 1;
+        $target_gfx802 = 1;
    }
    if($arg eq '--amdgpu-target=gfx803')
    {
-       $target_gfx803 = 1;
+        $target_gfx803 = 1;
    }
+    if($arg eq '--amdgpu-target=gfx900')
+    {
+        $target_gfx900 = 1;
+    }
+
    if(($trimarg eq '-stdlib=libstdc++') and ($setStdLib eq 0))
    {
        $HIPCXXFLAGS .= $HCC_WA_FLAGS;
@@ -320,6 +326,33 @@ foreach $arg (@ARGV)
    }
    $toolArgs .= " $arg" unless $swallowArg;
 }
+foreach my $target (split(/,/, $ENV{HCC_AMDGPU_TARGET}))
+{
+    if($target eq 'gfx701')
+    {
+        $target_gfx701 = 1;
+    }
+    if($target eq 'gfx801')
+    {
+        $target_gfx801 = 1;
+    }
+    if($target eq 'gfx802')
+    {
+        $target_gfx802 = 1;
+    }
+    if($target eq 'gfx803')
+    {
+        $target_gfx803 = 1;
+    }
+    if($target eq 'gfx900')
+    {
+        $target_gfx900 = 1;
+    }
+}
+if ($target_gfx701 eq 0 and $target_gfx801 eq 0 and $target_gfx802 eq 0 and $target_gfx803 eq 0 and $target_gfx900 eq 0)
+{
+    $target_gfx803 = 1;
+}

 if($HIP_PLATFORM eq "hcc"){

@@ -343,12 +376,10 @@ if($HIP_PLATFORM eq "hcc"){
        $HIPCXXFLAGS .= " -D__HIP_ARCH_GFX803__=1 ";
        $ENV{HCC_EXTRA_LIBRARIES_GFX803}="$HIP_PATH/lib/hip_hc_gfx803.ll\n";
    }
-    if ($target_gfx701 eq 0 and $target_gfx801 eq 0 and $target_gfx802 eq 0 and $target_gfx803 eq 0)
-    {
-        $HIPLDFLAGS .= " --amdgpu-target=gfx701 --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803";
-        $ENV{HCC_EXTRA_LIBRARIES_GFX803}="$HIP_PATH/lib/hip_hc_gfx803.ll\n";
+    if ($target_gfx900 eq 1) {
+        $HIPLDFLAGS .= " --amdgpu-target=gfx900";
+        $HIPCXXFLAGS .= " -D__HIP_ARCH_GFX900__=1 ";
    }
-
 }

 if ($hasC and $HIP_PLATFORM eq 'nvcc') {
@@ -407,3 +438,5 @@ if ($runCmd) {
    }
    system ("$CMD") and die ();
 }
+
+# vim: ts=4:sw=4:expandtab:smartindent
@@ -166,10 +166,10 @@ Both nvcc and hcc make two passes over the code: one for host code and one for d
 
 ```
 // #ifdef __CUDA_ARCH__  
-#ifdef __HIP_DEVICE_COMPILE__ && (__HIP_DEVICE_COMPILE__ == 1) 
+#if __HIP_DEVICE_COMPILE__
 ```
 
-Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 0 or 1, and it doesnt represent the feature capability of the target device.  
+Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 1 or undefined, and it doesn't represent the feature capability of the target device.  


 ### Compiler Defines: Summary
@@ -178,7 +178,7 @@ Unlike `__CUDA_ARCH__`, the `__HIP_DEVICE_COMPILE__` value is 0 or 1, and it doe
 |HIP-related defines:|
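-|`__HIP_PLATFORM_HCC___`| Defined | Undefined |  Defined if targeting hcc platform; undefined otherwise |
+|`__HIP_PLATFORM_HCC__`| Defined | Undefined |  Defined if targeting hcc platform; undefined otherwise |
-|`__HIP_PLATFORM_NVCC___`| Undefined | Defined |  Defined if targeting nvcc platform; undefined otherwise |
+|`__HIP_PLATFORM_NVCC__`| Undefined | Defined |  Defined if targeting nvcc platform; undefined otherwise |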
-|`__HIP_DEVICE_COMPILE__`     | 1 if compiling for device; 0 if compiling for host  |1 if compiling for device; 0 if compiling for host  | Undefined 
+|`__HIP_DEVICE_COMPILE__`     | 1 if compiling for device; undefined if compiling for host  |1 if compiling for device; undefined if compiling for host  | Undefined 
 |`__HIPCC__`		| Defined   | Defined 		|  Undefined
 |`__HIP_ARCH_*` | 0 or 1 depending on feature support (see below) | 0 or 1 depending on feature support (see below) | 0 
 |nvcc-related defines:|
@@ -28,7 +28,7 @@ THE SOFTWARE.

 #if __cplusplus
 #define COMPLEX_ADD_OP_OVERLOAD(type) \
-__device__ __host__ static type operator + (const type& lhs, const type& rhs) { \
+__device__ __host__ static inline type operator + (const type& lhs, const type& rhs) { \
  type ret; \
  ret.x = lhs.x + rhs.x ; \
  ret.y = lhs.y + rhs.y ; \
@@ -36,7 +36,7 @@ __device__ __host__ static type operator + (const type& lhs, const type& rhs) {
 }

 #define COMPLEX_SUB_OP_OVERLOAD(type) \
-__device__ __host__ static type operator - (const type& lhs, const type& rhs) { \
+__device__ __host__ static inline type operator - (const type& lhs, const type& rhs) { \
  type ret; \
  ret.x = lhs.x - rhs.x; \
  ret.y = lhs.y - rhs.y; \
@@ -44,7 +44,7 @@ __device__ __host__ static type operator - (const type& lhs, const type& rhs) {
 }

 #define COMPLEX_MUL_OP_OVERLOAD(type) \
-__device__ __host__ static type operator * (const type& lhs, const type& rhs) { \
+__device__ __host__ static inline type operator * (const type& lhs, const type& rhs) { \
  type ret; \
  ret.x = lhs.x * rhs.x - lhs.y * rhs.y; \
  ret.y = lhs.x * rhs.y + lhs.y * rhs.x; \
@@ -52,7 +52,7 @@ __device__ __host__ static type operator * (const type& lhs, const type& rhs) {
 }

 #define COMPLEX_DIV_OP_OVERLOAD(type) \
-__device__ __host__ static type operator / (const type& lhs, const type& rhs) { \
+__device__ __host__ static inline type operator / (const type& lhs, const type& rhs) { \
  type ret; \
  ret.x = (lhs.x * rhs.x + lhs.y * rhs.y); \
  ret.y = (rhs.x * lhs.y - lhs.x * rhs.y); \
@@ -88,7 +88,7 @@ __device__ __host__ static inline type& operator /= (type& lhs, const type& rhs)
 }

 #define COMPLEX_SCALAR_PRODUCT(type, type1) \
-__device__ __host__ static type operator * (const type& lhs, type1 rhs) { \
+__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \
  type ret; \
  ret.x = lhs.x * rhs; \
  ret.y = lhs.y * rhs; \
@@ -226,6 +226,8 @@ __device__ int __all(  int input);
 __device__ int __any( int input);
 __device__  unsigned long long int __ballot( int input);

+#if __HIP_ARCH_GFX701__ == 0
+
 // warp shuffle functions
 #ifdef __cplusplus
 __device__ int __shfl(int input, int lane, int width=warpSize);
@@ -247,6 +249,18 @@ __device__ float __shfl_down(float input, unsigned int lane_delta, int width);
 __device__ float __shfl_xor(float input, int lane_mask, int width);
 #endif

+__device__ unsigned __hip_ds_bpermute(int index, unsigned src);
+__device__ float __hip_ds_bpermutef(int index, float src);
+__device__ unsigned __hip_ds_permute(int index, unsigned src);
+__device__ float __hip_ds_permutef(int index, float src);
+
+__device__ unsigned __hip_ds_swizzle(unsigned int src, int pattern);
+__device__ float __hip_ds_swizzlef(float src, int pattern);
+
+__device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl);
+
+#endif
+
 __host__ __device__ int min(int arg1, int arg2);
 __host__ __device__ int max(int arg1, int arg2);

@@ -321,16 +335,6 @@ __device__ static inline void __threadfence(void) {
 //__device__ void  __threadfence_system(void) __attribute__((deprecated("Provided with workaround configuration, see hip_kernel_language.md for details")));
 __device__ void  __threadfence_system(void) ;

-__device__ unsigned __hip_ds_bpermute(int index, unsigned src);
-__device__ float __hip_ds_bpermutef(int index, float src);
-__device__ unsigned __hip_ds_permute(int index, unsigned src);
-__device__ float __hip_ds_permutef(int index, float src);
-
-__device__ unsigned __hip_ds_swizzle(unsigned int src, int pattern);
-__device__ float __hip_ds_swizzlef(float src, int pattern);
-
-__device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl);
-
 // doxygen end Fence Fence
 /**
 * @}
@@ -62,7 +62,12 @@ typedef struct ihipStream_t *hipStream_t;

 #define hipIpcMemLazyEnablePeerAccess 0

-typedef struct ihipIpcMemHandle_t *hipIpcMemHandle_t;
+#define HIP_IPC_HANDLE_SIZE 64
+
+typedef struct hipIpcMemHandle_st
+{
+    char reserved[HIP_IPC_HANDLE_SIZE];
+}hipIpcMemHandle_t;

 //TODO: IPC event handle currently unsupported
 struct ihipIpcEventHandle_t;
@@ -853,6 +858,8 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr)
 *  @param[out] ptr Pointer to the allocated memory
 *  @param[in]  size Requested memory size
 *
+ *  If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
 *  @return #hipSuccess
 *
 *  @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray, hipHostFree, hipHostMalloc
@@ -865,6 +872,8 @@ hipError_t hipMalloc(void** ptr, size_t size) ;
 *  @param[out] ptr Pointer to the allocated host pinned memory
 *  @param[in]  size Requested memory size
 *
+ *  If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
 *  @return #hipSuccess, #hipErrorMemoryAllocation
 *
 *  @deprecated use hipHostMalloc() instead
@@ -878,6 +887,8 @@ hipError_t hipMallocHost(void** ptr, size_t size) __attribute__((deprecated("use
 *  @param[in]  size Requested memory size
 *  @param[in]  flags Type of host memory allocation
 *
+ *  If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
 *  @return #hipSuccess, #hipErrorMemoryAllocation
 *
 *  @see hipSetDeviceFlags, hipHostFree
@@ -891,6 +902,8 @@ hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags) ;
 *  @param[in]  size Requested memory size
 *  @param[in]  flags Type of host memory allocation
 *
+ *  If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
 *  @return #hipSuccess, #hipErrorMemoryAllocation
 *
 *  @deprecated use hipHostMalloc() instead
@@ -975,6 +988,9 @@ hipError_t hipHostUnregister(void* hostPtr) ;
 *  @param[out] pitch Pitch for allocation (in bytes)
 *  @param[in]  width Requested pitched allocation width (in bytes)
 *  @param[in]  height Requested pitched allocation height
+ *
+ *  If width or height is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
 *  @return Error code
 *
 *  @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D, hipMalloc3DArray, hipHostMalloc
@@ -1236,6 +1252,9 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t st
 hipError_t hipMemGetInfo  (size_t * free, size_t * total)   ;


+hipError_t hipMemPtrGetInfo(void *ptr, size_t *size);
+
+
 /**
 *  @brief Allocate an array on the device.
 *
@@ -1260,7 +1260,7 @@ __device__ __host__ static inline type& operator op (type& val) { \
 }

 #define DECLOP_1VAR_POSTOP(type, op) \
-__device__ __host__ static inline type operator op (type& val, int i) { \
+__device__ __host__ static inline type operator op (type& val, int) { \
  type ret; \
  ret.x = val.x; \
  val.x op; \
@@ -1326,7 +1326,7 @@ __device__ __host__ static inline type& operator op (type& val) { \
 }

 #define DECLOP_2VAR_POSTOP(type, op) \
-__device__ __host__ static inline type operator op (type& val, int i) { \
+__device__ __host__ static inline type operator op (type& val, int) { \
  type ret; \
  ret.x = val.x; \
  ret.y = val.y; \
@@ -1337,7 +1337,7 @@ __device__ __host__ static inline type operator op (type& val, int i) { \

 #define DECLOP_2VAR_COMP(type, op) \
 __device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
-  return lhs.x op rhs.x && lhs.y op rhs.y; \
+  return (lhs.x op rhs.x) && (lhs.y op rhs.y); \
 }

 #define DECLOP_2VAR_1IN_1OUT(type, op) \
@@ -1350,7 +1350,7 @@ __device__ __host__ static inline type operator op(type &rhs) { \

 #define DECLOP_2VAR_1IN_BOOLOUT(type, op) \
 __device__ __host__ static inline bool operator op (type &rhs) { \
-  return op rhs.x && op rhs.y; \
+  return (op rhs.x) && (op rhs.y); \
 }


@@ -1401,7 +1401,7 @@ __device__ __host__ static inline type& operator op (type& val) { \
 }

 #define DECLOP_3VAR_POSTOP(type, op) \
-__device__ __host__ static inline type operator op (type& val, int i) { \
+__device__ __host__ static inline type operator op (type& val, int) { \
  type ret; \
  ret.x = val.x; \
  ret.y = val.y; \
@@ -1414,7 +1414,7 @@ __device__ __host__ static inline type operator op (type& val, int i) { \

 #define DECLOP_3VAR_COMP(type, op) \
 __device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
-  return lhs.x op rhs.x && lhs.y op rhs.y && lhs.z op rhs.z; \
+  return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \
 }

 #define DECLOP_3VAR_1IN_1OUT(type, op) \
@@ -1428,7 +1428,7 @@ __device__ __host__ static inline type operator op(type &rhs) { \

 #define DECLOP_3VAR_1IN_BOOLOUT(type, op) \
 __device__ __host__ static inline bool operator op (type &rhs) { \
-  return op rhs.x && op rhs.y && op rhs.z; \
+  return (op rhs.x) && (op rhs.y) && (op rhs.z); \
 }


@@ -1484,7 +1484,7 @@ __device__ __host__ static inline type& operator op (type& val) { \
 }

 #define DECLOP_4VAR_POSTOP(type, op) \
-__device__ __host__ static inline type operator op (type& val, int i) { \
+__device__ __host__ static inline type operator op (type& val, int) { \
  type ret; \
  ret.x = val.x; \
  ret.y = val.y; \
@@ -1499,7 +1499,7 @@ __device__ __host__ static inline type operator op (type& val, int i) { \

 #define DECLOP_4VAR_COMP(type, op) \
 __device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
-  return lhs.x op rhs.x && lhs.y op rhs.y && lhs.z op rhs.z && lhs.w op rhs.w; \
+  return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \
 }

 #define DECLOP_4VAR_1IN_1OUT(type, op) \
@@ -1514,7 +1514,7 @@ __device__ __host__ static inline type operator op(type &rhs) { \

 #define DECLOP_4VAR_1IN_BOOLOUT(type, op) \
 __device__ __host__ static inline bool operator op (type &rhs) { \
-  return op rhs.x && op rhs.y && op rhs.z && op rhs.w; \
+  return (op rhs.x) && (op rhs.y) && (op rhs.z) && (op rhs.w); \
 }


@@ -27,13 +27,6 @@ THE SOFTWARE.
 // Other compiler (GCC,ICC,etc) need to set one of these macros explicitly
 #if defined(__HCC__)
 #define __HIP_PLATFORM_HCC__
-
-#if defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)
-#define __HIP_DEVICE_COMPILE__ 1
-#else
-#define __HIP_DEVICE_COMPILE__ 0
-#endif
-
 #endif //__HCC__

 // Auto enable __HIP_PLATFORM_NVCC__ if compiling with NVCC
@@ -43,14 +36,12 @@ THE SOFTWARE.
 #define __HIPCC__
 #endif

-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ != 0)
-#define __HIP_DEVICE_COMPILE__ 1
-#else
-#define __HIP_DEVICE_COMPILE__ 0
-#endif
-
 #endif //__NVCC__

+// Auto enable __HIP_DEVICE_COMPILE__ if compiled in HCC or NVCC device path
+#if (defined(__HCC_ACCELERATOR__) && __HCC_ACCELERATOR__ != 0) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ != 0)
+  #define __HIP_DEVICE_COMPILE__ 1
+#endif

 #if __HIP_DEVICE_COMPILE__ == 0
 // 32-bit Atomics
@@ -106,6 +106,7 @@ typedef struct hipDeviceProp_t {
    size_t maxSharedMemoryPerMultiProcessor;    ///< Maximum Shared Memory Per Multiprocessor.
    int isMultiGpuBoard;                        ///< 1 if device is on a multi-GPU board, 0 if not.
    int canMapHostMemory;                       ///< Check whether HIP can map host memory
+    int gcnArch;                                ///< AMD GCN Arch Value. Eg: 803, 701
 } hipDeviceProp_t;


@@ -32,7 +32,7 @@ THE SOFTWARE.
 	}\
 }

-/* 
+/*
 * Square each element in the array A and write to array C.
 */
 template <typename T>
@@ -58,16 +58,18 @@ int main(int argc, char *argv[])
    hipDeviceProp_t props;
    CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/));
    printf ("info: running on device %s\n", props.name);
-
+    #ifdef __HIP_PLATFORM_HCC__
+      printf ("info: architecture on AMD GPU device is: %d\n",props.gcnArch);
+    #endif
    printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
    A_h = (float*)malloc(Nbytes);
    CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
    C_h = (float*)malloc(Nbytes);
    CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess );
 	// Fill with Phi + i
-    for (size_t i=0; i<N; i++) 
+    for (size_t i=0; i<N; i++)
    {
-        A_h[i] = 1.618f + i; 
+        A_h[i] = 1.618f + i;
    }

    printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
@@ -6,9 +6,12 @@

 #include "ResultDatabase.h"

+enum MallocMode {MallocPinned, MallocUnpinned, MallocRegistered};
+
 // Cmdline parms:
 bool          p_verbose = false;
-bool          p_pinned  = true;
+MallocMode    p_malloc_mode  = MallocPinned;
+int           p_numa_ctl = -1;
 int           p_iterations   = 10;
 int           p_beatsperiteration=1;
 int           p_device  = 0;
@@ -21,7 +24,7 @@ bool          p_h2d   = true;
 bool          p_d2h   = true;
 bool          p_bidir = true;

-
+//#define NO_CHECK


 #define CHECK_HIP_ERROR()                                                    \
@@ -36,6 +39,14 @@ bool          p_bidir = true;
 }


+std::string mallocModeString(int mallocMode) {  // label used in result names and printConfig
+    switch (mallocMode) {
+        case MallocPinned  : return "pinned";
+        case MallocUnpinned: return "unpinned";
+        case MallocRegistered: return "registered";
+        default: return "mallocmode-UNKNOWN";
+    }
+}

 // ****************************************************************************
 int sizeToBytes(int size) {
@@ -106,7 +117,7 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)

    // Create some host memory pattern
    float *hostMem = NULL;
-    if (p_pinned)
+    if (p_malloc_mode == MallocPinned)
    {
        hipHostMalloc((void**)&hostMem, sizeof(float) * numMaxFloats);
        while (hipGetLastError() != hipSuccess)
@@ -116,20 +127,33 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
            --nSizes;
            if (nSizes < 1)
            {
-                std::cerr << "Error: Couldn't allocated any pinned buffer\n";
+                std::cerr << "Error: Couldn't allocate any pinned buffer\n";
            return;
            }
            numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
            hipHostMalloc((void**)&hostMem, sizeof(float) * numMaxFloats);
        }
    }
-    else
+    else if (p_malloc_mode == MallocUnpinned) 
    {
        if (p_alignedhost) {
            hostMem = (float*)aligned_alloc(p_alignedhost, numMaxFloats*sizeof(float));
        } else {
            hostMem = new float[numMaxFloats];
        }
+    } 
+    else if (p_malloc_mode == MallocRegistered) 
+    {
+        if (p_numa_ctl == -1) {
+            hostMem = (float*)malloc(numMaxFloats*sizeof(float));
+        }
+
+        hipHostRegister(hostMem, numMaxFloats * sizeof(float), 0);
+        CHECK_HIP_ERROR();
+    }
+    else 
+    {
+        assert(0);
    }

    for (int i = 0; i < numMaxFloats; i++)
@@ -146,7 +170,7 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
        --nSizes;
        if (nSizes < 1)
        {
-            std::cerr << "Error: Couldn't allocated any device buffer\n";
+            std::cerr << "Error: Couldn't allocate any device buffer\n";
            return;
        }
        numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
@@ -199,8 +223,8 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
            } else {
                sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
            }
-            resultDB.AddResult(std::string("H2D_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed);
-            resultDB.AddResult(std::string("H2D_Time") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "ms", t);
+            resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr, "GB/sec", speed);
+            resultDB.AddResult(std::string("H2D_Time") + "_" + mallocModeString(p_malloc_mode), sizeStr, "ms", t);

            if (p_onesize) {
                break;
@@ -212,6 +236,8 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
        numMaxFloats = sizeToBytes(p_onesize) / sizeof(float);
    }

+#ifndef NO_CHECK
+
    // Check.  First reset the host memory, then copy-back result.  Then compare against original ref value.
    for (int i = 0; i < numMaxFloats; i++)
    {
@@ -225,24 +251,36 @@ void RunBenchmark_H2D(ResultDatabase &resultDB)
            printf ("error: H2D. i=%d reference:%6.f != copyback:%6.2f\n", i, ref, hostMem[i]);
        }
    }
+#endif


    // Cleanup
    hipFree((void*)device);
    CHECK_HIP_ERROR();
-    if (p_pinned)
-    {
+    switch (p_malloc_mode) {
+    case MallocPinned:
        hipHostFree((void*)hostMem);
        CHECK_HIP_ERROR();
-    }
-    else
-    {
+        break;
+
+    case MallocUnpinned:
        if (p_alignedhost) {
-            delete[] hostMem;
+            free(hostMem);      // allocated with aligned_alloc()
        } else {
-            free(hostMem);
+            delete[] hostMem;   // allocated with new float[]
        }
+        break;
+
+    case MallocRegistered:
+        hipHostUnregister(hostMem);
+        CHECK_HIP_ERROR();
+        free(hostMem);
+        break;
+    default:
+        assert(0);
    }
+
+
    hipEventDestroy(start);
    hipEventDestroy(stop);
 }
@@ -257,38 +295,56 @@ void RunBenchmark_D2H(ResultDatabase &resultDB)
    // Create some host memory pattern
    float *hostMem1;
    float *hostMem2;
-    if (p_pinned)
+    if (p_malloc_mode == MallocPinned)
    {
        hipHostMalloc((void**)&hostMem1, sizeof(float)*numMaxFloats);
        hipError_t err1 = hipGetLastError();
        hipHostMalloc((void**)&hostMem2, sizeof(float)*numMaxFloats);
        hipError_t err2 = hipGetLastError();
-	while (err1 != hipSuccess || err2 != hipSuccess)
-	{
-	    // free the first buffer if only the second failed
-	    if (err1 == hipSuccess)
-	        hipHostFree((void*)hostMem1);
+        while (err1 != hipSuccess || err2 != hipSuccess)
+        {
+            // free the first buffer if only the second failed
+            if (err1 == hipSuccess)
+                hipHostFree((void*)hostMem1);

-	    // drop the size and try again
-	    if (p_verbose) std::cout << " - dropping size allocating pinned mem\n";
-	    --nSizes;
-	    if (nSizes < 1)
-	    {
-            std::cerr << "Error: Couldn't allocated any pinned buffer\n";
-		return;
-	    }
-	    numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
-            hipHostMalloc((void**)&hostMem1, sizeof(float)*numMaxFloats);
-            err1 = hipGetLastError();
-            hipHostMalloc((void**)&hostMem2, sizeof(float)*numMaxFloats);
-            err2 = hipGetLastError();
-	}
-   }
-    else
+            // drop the size and try again
+            if (p_verbose) std::cout << " - dropping size allocating pinned mem\n";
+            --nSizes;
+            if (nSizes < 1)
+            {
+                std::cerr << "Error: Couldn't allocate any pinned buffer\n";
+            return;
+            }
+            numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
+                hipHostMalloc((void**)&hostMem1, sizeof(float)*numMaxFloats);
+                err1 = hipGetLastError();
+                hipHostMalloc((void**)&hostMem2, sizeof(float)*numMaxFloats);
+                err2 = hipGetLastError();
+        }
+    }
+    else if (p_malloc_mode == MallocUnpinned) 
    {
        hostMem1 = new float[numMaxFloats];
        hostMem2 = new float[numMaxFloats];
    }
+    else if (p_malloc_mode == MallocRegistered) 
+    {
+        if (p_numa_ctl == -1) {
+            hostMem1 = (float*)malloc(numMaxFloats*sizeof(float));
+            hostMem2 = (float*)malloc(numMaxFloats*sizeof(float));
+        }
+
+        hipHostRegister(hostMem1, numMaxFloats * sizeof(float), 0);
+        CHECK_HIP_ERROR();
+        hipHostRegister(hostMem2, numMaxFloats * sizeof(float), 0);
+        CHECK_HIP_ERROR();
+    }
+    else 
+    {
+        assert(0);
+    }
+
+
    for (int i=0; i<numMaxFloats; i++)
        hostMem1[i] = i % 77;

@@ -301,7 +357,7 @@ void RunBenchmark_D2H(ResultDatabase &resultDB)
        --nSizes;
        if (nSizes < 1)
        {
-            std::cerr << "Error: Couldn't allocated any device buffer\n";
+            std::cerr << "Error: Couldn't allocate any device buffer\n";
            return;
        }
        numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
@@ -358,8 +414,8 @@ void RunBenchmark_D2H(ResultDatabase &resultDB)
            } else {
                sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
            }
-            resultDB.AddResult(std::string("D2H_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed);
-            resultDB.AddResult(std::string("D2H_Time") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "ms", t);
+            resultDB.AddResult(std::string("D2H_Bandwidth") +"_" + mallocModeString(p_malloc_mode) , sizeStr, "GB/sec", speed);
+            resultDB.AddResult(std::string("D2H_Time") +"_" + mallocModeString(p_malloc_mode) , sizeStr, "ms", t);
            if (p_onesize) {
                break;
            }
@@ -381,20 +437,31 @@ void RunBenchmark_D2H(ResultDatabase &resultDB)
    // Cleanup
    hipFree((void*)device);
    CHECK_HIP_ERROR();
-    if (p_pinned)
-    {
+
+    switch (p_malloc_mode) {
+    case MallocPinned:
        hipHostFree((void*)hostMem1);
        CHECK_HIP_ERROR();
        hipHostFree((void*)hostMem2);
        CHECK_HIP_ERROR();
-    }
-    else
-    {
+        break;
+    case MallocUnpinned:
        delete[] hostMem1;
        delete[] hostMem2;
-        hipEventDestroy(start);
-	    hipEventDestroy(stop);
+        break;
+    case MallocRegistered:
+        hipHostUnregister(hostMem1);
+        CHECK_HIP_ERROR();
+        free(hostMem1);
+        hipHostUnregister(hostMem2);
+        free(hostMem2);
+        break;
+    default:
+        assert(0);
    }
+
+    hipEventDestroy(start);
+    hipEventDestroy(stop);
 }


@@ -409,7 +476,7 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)

    // Create some host memory pattern
    float *hostMem[2] = {NULL, NULL};
-    if (p_pinned)
+    if (p_malloc_mode == MallocPinned)
    {
        while (1) 
        {
@@ -424,18 +491,34 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
                --nSizes;
                if (nSizes < 1)
                {
-                    std::cerr << "Error: Couldn't allocated any pinned buffer\n";
+                    std::cerr << "Error: Couldn't allocate any pinned buffer\n";
                return;
                }
                numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
            }
        }
    }
-    else
+    else if (p_malloc_mode == MallocUnpinned) 
    {
        hostMem[0] = new float[numMaxFloats];
        hostMem[1] = new float[numMaxFloats];
    }
+    else if (p_malloc_mode == MallocRegistered) 
+    {
+        if (p_numa_ctl == -1) {
+            hostMem[0] = (float*)malloc(numMaxFloats*sizeof(float));
+            hostMem[1] = (float*)malloc(numMaxFloats*sizeof(float));
+        }
+        hipHostRegister(hostMem[0], numMaxFloats * sizeof(float), 0);
+        CHECK_HIP_ERROR();
+        hipHostRegister(hostMem[1], numMaxFloats * sizeof(float), 0);
+        CHECK_HIP_ERROR();
+    } 
+    else 
+    {
+        assert(0);
+    }
+

    for (int i = 0; i < numMaxFloats; i++)
    {
@@ -459,7 +542,7 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
            --nSizes;
            if (nSizes < 1)
            {
-                std::cerr << "Error: Couldn't allocated any device buffer\n";
+                std::cerr << "Error: Couldn't allocate any device buffer\n";
                return;
            }
            numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
@@ -512,8 +595,8 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
            double speed = (double(sizeToBytes(thisSize)) / (1000*1000)) / t;
            char sizeStr[256];
            sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
-            resultDB.AddResult(std::string("Bidir_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed);
-            resultDB.AddResult(std::string("Bidir_Time") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "ms", t);
+            resultDB.AddResult(std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr, "GB/sec", speed);
+            resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode), sizeStr, "ms", t);
        }
    }

@@ -521,17 +604,27 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
    hipFree((void*)deviceMem[0]);
    hipFree((void*)deviceMem[1]);
    CHECK_HIP_ERROR();
-    if (p_pinned)
-    {
+    switch (p_malloc_mode) {
+    case MallocPinned:
        hipHostFree((void*)hostMem[0]);
        hipHostFree((void*)hostMem[1]);
        CHECK_HIP_ERROR();
-    }
-    else
-    {
+        break;
+    case MallocUnpinned:
        delete[] hostMem[0];
        delete[] hostMem[1];
-    }
+        break;
+    case MallocRegistered:
+        for (int i=0; i<2; i++) {
+            hipHostUnregister(hostMem[i]);
+            CHECK_HIP_ERROR();
+            free(hostMem[i]);
+        }
+        break;
+    default:
+        assert(0);
+    };
+
    hipEventDestroy(start);
    hipEventDestroy(stop);
    hipStreamDestroy(stream[0]);
@@ -557,7 +650,7 @@ void printConfig() {
    hipDeviceProp_t props;
    hipGetDeviceProperties(&props, p_device);

-    printf ("Device:%s Mem=%.1fGB #CUs=%d Freq=%.0fMhz  Pinned=%s\n", props.name, props.totalGlobalMem/1024.0/1024.0/1024.0, props.multiProcessorCount, props.clockRate/1000.0, p_pinned ? "YES" : "NO");
+    printf ("Device:%s Mem=%.1fGB #CUs=%d Freq=%.0fMhz  MallocMode=%s\n", props.name, props.totalGlobalMem/1024.0/1024.0/1024.0, props.multiProcessorCount, props.clockRate/1000.0, mallocModeString(p_malloc_mode).c_str());
 }

 void help() {
@@ -601,7 +694,9 @@ int parseStandardArguments(int argc, char *argv[])
               failed("Bad onesize argument"); 
            }
        } else if (!strcmp(arg, "--unpinned")) {
-            p_pinned = 0;
+            p_malloc_mode = MallocUnpinned;
+        } else if (!strcmp(arg, "--registered")) {
+            p_malloc_mode = MallocRegistered;
        } else if (!strcmp(arg, "--h2d")) {
            p_h2d   = true;
            p_d2h   = false;
@@ -3,6 +3,10 @@ ifeq (,$(HIP_PATH))
 	HIP_PATH=../../..
 endif

+ifeq (gfx701, $(findstring gfx701,$(HCC_AMDGPU_TARGET)))
+	$(error gfx701 is not a supported device for this sample)
+endif
+
 HIPCC=$(HIP_PATH)/bin/hipcc

 TARGET=hcc
@@ -22,7 +26,7 @@ CXX=$(HIPCC)


 $(EXECUTABLE): $(OBJECTS)
-	$(HIPCC)  $(OBJECTS) -o $@
+	$(HIPCC) $(OBJECTS) -o $@


 test: $(EXECUTABLE)
@@ -33,4 +37,3 @@ clean:
 	rm -f $(EXECUTABLE)
 	rm -f $(OBJECTS)
 	rm -f $(HIP_PATH)/src/*.o
-
@@ -3,6 +3,10 @@ ifeq (,$(HIP_PATH))
 	HIP_PATH=../../..
 endif

+ifeq (gfx701, $(findstring gfx701,$(HCC_AMDGPU_TARGET)))
+	$(error gfx701 is not a supported device for this sample)
+endif
+
 HIPCC=$(HIP_PATH)/bin/hipcc

 TARGET=hcc
@@ -758,11 +758,24 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop)
    prop->isMultiGpuBoard = 0 ? gpuAgentsCount < 2 : 1;

    // Get agent name
-#if HIP_USE_PRODUCT_NAME
+
    err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, &(prop->name));
-#else
-    err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_NAME, &(prop->name));
-#endif
+    char archName[256];
+    err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_NAME, &archName);
+    
+    if(strcmp(archName,"gfx701")==0){
+      prop->gcnArch = 701;
+    }
+    if(strcmp(archName,"gfx801")==0){
+      prop->gcnArch = 801;
+    }
+    if(strcmp(archName,"gfx802")==0){
+      prop->gcnArch = 802;
+    }
+    if(strcmp(archName,"gfx803")==0){
+      prop->gcnArch = 803;
+    }
+
    DeviceErrorCheck(err);

    // Get agent node
@@ -1790,6 +1803,20 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind,
 }


+// Emit one trace line (via tprintf) describing a pointer and its memory-tracker record.
+//   dbFlag  : trace mask forwarded to tprintf (callers in this file pass DB_COPY).
+//   tag     : label printed in front of the pointer value.
+//   ptr     : the pointer being described.
+//   ptrInfo : tracker record whose fields are printed.
+// "tracked" is reported as (_sizeBytes != 0) and "registered" as !_isAmManaged;
+// "home_dev" is the record's _appId field.
+void printPointerInfo(unsigned dbFlag, const char *tag, const void *ptr, const hc::AmPointerInfo &ptrInfo)
+{
+    tprintf (dbFlag, "  %s=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d registered=%d\n",
+             tag, ptr, 
+             ptrInfo._hostPointer, ptrInfo._devicePointer, ptrInfo._sizeBytes,
+             ptrInfo._appId, ptrInfo._sizeBytes != 0, ptrInfo._isInDeviceMem, !ptrInfo._isAmManaged);
+}
+
+
+// TODO : For registered and host memory, if the portable flag is set, we need to recognize that and perform appropriate copy operation.
+// What can happen now is that Portable memory is mapped into multiple devices but Peer access is not enabled.
+// The peer detection logic doesn't see that the memory is already mapped and so tries to use an unpinned copy algorithm.  If this is PinInPlace, then an error can occur.
+// Need to track Portable flag correctly or use new RT functionality to query the peer status for the pointer.
+//
 // TODO - remove kind parm from here or use it below?
 void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn)
 {
@@ -1806,6 +1833,16 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes,
    bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS);
    bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS);

+
+    // Some code in HCC and in printPointerInfo uses _sizeBytes==0 as an indication ptr is not valid, so check it here:
+    if (!dstTracked) {
+        assert (dstPtrInfo._sizeBytes == 0);
+    }
+    if (!srcTracked) {
+        assert (srcPtrInfo._sizeBytes == 0);
+    }
+
+
    hc::hcCommandKind hcCopyDir;
    ihipCtx_t *copyDevice;
    bool forceUnpinnedCopy;
@@ -1818,12 +1855,8 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes,
                 dst, dstPtrInfo._appId, dstPtrInfo._isInDeviceMem,
                 src, srcPtrInfo._appId, srcPtrInfo._isInDeviceMem,
                 sizeBytes, hcMemcpyStr(hcCopyDir), forceUnpinnedCopy);
-        tprintf (DB_COPY, "  dst=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n",
-                 dst, dstPtrInfo._hostPointer, dstPtrInfo._devicePointer, dstPtrInfo._sizeBytes,
-                 dstPtrInfo._appId, dstTracked, dstPtrInfo._isInDeviceMem);
-        tprintf (DB_COPY, "  src=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n",
-                 src, srcPtrInfo._hostPointer, srcPtrInfo._devicePointer, srcPtrInfo._sizeBytes,
-                 srcPtrInfo._appId, srcTracked, srcPtrInfo._isInDeviceMem);
+        printPointerInfo(DB_COPY, "  dst", dst, dstPtrInfo);
+        printPointerInfo(DB_COPY, "  src", src, srcPtrInfo);

        this->ensureHaveQueue(crit);

@@ -1908,12 +1941,8 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes
                 dst, dstPtrInfo._appId, dstPtrInfo._isInDeviceMem,
                 src, srcPtrInfo._appId, srcPtrInfo._isInDeviceMem,
                 sizeBytes, hcMemcpyStr(hcCopyDir), forceUnpinnedCopy);
-        tprintf (DB_COPY, "  dst=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n",
-                 dst, dstPtrInfo._hostPointer, dstPtrInfo._devicePointer, dstPtrInfo._sizeBytes,
-                 dstPtrInfo._appId, dstTracked, dstPtrInfo._isInDeviceMem);
-        tprintf (DB_COPY, "  src=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n",
-                 src, srcPtrInfo._hostPointer, srcPtrInfo._devicePointer, srcPtrInfo._sizeBytes,
-                 srcPtrInfo._appId, srcTracked, srcPtrInfo._isInDeviceMem);
+        printPointerInfo(DB_COPY, "  dst", dst, dstPtrInfo);
+        printPointerInfo(DB_COPY, "  src", src, srcPtrInfo);

        // "tracked" really indicates if the pointer's virtual address is available in the GPU address space.
        // If both pointers are not tracked, we need to fall back to a sync copy.
@@ -36,7 +36,7 @@ THE SOFTWARE.
 #error("This version of HIP requires a newer version of HCC.");
 #endif

-#define USE_IPC 0
+#define USE_IPC 1

 //---
 // Environment variables:
@@ -326,15 +326,15 @@ const hipStream_t hipStreamNull = 0x0;
 /**
 * HIP IPC Handle Size
 */
-#define HIP_IPC_HANDLE_SIZE 64
+#define HIP_IPC_RESERVED_SIZE 24
 class ihipIpcMemHandle_t
 {
 public:
 #if USE_IPC
    hsa_amd_ipc_memory_t ipc_handle; ///< ipc memory handle on ROCr
 #endif
-    char reserved[HIP_IPC_HANDLE_SIZE];
    size_t psize;
+    char reserved[HIP_IPC_RESERVED_SIZE];
 };


@@ -30,11 +30,109 @@ THE SOFTWARE.
 #include "hip/hcc_detail/hip_texture.h"
 #include <hc_am.hpp>

+
+
+// Internal HIP APIS:
+namespace hip_internal {
+
+// Asynchronously copy sizeBytes from src to dst on the given stream.
+// The stream is first passed through ihipSyncAndResolveStream().
+// Returns hipErrorInvalidValue if dst or src is NULL or if no stream was resolved.
+// If locked_copyAsync throws an ihipException, its _code is returned.
+// Otherwise returns hipSuccess.
+hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
+{
+    hipError_t e = hipSuccess;
+
+    stream = ihipSyncAndResolveStream(stream);
+
+    if ((dst == NULL) || (src == NULL)) {
+        e = hipErrorInvalidValue;
+    } else if (stream) {
+        try {
+            stream->locked_copyAsync(dst, src, sizeBytes, kind);
+        }
+        catch (const ihipException &ex) {
+            // Catch by const reference: avoids copying the exception object.
+            e = ex._code;
+        }
+    } else {
+        e = hipErrorInvalidValue;
+    }
+
+    return e;
+}
+
+
+// Record ptr in the HC memory tracker against ctx's device (storing hipFlags with it) and,
+// when the context has peers other than itself, allow those peers to access the memory.
+// Returns 0 on success or -1 if hsa_amd_agents_allow_access fails.
+int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags)
+{
+    int ret = 0;
+
+    auto device = ctx->getWriteableDevice();
+
+    hc::am_memtracker_update(ptr, device->_deviceId, hipFlags);
+    {
+        LockedAccessor_CtxCrit_t crit(ctx->criticalData());
+        // peerCnt always includes self, so the trace reports the number of *other* peers
+        // and access only needs to be granted when there is more than one entry.
+        const int peerCnt = crit->peerCnt();
+        tprintf(DB_MEM, "  allow access to %d other peer(s)\n", peerCnt-1);
+        if (peerCnt > 1) {
+
+            // TODO - remove me:
+            for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) {
+                tprintf (DB_MEM, "    allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":"");
+            }
+
+            hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr);
+            if (s != HSA_STATUS_SUCCESS) {
+                ret = -1;
+            }
+        }
+    }
+
+    return ret;
+}
+
+
+// Allocate a new pointer with am_alloc and share it with all valid peers.
+//   msg       : short label used only in the DB_MEM trace line.
+//   sizeBytes : number of bytes to allocate.
+//   ctx       : context whose device performs the allocation and whose peers get access.
+//   amFlags   : flags passed to hc::am_alloc.
+//   hipFlags  : user allocation flags recorded in the memory tracker by sharePtr.
+// Returns null-ptr if a memory error occurs (either allocation or sharing).
+void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsigned amFlags, unsigned hipFlags)
+{
+    auto device = ctx->getWriteableDevice();
+
+    void *ptr = hc::am_alloc(sizeBytes, device->_acc, amFlags);
+    tprintf(DB_MEM, " alloc %s ptr:%p size:%zu on dev:%d\n",
+            msg, ptr, sizeBytes, device->_deviceId);
+
+    if (ptr != nullptr) {
+        if (sharePtr(ptr, ctx, hipFlags) != 0) {
+            // Sharing failed and the caller only ever sees null, so release the
+            // allocation here; otherwise the memory would be leaked.
+            hc::am_free(ptr);
+            ptr = nullptr;
+        }
+    }
+
+    return ptr;
+}
+
+
+} // end namespace hip_internal
+
 //-------------------------------------------------------------------------------------------------
 //-------------------------------------------------------------------------------------------------
 // Memory
 //
 //
+//
+//HIP uses several "app*" fields in the HC memory tracker to track state necessary for the HIP API.
+//_appId : DeviceID.  For device mem, this is device where the memory is physically allocated.
+//         For host or registered mem, this is the current device when the memory is allocated or registered.  This device will have a GPUVM mapping for the host mem.
+//
+//_appAllocationFlags : These are flags provided by the user when allocation is performed. They are returned to user in hipHostGetFlags and other APIs.
+// TODO - add more info here when available.
+//
 hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr)
 {
    HIP_INIT_API(attributes, ptr);
@@ -78,6 +176,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr)
    return ihipLogStatus(e);
 }

+
 hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsigned flags)
 {
    HIP_INIT_API(devicePointer, hostPointer, flags);
@@ -102,6 +201,7 @@ hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsi
    return ihipLogStatus(e);
 }

+
 hipError_t hipMalloc(void** ptr, size_t sizeBytes)
 {
    HIP_INIT_API(ptr, sizeBytes);
@@ -118,37 +218,8 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)

    if (ctx) {
        auto device = ctx->getWriteableDevice();
-        const unsigned am_flags = 0;
-        *ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags);
+        *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx,  0/*amFlags*/, 0/*hipFlags*/);

-
-        if (sizeBytes && (*ptr == NULL)) {
-            hip_status = hipErrorMemoryAllocation;
-        } else {
-            hc::am_memtracker_update(*ptr, device->_deviceId, 0);
-            int peerCnt=0;
-            {
-                LockedAccessor_CtxCrit_t crit(ctx->criticalData());
-                // the peerCnt always stores self so make sure the trace actually
-                peerCnt = crit->peerCnt();
-                tprintf(DB_MEM, " allocated device_mem ptr:%p size:%zu on dev:%d and allow access to %d other peer(s)\n",
-                        *ptr, sizeBytes, device->_deviceId, peerCnt-1);
-                if (peerCnt > 1) {
-
-                    //printf ("peer self access\n");
-
-                    // TODOD - remove me:
-                    for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) {
-                        tprintf (DB_MEM, "   allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":"");
-                    };
-
-                    hsa_status_t e = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr);
-                    if (e != HSA_STATUS_SUCCESS) {
-                        hip_status = hipErrorMemoryAllocation;
-                    }
-                }
-            }
-        }
    } else {
        hip_status = hipErrorMemoryAllocation;
    }
@@ -188,54 +259,36 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
        }
        else {
            auto device = ctx->getWriteableDevice();
-            if(HIP_COHERENT_HOST_ALLOC){
-                // Force to allocate finedgrained system memory
-                *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned);
-                if(sizeBytes < 1 && (*ptr == NULL)){
-                    hip_status = hipErrorMemoryAllocation;
-                } else {
-                    hc::am_memtracker_update(*ptr, device->_deviceId, amHostCoherent);
-                }
-                tprintf(DB_MEM, " %s: finegrained system memory ptr=%p\n", __func__, *ptr);
-             }
-            else{
-                // TODO - am_alloc requires writeable __acc, perhaps could be refactored?
-                // TODO - hipHostMallocMapped is be ignored on ROCM - all memory is mapped to host address space as WC.
-                *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned);
-                if (*ptr == NULL) {
-                    hip_status = hipErrorMemoryAllocation;
-                } else {
-                    hc::am_memtracker_update(*ptr, device->_deviceId, flags);
-                    // TODO-hipHostMallocPortable should map the host memory into all contexts, regardless of peer status.
-                    int peerCnt=0;
-                    {
-                        LockedAccessor_CtxCrit_t crit(ctx->criticalData());
-                        peerCnt = crit->peerCnt();
-                        if (peerCnt > 1) {
-                            hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr);
-                        }
-                    }
-                    tprintf(DB_MEM, "allocated pinned_host ptr:%p size:%zu on dev:%d and allow access to %d other peer(s)\n", *ptr, sizeBytes, device->_deviceId, peerCnt-1);
-                }
-            }
+            unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned;
+
+            *ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? "finegrained_host":"pinned_host", 
+                                                  sizeBytes, ctx, amFlags, flags);
+            if(sizeBytes  && (*ptr == NULL)){
+                hip_status = hipErrorMemoryAllocation;
+            } 
        }
    }
+
    if (HIP_SYNC_HOST_ALLOC) {
        hipDeviceSynchronize();
    }
    return ihipLogStatus(hip_status);
 }

+// Deprecated function:
 hipError_t hipMallocHost(void** ptr, size_t sizeBytes)
 {
    return hipHostMalloc(ptr, sizeBytes, 0);
 }

+
+// Deprecated function:
 hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags)
 {
    return hipHostMalloc(ptr, sizeBytes, flags);
 };

+
 // width in bytes
 hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height)
 {
@@ -257,22 +310,11 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height
        auto device = ctx->getWriteableDevice();

        const unsigned am_flags = 0;
-        *ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags);
+        *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, am_flags, 0);

        if (sizeBytes && (*ptr == NULL)) {
            hip_status = hipErrorMemoryAllocation;
-        } else {
-            hc::am_memtracker_update(*ptr, device->_deviceId, 0);
-            {
-                LockedAccessor_CtxCrit_t crit(ctx->criticalData());
-                if (crit->peerCnt() > 1) { // peerCnt includes self so only call allow_access if other peers involved:
-                    hsa_status_t hsa_status = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr);
-                    if (hsa_status != HSA_STATUS_SUCCESS) {
-                        hip_status = hipErrorMemoryAllocation;
-                    }
-                }
-            }
-        }
+        } 
    } else {
        hip_status = hipErrorMemoryAllocation;
    }
@@ -306,41 +348,31 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
    void ** ptr = &array[0]->data;

    if (ctx) {
-        auto device = ctx->getWriteableDevice();
        const unsigned am_flags = 0;
        const size_t size = width*height;

+        size_t allocSize = 0;
        switch(desc->f) {
            case hipChannelFormatKindSigned:
-                *ptr = hc::am_alloc(size*sizeof(int), device->_acc, am_flags);
+                allocSize = size * sizeof(int);
                break;
            case hipChannelFormatKindUnsigned:
-                *ptr = hc::am_alloc(size*sizeof(unsigned int), device->_acc, am_flags);
+                allocSize = size * sizeof(unsigned int);
                break;
            case hipChannelFormatKindFloat:
-                *ptr = hc::am_alloc(size*sizeof(float), device->_acc, am_flags);
+                allocSize = size * sizeof(float);
                break;
            case hipChannelFormatKindNone:
-                *ptr = hc::am_alloc(size*sizeof(size_t), device->_acc, am_flags);
+                allocSize = size * sizeof(size_t);
                break;
            default:
                hip_status = hipErrorUnknown;
                break;
        }
+        *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, am_flags, 0);
        if (size && (*ptr == NULL)) {
            hip_status = hipErrorMemoryAllocation;
-        } else {
-            hc::am_memtracker_update(*ptr, device->_deviceId, 0);
-            {
-                LockedAccessor_CtxCrit_t crit(ctx->criticalData());
-                if (crit->peerCnt() > 1) { // peerCnt includes self so only call allow_access if other peers involved:
-                    hsa_status_t hsa_status = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr);
-                    if (hsa_status != HSA_STATUS_SUCCESS) {
-                        hip_status = hipErrorMemoryAllocation;
-                    }
-                }
-            }
-        }
+        } 

    } else {
        hip_status = hipErrorMemoryAllocation;
@@ -373,6 +405,8 @@ hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr)
    return ihipLogStatus(hip_status);
 }

+
+// TODO - need to fix several issues here related to P2P access, host memory fallback.
 hipError_t hipHostRegister(void *hostPtr, size_t sizeBytes, unsigned int flags)
 {
    HIP_INIT_API(hostPtr, sizeBytes, flags);
@@ -392,19 +426,21 @@ hipError_t hipHostRegister(void *hostPtr, size_t sizeBytes, unsigned int flags)
        hip_status = hipErrorHostMemoryAlreadyRegistered;
    } else {
        auto ctx = ihipGetTlsDefaultCtx();
-        if(hostPtr == NULL){
+        if (hostPtr == NULL) {
            return ihipLogStatus(hipErrorInvalidValue);
        }
+        //TODO-test : multi-gpu access to registered host memory.
        if (ctx) {
-            auto device = ctx->getWriteableDevice();
            if(flags == hipHostRegisterDefault || flags == hipHostRegisterPortable || flags == hipHostRegisterMapped){
+                auto device = ctx->getWriteableDevice();
                std::vector<hc::accelerator>vecAcc;
                for(int i=0;i<g_deviceCnt;i++){
                    vecAcc.push_back(ihipGetDevice(i)->_acc);
                }
                am_status = hc::am_memory_host_lock(device->_acc, hostPtr, sizeBytes, &vecAcc[0], vecAcc.size());
+                hc::am_memtracker_update(hostPtr, device->_deviceId, flags);

-                tprintf(DB_MEM, " %s registered ptr=%p\n", __func__, hostPtr);
+                tprintf(DB_MEM, " %s registered ptr=%p and allowed access to %zu peers\n", __func__, hostPtr, vecAcc.size());
                if(am_status == AM_SUCCESS){
                    hip_status = hipSuccess;
                } else {
@@ -603,6 +639,7 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind
    return ihipLogStatus(e);
 }

+
 hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes)
 {
    HIP_INIT_CMD_API(dst, src, sizeBytes);
@@ -624,6 +661,7 @@ hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes)
    return ihipLogStatus(e);
 }

+
 hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes)
 {
    HIP_INIT_CMD_API(dst, src, sizeBytes);
@@ -645,6 +683,7 @@ hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes)
    return ihipLogStatus(e);
 }

+
 hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes)
 {
    HIP_INIT_CMD_API(dst, src, sizeBytes);
@@ -666,6 +705,7 @@ hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeByte
    return ihipLogStatus(e);
 }

+
 hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes)
 {
    HIP_INIT_CMD_API(dst, src, sizeBytes);
@@ -689,32 +729,6 @@ hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes)



-// Internal copy sync:
-namespace hip_internal {
-
-hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
-{
-    hipError_t e = hipSuccess;
-
-    stream = ihipSyncAndResolveStream(stream);
-
-
-    if ((dst == NULL) || (src == NULL)) {
-        e= hipErrorInvalidValue;
-    } else if (stream) {
-        try {
-            stream->locked_copyAsync(dst, src, sizeBytes, kind);
-        }
-        catch (ihipException ex) {
-            e = ex._code;
-        }
-    } else {
-        e = hipErrorInvalidValue;
-    }
-
-    return e;
-}
-} // end namespace hip_internal


 hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
@@ -990,6 +1004,7 @@ hipError_t hipMemset(void* dst, int  value, size_t sizeBytes )
    return ihipLogStatus(e);
 }

+
 hipError_t hipMemGetInfo  (size_t *free, size_t *total)
 {
    HIP_INIT_API(free, total);
@@ -1024,6 +1039,28 @@ hipError_t hipMemGetInfo  (size_t *free, size_t *total)
    return ihipLogStatus(e);
 }

+// Look up ptr in the HC memory tracker and store the tracked size in *size.
+// Returns hipErrorInvalidValue when ptr or size is null, or when the tracker lookup
+// does not succeed; returns hipSuccess otherwise.
+hipError_t hipMemPtrGetInfo(void *ptr, size_t *size)
+{
+    HIP_INIT_API(ptr, size);
+
+    // Assume failure; only a successful tracker lookup upgrades the status.
+    hipError_t status = hipErrorInvalidValue;
+
+    if ((ptr != nullptr) && (size != nullptr)) {
+        hc::accelerator acc;
+        hc::AmPointerInfo info(NULL, NULL, 0, acc, 0, 0);
+        if (hc::am_memtracker_getinfo(&info, ptr) == AM_SUCCESS) {
+            *size = info._sizeBytes;
+            status = hipSuccess;
+        }
+    }
+
+    return ihipLogStatus(status);
+}
+
+
 hipError_t hipFree(void* ptr)
 {
    HIP_INIT_API(ptr);
@@ -1051,6 +1088,7 @@ hipError_t hipFree(void* ptr)
    return ihipLogStatus(hipStatus);
 }

+
 hipError_t hipHostFree(void* ptr)
 {
    HIP_INIT_API(ptr);
@@ -1122,7 +1160,7 @@ hipError_t hipMemGetAddressRange ( hipDeviceptr_t* pbase, size_t* psize, hipDevi
    }
    else
        hipStatus = hipErrorInvalidDevicePointer;
-    return hipStatus;
+    return ihipLogStatus(hipStatus);
 }


@@ -1141,25 +1179,25 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){
    }
    else
        hipStatus = hipErrorInvalidResourceHandle;
-
+    ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) handle;
    // Save the size of the pointer to hipIpcMemHandle
-    (*handle)->psize = psize;
+    iHandle->psize = psize;

 #if USE_IPC
    // Create HSA ipc memory
    hsa_status_t hsa_status =
-        hsa_amd_ipc_memory_create(devPtr, psize, &(*handle)->ipc_handle);
+        hsa_amd_ipc_memory_create(devPtr, psize, (hsa_amd_ipc_memory_t*) &(iHandle->ipc_handle));
    if(hsa_status!= HSA_STATUS_SUCCESS)
        hipStatus = hipErrorMemoryAllocation;
 #else
    hipStatus = hipErrorRuntimeOther;
 #endif

-    return hipStatus;
+    return ihipLogStatus(hipStatus);
 }

 hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned int flags){
-// HIP_INIT_API ( devPtr, handle.handle , flags);
+    HIP_INIT_API ( devPtr, &handle , flags);
    hipError_t hipStatus = hipSuccess;

 #if USE_IPC
@@ -1169,15 +1207,16 @@ hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned
    if(!agent)
        return hipErrorInvalidResourceHandle;

+    ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) &handle;
    //Attach ipc memory
    hsa_status_t hsa_status =
-        hsa_amd_ipc_memory_attach(&handle->ipc_handle, handle->psize, 1, agent, devPtr);
+        hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, 1, agent, devPtr);
    if(hsa_status != HSA_STATUS_SUCCESS)
        hipStatus = hipErrorMapBufferObjectFailed;
 #else
    hipStatus = hipErrorRuntimeOther;
 #endif
-    return hipStatus;
+    return ihipLogStatus(hipStatus);
 }

 hipError_t hipIpcCloseMemHandle(void *devPtr){
@@ -1192,7 +1231,7 @@ hipError_t hipIpcCloseMemHandle(void *devPtr){
 #else
    hipStatus = hipErrorRuntimeOther;
 #endif
-    return hipStatus;
+    return ihipLogStatus(hipStatus);
 }

 // hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle){
@@ -218,31 +218,33 @@ hipError_t hipModuleUnload(hipModule_t hmod)
 		{
 				ret = hipErrorInvalidValue;
 		}
-    for(std::list<hipFunction_t>::iterator f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) {
+    for(auto f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) {
      delete *f;
    }
    delete hmod;
    return ihipLogStatus(ret);
 }

-hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char *name){
+
+hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char *name)
+{
    auto ctx = ihipGetTlsDefaultCtx();
    hipError_t ret = hipSuccess;

-    if(name == nullptr){
+    if (name == nullptr){
        return ihipLogStatus(hipErrorInvalidValue);
    }

-    if(ctx == nullptr){
+    if (ctx == nullptr){
        ret = hipErrorInvalidContext;

-    }else{
+    } else {
        std::string str(name);
-        for(std::list<hipFunction_t>::iterator f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) {
-          if((*f)->_name == str) {
-            *func = *f;
-          }
-          return ret;
+        for(auto f = hmod->funcTrack.begin(); f != hmod->funcTrack.end(); ++f) {
+            if((*f)->_name == str) {
+                *func = *f;
+                return ret;
+            }
        }
        ihipModuleSymbol_t *sym = new ihipModuleSymbol_t;
        int deviceId = ctx->getDevice()->_deviceId;
@@ -46,7 +46,7 @@ __device__ float asinhf(float x)
 }
 __device__ float atan2f(float y, float x)
 {
-    return hc::precise_math::atan2f(x, y);
+    return hc::precise_math::atan2f(y, x);
 }
 __device__ float atanf(float x)
 {
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -39,22 +39,67 @@ int main(){
 	const size_t size = N * sizeof(float);
 	A = (float*)malloc(size);
 	HIPCHECK(hipHostRegister(A, size, 0));
+
+
 	for(int i=0;i<N;i++){
 		A[i] = float(1);
 	}
+
+
 	for(int i=0;i<num_devices;i++){
-	HIPCHECK(hipSetDevice(i));
-	HIPCHECK(hipHostGetDevicePointer((void**)&Ad[i], A, 0));
+        HIPCHECK(hipSetDevice(i));
+        HIPCHECK(hipHostGetDevicePointer((void**)&Ad[i], A, 0));
 	}

+    // Reference the registered device pointer Ad from inside the kernel:
 	for(int i=0;i<num_devices;i++){
-	HIPCHECK(hipSetDevice(i));
-	hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/512), dim3(512), 0, 0, Ad[i]);
-
-	HIPCHECK(hipDeviceSynchronize());
+        HIPCHECK(hipSetDevice(i));
+        hipLaunchKernel(Inc, dim3(N/512), dim3(512), 0, 0, Ad[i]);

+        HIPCHECK(hipDeviceSynchronize());
 	}
 	HIPASSERT(A[10] == 1.0f + float(num_devices));
+
+
+    { 
+        // Sensitize HIP bug if device does not match where the memory was registered.
+        HIPCHECK(hipSetDevice(0));
+
+
+
+        // Copy to B, this should be optimal pinned malloc copy:
+        // Note we are using the host pointer here:
+        float *Bh, *Bd;
+        Bh = (float*)malloc(size);
+        HIPCHECK(hipMalloc(&Bd, size));
+        HIPCHECK(hipMemset(Bd, 13.0f, size));
+
+        for(int i=0;i<N;i++){
+            A[i] = float(i);
+            Bh[i] = 0.0f;
+        }
+
+        HIPCHECK(hipMemcpy(Bd, A, size,  hipMemcpyHostToDevice));
+
+        HIPCHECK(hipMemcpy(Bh, Bd, size, hipMemcpyDeviceToHost));
+
+#if 0
+        //TODO - check is disabled until the HCC patch for registered/locked memory using device pointers is merged.
+        for(int i=0;i<N;i++){
+            if (Bh[i] != A[i]) {
+                printf ("mismatch at Bh[%d]=%f, A[%d]=%f\n", i, Bh[i], i, A[i]);
+                failed("mismatch");
+            };
+        }
+#endif
+
+
+
+        // Make sure the copy worked
+    }
+
+
+
 	HIPCHECK(hipHostUnregister(A));
 	passed();
 }
@@ -0,0 +1,52 @@
+/*
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
+ * RUN: %t
+ * HIT_END
+ */
+
+#include"test_common.h"
+
+// Simple aggregate used as the pointee type for one of the device allocations below.
+// Declared as a named type (not a variable of an unnamed struct) so that
+// "struct Struct" refers to this definition.
+struct Struct {
+  float a;
+  int b;
+  void *c;
+};
+
+// Allocate three device buffers of different pointee types and verify that
+// hipMemPtrGetInfo reports the size each one was allocated with.
+int main(){
+  int *iPtr;
+  float *fPtr;
+  struct Struct *sPtr;
+  const size_t sSetSize = 1024;
+  size_t sGetSize = 0;
+
+  // Check every HIP call so a failed allocation or query fails the test
+  // instead of being silently ignored.
+  HIPCHECK(hipMalloc(&iPtr, sSetSize));
+  HIPCHECK(hipMalloc(&fPtr, sSetSize));
+  HIPCHECK(hipMalloc(&sPtr, sSetSize));
+
+  // sGetSize is reset before each query so a stale value from the previous
+  // query cannot satisfy the comparison.
+  sGetSize = 0;
+  HIPCHECK(hipMemPtrGetInfo(iPtr, &sGetSize));
+  HIPASSERT(sGetSize == sSetSize);
+
+  sGetSize = 0;
+  HIPCHECK(hipMemPtrGetInfo(fPtr, &sGetSize));
+  HIPASSERT(sGetSize == sSetSize);
+
+  sGetSize = 0;
+  HIPCHECK(hipMemPtrGetInfo(sPtr, &sGetSize));
+  HIPASSERT(sGetSize == sSetSize);
+
+  // Release the device allocations before reporting success.
+  HIPCHECK(hipFree(iPtr));
+  HIPCHECK(hipFree(fPtr));
+  HIPCHECK(hipFree(sPtr));
+
+  passed();
+}