Merge branch 'amd-develop' into amd-master

Change-Id: Ib589b5e7032dce5f79dbba493aaac30c2c1a4137
2017-06-28 09:20:59 +05:30
@@ -2,12 +2,6 @@

 We have attempted to document known bugs and limitations - in particular the [HIP Kernel Language](docs/markdown/hip_kernel_language.md) document uses the phrase "Under Development", and the [HIP Runtime API bug list](http://gpuopen-professionalcompute-tools.github.io/HIP/bug.html) lists known bugs. 

-Upcoming:
- Stability: Enforce periodic host synchronization to reclaim resources if the application has launched a large
-  number of commands (>1K) without synchronizing.  
- Register keyword now silently ignored on HCC (previously would emit warning).
- Doc updates: Add some more frequently asked questions to FAQ, fix TOC in some files, review.
- Cookbook.

 ===================================================================================================

@@ -553,24 +553,51 @@

 |   **CUDA**                                                |   **HIP**                     | **CUDA description**                                                                                                           |
 |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
-
+| `cuMemAdvise`                                             |                               | Advise about the usage of a given memory range.                                                                                |
+| `cuMemPrefetchAsync`                                      |                               | Prefetches memory to the specified destination device.                                                                         |
+| `cuMemRangeGetAttribute`                                  |                               | Query an attribute of a given memory range.                                                                                    |
+| `cuMemRangeGetAttributes`                                 |                               | Query attributes of a given memory range.                                                                                      |
+| `cuPointerGetAttribute`                                   |                               | Returns information about a pointer.                                                                                           |
+| `cuPointerGetAttributes`                                  |                               | Returns information about a pointer.                                                                                           |
+| `cuPointerSetAttribute`                                   |                               | Set attributes on a previously allocated memory region.                                                                        |

 ## **13. Stream Management**

 |   **CUDA**                                                |   **HIP**                     | **CUDA description**                                                                                                           |
 |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
-
+| `cuStreamAddCallback`                                     |                               | Add a callback to a compute stream.                                                                                            |
+| `cuStreamAttachMemAsync`                                  |                               | Attach memory to a stream asynchronously.                                                                                      |
+| `cuStreamCreate`                                          |                               | Create a stream.                                                                                                               |
+| `cuStreamCreateWithPriority`                              |                               | Create a stream with the given priority.                                                                                       |
+| `cuStreamDestroy`                                         | `hipStreamDestroy`            | Destroys a stream.                                                                                                             |
+| `cuStreamGetFlags`                                        | `hipStreamGetFlags`           | Query the flags of a given stream.                                                                                             |
+| `cuStreamGetPriority`                                     | `hipStreamGetPriority`        | Query the priority of a given stream.                                                                                          |
+| `cuStreamQuery`                                           | `hipStreamQuery`              | Determine status of a compute stream.                                                                                          |
+| `cuStreamSynchronize`                                     | `hipStreamSynchronize`        | Wait until a stream's tasks are completed.                                                                                     |
+| `cuStreamWaitEvent`                                       | `hipStreamWaitEvent`          | Make a compute stream wait on an event.                                                                                        |
+| `cuStreamBatchMemOp`                                      |                               | Batch operations to synchronize the stream via memory operations.                                                              |
+| `cuStreamWaitValue32`                                     |                               | Wait on a memory location.                                                                                                     |
+| `cuStreamWriteValue32`                                    |                               | Write a value to memory.                                                                                                       |

 ## **14. Event Management**

 |   **CUDA**                                                |   **HIP**                     | **CUDA description**                                                                                                           |
 |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
-
+| `cuEventCreate`                                           | `hipEventCreate`              | Creates an event.                                                                                                              |
+| `cuEventDestroy`                                          | `hipEventDestroy`             | Destroys an event.                                                                                                             |
+| `cuEventElapsedTime`                                      | `hipEventElapsedTime`         | Computes the elapsed time between two events.                                                                                  |
+| `cuEventQuery`                                            | `hipEventQuery`               | Queries an event's status.                                                                                                     |
+| `cuEventRecord`                                           | `hipEventRecord`              | Records an event.                                                                                                              |
+| `cuEventSynchronize`                                      | `hipEventSynchronize`         | Waits for an event to complete.                                                                                                |

 ## **15. Execution Control**

 |   **CUDA**                                                |   **HIP**                     | **CUDA description**                                                                                                           |
 |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
+| `cuFuncGetAttribute`                                      |                               | Returns information about a function.                                                                                          |
+| `cuFuncSetCacheConfig`                                    | `hipFuncSetCacheConfig`       | Sets the preferred cache configuration for a device function.                                                                  |
+| `cuFuncSetSharedMemConfig`                                |                               | Sets the shared memory configuration for a device function.                                                                    |
+| `cuLaunchKernel`                                          | `hipModuleLaunchKernel`       | Launches a CUDA function.                                                                                                      |


 ## **16. Execution Control [DEPRECATED]**
@@ -70,7 +70,6 @@ See the [API Support Table](CUDA_Runtime_API_functions_supported_by_HIP.md) for
  - printf
  - assert
  - `__restrict__`
-  - `__launch_bounds__`
  - `__threadfence*_`, `__syncthreads*`
  - Unbounded loop unroll

@@ -610,30 +610,59 @@ Device-side dynamic global memory allocation is under development.  HIP now incl
 implementation of malloc and free that can be called from device functions.

 ## `__launch_bounds__`
-GPU multiprocessors have a fixed pool of resources (primarily registers and shared memory) that are shared among the active warps. Using more resources can increase the kernel’s IPC, but it reduces the resources available for other warps and limits the number of warps that can run simultaneously. Thus, GPUs exhibit a complex relationship between resource usage and performance. `__launch_bounds__` allows the application to provide usage hints that influence the resources (primarily registers) employed by the generated code. It’s a function attribute that must be attached to a `__global__` function:
+
+
+GPU multiprocessors have a fixed pool of resources (primarily registers and shared memory) which are shared by the actively running warps. Using more resources can increase IPC of the kernel but reduces the resources available for other warps and limits the number of warps that can be simulaneously running. Thus GPUs have a complex relationship between resource usage and performance.  
+
+__hip_launch_bounds__ allows the application to provide usage hints that influence the resources (primarily registers) used by the generated code.
+__hip_launch_bounds__ is a function attribute that must be attached to a __global__ function:

 ```
-__global__ void
-`__launch_bounds__`(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)  
+__global__ void `__launch_bounds__`(MAX_THREADS_PER_BLOCK, MIN_WARPS_PER_EU) MyKernel(...) ...
 MyKernel(hipGridLaunch lp, ...) 
 ...
 ```

-`__launch_bounds__` supports two parameters:
+__launch_bounds__ supports two parameters:
+- MAX_THREADS_PER_BLOCK - The programmers guarantees that kernel will be launched with threads less than MAX_THREADS_PER_BLOCK. (On NVCC this maps to the .maxntid PTX directive). If no launch_bounds is specified, MAX_THREADS_PER_BLOCK is the maximum block size supported by the device (typically 1024 or larger). Specifying MAX_THREADS_PER_BLOCK less than the maximum effectively allows the compiler to use more resources than a default unconstrained compilation that supports all possible block sizes at launch time.
+The threads-per-block is the product of (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z).
+- MIN_WARPS_PER_EU - directs the compiler to minimize resource usage so that the requested number of warps can be simultaneously active on a multi-processor. Since active warps compete for the same fixed pool of resources, the compiler must reduce resources required by each warp(primarily registers). MIN_WARPS_PER_EU is optional and defaults to 1 if not specified. Specifying a MIN_WARPS_PER_EU greater than the default 1 effectively constrains the compiler's resource usage.

- **requiredMaxThreadsPerBlock**---the programmer guarantees that the kernel will launch with threadsPerBlock less than requiredMaxThreadsPerBlock. (In nvcc, this parameter maps to the _.maxntid_ PTX directive; in hcc, it maps to the HSAIL _requiredworkgroupsize_ directive.) If launch_bounds is unspecified, requiredMaxThreadsPerBlock is the maximum block size that the device supports (typically 1,024 or larger). Specifying requiredMaxThreadsPerBlock less than the maximum effectively allows the compiler to use more resources than a default unconstrained compilation supporting all possible block sizes at launch time. The threadsPerBlock value is the product hipBlockDim_x * hipBlockDim_y * hipBlockDim_z.
- **minBlocksPerMultiprocessor**---directs the compiler to minimize resource usage so that the requested number of blocks can be simultaneously active on a multiprocessor. Because active blocks compete for the same fixed resource pool, the compiler must reduce the resource requirements of each block (primarily registers). minBlocksPerMultiprocessor is optional and defaults to 1 if unspecified. Selecting a minBlocksPerMultiprocessor value greater than 1 effectively constrains the compiler's resource usage. 
+### Compiler Impact
+The compiler uses these parameters as follows:
+- The compiler uses the hints only to manage register usage, and does not automatically reduce shared memory or other resources.
+- Compilation fails if compiler cannot generate a kernel which meets the requirements of the specified launch bounds.
+- From MAX_THREADS_PER_BLOCK, the compiler derives the maximum number of warps/block that can be used at launch time.
+Values of MAX_THREADS_PER_BLOCK less than the default allows the compiler to use a larger pool of registers : each warp uses registers, and this hint constains the launch to a warps/block size which is less than maximum.
+- From MIN_WARPS_PER_EU, the compiler derives a maximum number of registers that can be used by the kernel (to meet the required #simultaneous active blocks).
+If MIN_WARPS_PER_EU is 1, then the kernel can use all registers supported by the multiprocessor.
+- The compiler ensures that the registers used in the kernel is less than both allowed maximums, typically by spilling registers (to shared or global memory), or by using more instructions.
+- The compiler may use hueristics to increase register usage, or may simply be able to avoid spilling. The MAX_THREADS_PER_BLOCK is particularly useful in this cases, since it allows the compiler to use more registers and avoid situations where the compiler constrains the register usage (potentially spilling) to meet the requirements of a large block size that is never used at launch time.

-The compiler uses these two parameters as follows:
- It employs the hints only to manage register usage and does not automatically reduce shared memory or other resources.
- Compilation fails if the compiler cannot generate a kernel that meets the requirements of the specified launch bounds.
- From requiredMaxThreadsPerBlock, the compiler derives the maximum number of warps per block that are usable at launch time. Values less than the default allow the compiler to use a larger register pool: each warp uses registers, and this hint constrains the launch to a warps-per-block size less than maximum. 
- From minBlocksPerMultiprocessor, the compiler derives a maximum number of registers that the kernel can use (to meet the required number of simultaneously active blocks). If the value is 1, the kernel can use all registers supported by the multiprocessor.  

-The compiler ensures that the kernel uses fewer registers than both allowed maxima specify, typically by spilling to shared memory or using more instructions. It may use heuristics to increase register usage or may simply be able to avoid spilling. The requiredMaxThreadsPerBlock parameter is particularly useful in this case, since it allows the compiler to use more registers---avoiding situations where the compiler constrains the register usage (potentially spilling) to meet the requirements of a large block size never sees use at launch time.  
+### CU and EU Definitions
+A compute unit (CU) is responsible for executing the waves of a work-group. It is composed of one or more execution units (EU) which are responsible for executing waves. An EU can have enough resources to maintain the state of more than one executing wave. This allows an EU to hide latency by switching between waves in a similar way to symmetric multithreading on a CPU. In order to allow the state for multiple waves to fit on an EU, the resources used by a single wave have to be limited. Limiting such resources can allow greater latency hiding, but can result in having to spill some register state to memory. This attribute allows an advanced developer to tune the number of waves that are capable of fitting within the resources of an EU. It can be used to ensure at least a certain number will fit to help hide latency, and can also be used to ensure no more than a certain number will fit to limit cache thrashing.
+ 
+### Porting from CUDA __launch_bounds
+CUDA defines a __launch_bounds which is also designed to control occupancy:
+```
+__launch_bounds(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MULTIPROCESSOR)
+```

-HIP/hcc will parse the `launch_bounds` attribute but silently ignores the performance hint. Full support is under development.
+- The second parameter __launch_bounds parameters must be converted to the format used __hip_launch_bounds, which uses warps and execution-units rather than blocks and multi-processors  ( This conversion is performed automatically by the clang hipify tools.)
+```
+MIN_WARPS_PER_EXECUTION_UNIT = (MIN_BLOCKS_PER_MULTIPROCESSOR * MAX_THREADS_PER_BLOCK) / 32
+```

+The key differences in the interface are:
+- Warps (rather than blocks):
+The developer is trying to tell the compiler to control resource utilization to guarantee some amount of active Warps/EU for latency hiding.  Specifying active warps in terms of blocks appears to hide the micro-architectural details of the warp size, but makes the interface more confusing since the developer ultimately needs to compute the number of warps to obtain the desired level of control. 
+- Execution Units  (rather than multiProcessor):
+The use of execution units rather than multiprocessors provides support for architectures with multiple execution units/multi-processor. For example, the AMD GCN architecture has 4 execution units per multiProcessor.  The hipDeviceProps has a field executionUnitsPerMultiprocessor.
+Platform-specific coding techniques such as #ifdef can be used to specify different launch_bounds for NVCC and HCC platforms, if desired. 
+
+
+### maxregcount
 Unlike nvcc, hcc does not support the "--maxregcount" option.  Instead, users are encouraged to use the hip_launch_bounds directive since the parameters are more intuitive and portable than
 micro-architecture details like registers, and also the directive allows per-kernel control rather than an entire file.  hip_launch_bounds works on both hcc and nvcc targets.

@@ -268,9 +268,12 @@ PASSED!
 ```

 HIP_TRACE_API supports multiple levels of debug information:
-   - 0x1 = print all HIP APIs
-   - 0x2 = print HIP APIs which initiate GPU kernels, copies, or memsets.  Includes hipLaunchKernel, hipMemcpy*, hipMemset*.
-   - 0x4 = print HIP APIs which allocate or free memory.  Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree.
+   - 0x1 = print all HIP APIs.  This is the most verbose setting; the flags below allow selecting a subset.
+   - 0x2 = print HIP APIs which initiate GPU kernel commands.  Includes hipLaunchKernel, hipLaunchModuleKernel
+   - 0x4 = print HIP APIs which initiate GPU memory commands.  Includes hipMemcpy*, hipMemset*.
+   - 0x8 = print HIP APIs which allocate or free memory.  Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree.
+
+These can be combined.  For example, HIP_TRACE_API=6 shows a concise view of the HIP commands (both kernel and memory) that are sent to the GPU.


 #### Color
@@ -70,7 +70,7 @@ To set additional options like Language Selection (only "-x cuda" is supported),

 Delimiter "--" is used to separate hipify-clang options (before the delimiter) from clang options (after the delimiter). It is strongly recommended to always specify the delimiter, even if there are no clang specific options at all, in order to avoid possible errors regarding compilation database; in such case delimeter should be the last option in hipify-clang's command line.

-Option "-x clang" is also worth specifying in order to convert source CUDA files with extensions other than standard extensions (*.cu, *.cuh).
+Option "-x cuda" is also worth specifying in order to convert source CUDA files with extensions other than standard extensions (*.cu, *.cuh).

 ## Disclaimer

@@ -987,20 +987,26 @@ struct cuda2hipMap {
    cuda2hipRename["cuEventRecord"]                             = {"hipEventRecord", CONV_EVENT, API_DRIVER};
    cuda2hipRename["cuEventSynchronize"]                        = {"hipEventSynchronize", CONV_EVENT, API_DRIVER};

+    // Execution Control
+    cuda2hipRename["cuFuncGetAttribute"]                        = {"hipFuncGetAttribute", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["cuFuncSetCacheConfig"]                      = {"hipFuncSetCacheConfig", CONV_MODULE, API_DRIVER};
+    cuda2hipRename["cuFuncSetSharedMemConfig"]                  = {"hipFuncSetSharedMemConfig", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED};
    cuda2hipRename["cuLaunchKernel"]                            = {"hipModuleLaunchKernel", CONV_MODULE, API_DRIVER};

    // Streams
-    // unsupported yet by HIP
    cuda2hipRename["cuStreamAddCallback"]                       = {"hipStreamAddCallback", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["cuStreamWaitValue32"]                       = {"hipStreamWaitValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE
-    cuda2hipRename["cuStreamWriteValue32"]                      = {"hipStreamWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE
-    cuda2hipRename["cuStreamBatchMemOp"]                        = {"hipStreamBatchMemOp", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE
-
-    cuda2hipRename["cuStreamCreate"]                            = {"hipStreamCreate", CONV_STREAM, API_DRIVER};
+    cuda2hipRename["cuStreamAttachMemAsync"]                    = {"hipStreamAttachMemAsync", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["cuStreamCreate"]                            = {"hipStreamCreate__", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};      // Not equal to cudaStreamCreate due to different signatures
+    cuda2hipRename["cuStreamCreateWithPriority"]                = {"hipStreamCreateWithPriority", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
    cuda2hipRename["cuStreamDestroy_v2"]                        = {"hipStreamDestroy", CONV_STREAM, API_DRIVER};
+    cuda2hipRename["cuStreamGetFlags"]                          = {"hipStreamGetFlags", CONV_STREAM, API_DRIVER};
+    cuda2hipRename["cuStreamGetPriority"]                       = {"hipStreamGetPriority", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
    cuda2hipRename["cuStreamQuery"]                             = {"hipStreamQuery", CONV_STREAM, API_DRIVER};
    cuda2hipRename["cuStreamSynchronize"]                       = {"hipStreamSynchronize", CONV_STREAM, API_DRIVER};
    cuda2hipRename["cuStreamWaitEvent"]                         = {"hipStreamWaitEvent", CONV_STREAM, API_DRIVER};
+    cuda2hipRename["cuStreamWaitValue32"]                       = {"hipStreamWaitValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};   // [CUDA 8.0.44] // no API_Runtime ANALOGUE
+    cuda2hipRename["cuStreamWriteValue32"]                      = {"hipStreamWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};  // [CUDA 8.0.44] // no API_Runtime ANALOGUE
+    cuda2hipRename["cuStreamBatchMemOp"]                        = {"hipStreamBatchMemOp", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};    // [CUDA 8.0.44] // no API_Runtime ANALOGUE

    // Memory management
    cuda2hipRename["cuArray3DCreate"]                           = {"hipArray3DCreate", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
@@ -1016,16 +1022,16 @@ struct cuda2hipMap {
    cuda2hipRename["cuMemAlloc_v2"]                             = {"hipMalloc", CONV_MEM, API_DRIVER};
    cuda2hipRename["cuMemAllocHost"]                            = {"hipMemAllocHost", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
    cuda2hipRename["cuMemAllocManaged"]                         = {"hipMemAllocManaged", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["cuMemAllocPitch"]                           = {"hipMemAllocPitch__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};      // Not equal to cudaMemAllocPitch due to different signatures
-    cuda2hipRename["cuMemcpy"]                                  = {"hipMemcpy__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};             // Not equal to cudaMemcpy due to different signatures
-    cuda2hipRename["cuMemcpy2D"]                                = {"hipMemcpy2D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};           // Not equal to cudaMemcpy2D due to different signatures
-    cuda2hipRename["cuMemcpy2DAsync"]                           = {"hipMemcpy2DAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};      // Not equal to cudaMemcpy2DAsync due to different signatures
+    cuda2hipRename["cuMemAllocPitch"]                           = {"hipMemAllocPitch__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};        // Not equal to cudaMemAllocPitch due to different signatures
+    cuda2hipRename["cuMemcpy"]                                  = {"hipMemcpy__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};               // Not equal to cudaMemcpy due to different signatures
+    cuda2hipRename["cuMemcpy2D"]                                = {"hipMemcpy2D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};             // Not equal to cudaMemcpy2D due to different signatures
+    cuda2hipRename["cuMemcpy2DAsync"]                           = {"hipMemcpy2DAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};        // Not equal to cudaMemcpy2DAsync due to different signatures
    cuda2hipRename["cuMemcpy2DUnaligned"]                       = {"hipMemcpy2DUnaligned", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["cuMemcpy3D"]                                = {"hipMemcpy3D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};           // Not equal to cudaMemcpy3D due to different signatures
-    cuda2hipRename["cuMemcpy3DAsync"]                           = {"hipMemcpy3DAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};     // Not equal to cudaMemcpy3DAsync due to different signatures
-    cuda2hipRename["cuMemcpy3DPeer"]                            = {"hipMemcpy3DPeer__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};      // Not equal to cudaMemcpy3DPeer due to different signatures
-    cuda2hipRename["cuMemcpy3DPeerAsync"]                       = {"hipMemcpy3DPeerAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DPeerAsync due to different signatures
-    cuda2hipRename["cuMemcpyAsync"]                             = {"hipMemcpyAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};       // Not equal to cudaMemcpyAsync due to different signatures
+    cuda2hipRename["cuMemcpy3D"]                                = {"hipMemcpy3D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};             // Not equal to cudaMemcpy3D due to different signatures
+    cuda2hipRename["cuMemcpy3DAsync"]                           = {"hipMemcpy3DAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};       // Not equal to cudaMemcpy3DAsync due to different signatures
+    cuda2hipRename["cuMemcpy3DPeer"]                            = {"hipMemcpy3DPeer__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};        // Not equal to cudaMemcpy3DPeer due to different signatures
+    cuda2hipRename["cuMemcpy3DPeerAsync"]                       = {"hipMemcpy3DPeerAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};   // Not equal to cudaMemcpy3DPeerAsync due to different signatures
+    cuda2hipRename["cuMemcpyAsync"]                             = {"hipMemcpyAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};         // Not equal to cudaMemcpyAsync due to different signatures
    cuda2hipRename["cuMemcpyAtoA"]                              = {"hipMemcpyAtoA", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
    cuda2hipRename["cuMemcpyAtoD"]                              = {"hipMemcpyAtoD", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
    cuda2hipRename["cuMemcpyAtoH"]                              = {"hipMemcpyAtoH", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
@@ -1039,17 +1045,17 @@ struct cuda2hipMap {
    cuda2hipRename["cuMemcpyHtoAAsync"]                         = {"hipMemcpyHtoAAsync", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
    cuda2hipRename["cuMemcpyHtoD_v2"]                           = {"hipMemcpyHtoD", CONV_MEM, API_DRIVER};
    cuda2hipRename["cuMemcpyHtoDAsync_v2"]                      = {"hipMemcpyHtoDAsync", CONV_MEM, API_DRIVER};
-    cuda2hipRename["cuMemcpyPeerAsync"]                         = {"hipMemcpyPeerAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};    // Not equal to cudaMemcpyPeerAsync due to different signatures
-    cuda2hipRename["cuMemcpyPeer"]                              = {"hipMemcpyPeer__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};         // Not equal to cudaMemcpyPeer due to different signatures
+    cuda2hipRename["cuMemcpyPeerAsync"]                         = {"hipMemcpyPeerAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};      // Not equal to cudaMemcpyPeerAsync due to different signatures
+    cuda2hipRename["cuMemcpyPeer"]                              = {"hipMemcpyPeer__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};           // Not equal to cudaMemcpyPeer due to different signatures
    cuda2hipRename["cuMemFree_v2"]                              = {"hipFree", CONV_MEM, API_DRIVER};
    cuda2hipRename["cuMemFreeHost"]                             = {"hipHostFree", CONV_MEM, API_DRIVER};
    cuda2hipRename["cuMemGetAddressRange"]                      = {"hipMemGetAddressRange", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
    cuda2hipRename["cuMemGetInfo_v2"]                           = {"hipMemGetInfo", CONV_MEM, API_DRIVER};
-    cuda2hipRename["cuMemHostAlloc"]                            = {"hipHostMalloc", CONV_MEM, API_DRIVER};                            // API_Runtime ANALOGUE (cudaHostAlloc)
+    cuda2hipRename["cuMemHostAlloc"]                            = {"hipHostMalloc", CONV_MEM, API_DRIVER};                              // API_Runtime ANALOGUE (cudaHostAlloc)
    cuda2hipRename["cuMemHostGetDevicePointer"]                 = {"hipMemHostGetDevicePointer", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
    cuda2hipRename["cuMemHostGetFlags"]                         = {"hipMemHostGetFlags", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["cuMemHostRegister_v2"]                      = {"hipHostRegister", CONV_MEM, API_DRIVER};                          // API_Runtime ANALOGUE (cudaHostAlloc)
-    cuda2hipRename["cuMemHostUnregister"]                       = {"hipHostUnregister", CONV_MEM, API_DRIVER};                        // API_Runtime ANALOGUE (cudaHostUnregister)
+    cuda2hipRename["cuMemHostRegister_v2"]                      = {"hipHostRegister", CONV_MEM, API_DRIVER};                            // API_Runtime ANALOGUE (cudaHostAlloc)
+    cuda2hipRename["cuMemHostUnregister"]                       = {"hipHostUnregister", CONV_MEM, API_DRIVER};                          // API_Runtime ANALOGUE (cudaHostUnregister)
    cuda2hipRename["cuMemsetD16_v2"]                            = {"hipMemsetD16", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
    cuda2hipRename["cuMemsetD16Async"]                          = {"hipMemsetD16Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
    cuda2hipRename["cuMemsetD2D16_v2"]                          = {"hipMemsetD2D16", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
@@ -1058,18 +1064,22 @@ struct cuda2hipMap {
    cuda2hipRename["cuMemsetD2D32Async"]                        = {"hipMemsetD2D32Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
    cuda2hipRename["cuMemsetD2D8_v2"]                           = {"hipMemsetD2D8", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
    cuda2hipRename["cuMemsetD2D8Async"]                         = {"hipMemsetD2D8Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["cuMemsetD32_v2"]                            = {"hipMemset", CONV_MEM, API_DRIVER};                                // API_Runtime ANALOGUE (cudaMemset)
-    cuda2hipRename["cuMemsetD32Async"]                          = {"hipMemsetAsync", CONV_MEM, API_DRIVER};                           // API_Runtime ANALOGUE (cudaMemsetAsync)
+    cuda2hipRename["cuMemsetD32_v2"]                            = {"hipMemset", CONV_MEM, API_DRIVER};                                  // API_Runtime ANALOGUE (cudaMemset)
+    cuda2hipRename["cuMemsetD32Async"]                          = {"hipMemsetAsync", CONV_MEM, API_DRIVER};                             // API_Runtime ANALOGUE (cudaMemsetAsync)
    cuda2hipRename["cuMemsetD8_v2"]                             = {"hipMemsetD8", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
    cuda2hipRename["cuMemsetD8Async"]                           = {"hipMemsetD8Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
    cuda2hipRename["cuMipmappedArrayCreate"]                    = {"hipMipmappedArrayCreate", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
    cuda2hipRename["cuMipmappedArrayDestroy"]                   = {"hipMipmappedArrayDestroy", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
    cuda2hipRename["cuMipmappedArrayGetLevel"]                  = {"hipMipmappedArrayGetLevel", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
-    // unsupported yet by HIP [CUDA 8.0.44]
-    cuda2hipRename["cuMemPrefetchAsync"]                        = {"hipMemPrefetchAsync___", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};   // [CUDA 8.0.44] // no API_Runtime ANALOGUE (cudaMemPrefetchAsync has different signature)
+
+    // Unified Addressing
+    cuda2hipRename["cuMemPrefetchAsync"]                        = {"hipMemPrefetchAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};    // [CUDA 8.0.44] // no API_Runtime ANALOGUE (cudaMemPrefetchAsync has different signature)
    cuda2hipRename["cuMemAdvise"]                               = {"hipMemAdvise", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};             // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemAdvise)
    cuda2hipRename["cuMemRangeGetAttribute"]                    = {"hipMemRangeGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};  // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemRangeGetAttribute)
    cuda2hipRename["cuMemRangeGetAttributes"]                   = {"hipMemRangeGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemRangeGetAttributes)
+    cuda2hipRename["cuPointerGetAttribute"]                     = {"hipPointerGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cuPointerGetAttributes"]                    = {"hipPointerGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cuPointerSetAttribute"]                     = {"hipPointerSetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};

    // Texture Reference Mngmnt
    // Texture reference filtering modes
@@ -245,6 +245,128 @@ namespace hip_impl
        HIP_kernel_functor_name_begin ## _ ## k ## _ ## \
        HIP_kernel_functor_name_end ## _ ## n

+    #define make_kernel_functor_hip_30(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
+        p24, p25, p26, p27)\
+        struct make_kernel_name_hip(function_name, 28) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            std::decay_t<decltype(p13)> _p13_;\
+            std::decay_t<decltype(p14)> _p14_;\
+            std::decay_t<decltype(p15)> _p15_;\
+            std::decay_t<decltype(p16)> _p16_;\
+            std::decay_t<decltype(p17)> _p17_;\
+            std::decay_t<decltype(p18)> _p18_;\
+            std::decay_t<decltype(p19)> _p19_;\
+            std::decay_t<decltype(p20)> _p20_;\
+            std::decay_t<decltype(p21)> _p21_;\
+            std::decay_t<decltype(p22)> _p22_;\
+            std::decay_t<decltype(p23)> _p23_;\
+            std::decay_t<decltype(p24)> _p24_;\
+            std::decay_t<decltype(p25)> _p25_;\
+            std::decay_t<decltype(p26)> _p26_;\
+            std::decay_t<decltype(p27)> _p27_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
+                    _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\
+                    _p26_, _p27_);\
+            }\
+        }
+    #define make_kernel_functor_hip_29(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
+        p24, p25, p26)\
+        struct make_kernel_name_hip(function_name, 27) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            std::decay_t<decltype(p13)> _p13_;\
+            std::decay_t<decltype(p14)> _p14_;\
+            std::decay_t<decltype(p15)> _p15_;\
+            std::decay_t<decltype(p16)> _p16_;\
+            std::decay_t<decltype(p17)> _p17_;\
+            std::decay_t<decltype(p18)> _p18_;\
+            std::decay_t<decltype(p19)> _p19_;\
+            std::decay_t<decltype(p20)> _p20_;\
+            std::decay_t<decltype(p21)> _p21_;\
+            std::decay_t<decltype(p22)> _p22_;\
+            std::decay_t<decltype(p23)> _p23_;\
+            std::decay_t<decltype(p24)> _p24_;\
+            std::decay_t<decltype(p25)> _p25_;\
+            std::decay_t<decltype(p26)> _p26_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
+                    _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\
+                    _p26_);\
+            }\
+        }
+    #define make_kernel_functor_hip_28(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
+        p24, p25)\
+        struct make_kernel_name_hip(function_name, 26) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            std::decay_t<decltype(p13)> _p13_;\
+            std::decay_t<decltype(p14)> _p14_;\
+            std::decay_t<decltype(p15)> _p15_;\
+            std::decay_t<decltype(p16)> _p16_;\
+            std::decay_t<decltype(p17)> _p17_;\
+            std::decay_t<decltype(p18)> _p18_;\
+            std::decay_t<decltype(p19)> _p19_;\
+            std::decay_t<decltype(p20)> _p20_;\
+            std::decay_t<decltype(p21)> _p21_;\
+            std::decay_t<decltype(p22)> _p22_;\
+            std::decay_t<decltype(p23)> _p23_;\
+            std::decay_t<decltype(p24)> _p24_;\
+            std::decay_t<decltype(p25)> _p25_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
+                    _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_);\
+            }\
+        }
    #define make_kernel_functor_hip_27(\
        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
        p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
@@ -174,6 +174,9 @@ static constexpr int warpSize = 64;
 __device__ long long int clock64();
 __device__ clock_t clock();

+//abort
+__device__ void abort();
+
 //atomicAdd()
 __device__ int atomicAdd(int* address, int val);
 __device__ unsigned int atomicAdd(unsigned int* address,
@@ -1194,7 +1194,7 @@ hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t siz
 *
 *  @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpy2DFromArray, hipMemcpyArrayToArray, hipMemcpy2DArrayToArray, hipMemcpyFromSymbol, hipMemcpyAsync, hipMemcpy2DAsync, hipMemcpyToArrayAsync, hipMemcpy2DToArrayAsync, hipMemcpyFromArrayAsync, hipMemcpy2DFromArrayAsync, hipMemcpyToSymbolAsync, hipMemcpyFromSymbolAsync
 */
-hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind);
+hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t sizeBytes, size_t offset = 0, hipMemcpyKind kind = hipMemcpyHostToDevice);


 /**
@@ -1214,11 +1214,11 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t siz
 *
 *  @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpy2DFromArray, hipMemcpyArrayToArray, hipMemcpy2DArrayToArray, hipMemcpyFromSymbol, hipMemcpyAsync, hipMemcpy2DAsync, hipMemcpyToArrayAsync, hipMemcpy2DToArrayAsync, hipMemcpyFromArrayAsync, hipMemcpy2DFromArrayAsync, hipMemcpyToSymbolAsync, hipMemcpyFromSymbolAsync
 */
-hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream);
+hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream = 0);

-hipError_t hipMemcpyFromSymbol(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind);
+hipError_t hipMemcpyFromSymbol(void *dst, const void* symbolName, size_t sizeBytes, size_t offset = 0, hipMemcpyKind kind = hipMemcpyDeviceToHost);

-hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream);
+hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream = 0);

 /**
 *  @brief Copy data from src to dst asynchronously.
@@ -36,25 +36,21 @@ THE SOFTWARE.

 #define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \
 __device__ __host__ type() {} \
-__device__ __host__ type(type& val) : x(val.x) { } \
 __device__ __host__ type(const type& val) : x(val.x) { } \
 __device__ __host__ ~type() {} 

 #define MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(type) \
 __device__ __host__ type() {} \
-__device__ __host__ type(type& val) : x(val.x), y(val.y) { } \
 __device__ __host__ type(const type& val) : x(val.x), y(val.y) { } \
 __device__ __host__ ~type() {}

 #define MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(type) \
 __device__ __host__ type() {} \
-__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z) { } \
 __device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { } \
 __device__ __host__ ~type() {} 

 #define MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(type) \
 __device__ __host__ type() {} \
-__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \
 __device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \
 __device__ __host__ ~type() {}

@@ -28,7 +28,6 @@ THE SOFTWARE.
 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HOST_DEFINES_H
 #define HIP_INCLUDE_HIP_HCC_DETAIL_HOST_DEFINES_H

-#define USE_PROMOTE_FREE_HCC 1

 // Add guard to Generic Grid Launch method
 #ifndef GENERIC_GRID_LAUNCH
@@ -42,13 +41,10 @@ THE SOFTWARE.
 #define __host__     __attribute__((cpu))
 #define __device__   __attribute__((hc))

-//#warning "HOST DEFINE header included"
 #if GENERIC_GRID_LAUNCH == 0
-//#warning "original global define reached"
 #define __global__  __attribute__((hc_grid_launch)) __attribute__((used))
 #else
-//#warning "GGL global define reached"
-#define __global__ __attribute__((annotate("hip__global__"), hc, used))
+#define __global__ __attribute__((annotate("hip__global__"), hc, used, weak))
 #endif //GENERIC_GRID_LAUNCH

 #define __noinline__      __attribute__((noinline))
@@ -61,11 +57,7 @@ THE SOFTWARE.
 */
 // _restrict is supported by the compiler
 #define __shared__     tile_static
-#if USE_PROMOTE_FREE_HCC==1
 #define __constant__   __attribute__((hc))
-#else
-#define __constant__   ADDRESS_SPACE_1
-#endif

 #else
 // Non-HCC compiler
@@ -109,6 +109,10 @@ kernelName<<<numblocks,numthreads,memperblock,streamId>>>(__VA_ARGS__);\

 #define HIP_DYNAMIC_SHARED_ATTRIBUTE

+#ifdef __HIP_DEVICE_COMPILE__
+#define abort() {asm("trap;");}
+#endif
+
 #endif

 #endif
@@ -360,16 +360,16 @@ inline static hipError_t hipMemcpyToSymbol(const void* symbol, const void* src,
 	return hipCUDAErrorTohipError(cudaMemcpyToSymbol(symbol, src, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(copyType)));
 }

-inline static hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src, size_t sizeBytes, size_t offset, hipMemcpyKind copyType, hipStream_t stream) {
-    return hipCUDAErrorTohipError(cudaMemcpyToSymbolAsync(symbol, src, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(copyType)));
+inline static hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src, size_t sizeBytes, size_t offset, hipMemcpyKind copyType, hipStream_t stream = 0) {
+    return hipCUDAErrorTohipError(cudaMemcpyToSymbolAsync(symbol, src, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(copyType), stream));
 }

-inline static hipError_t hipMemcpyFromSymbol(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind)
+inline static hipError_t hipMemcpyFromSymbol(void *dst, const void* symbolName, size_t sizeBytes, size_t offset = 0, hipMemcpyKind kind = hipMemcpyDeviceToHost)
 {
    return hipCUDAErrorTohipError(cudaMemcpyFromSymbol(dst, symbolName, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(kind)));
 }

-inline static hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream)
+inline static hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream = 0)
 {
    return hipCUDAErrorTohipError(cudaMemcpyFromSymbolAsync(dst, symbolName, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(kind), stream));
 }
@@ -32,4 +32,4 @@ test: $(EXECUTABLE)
 clean:
 	rm -f $(EXECUTABLE)
 	rm -f $(OBJECTS)
-	rm -f $(HIP_PATH)/src/*.o
+
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
@@ -149,19 +149,19 @@ __device__ long long int __double_as_longlong(double x)
  return hold64.sli;
 }

-__device__ int float2int_rd(float x)
+__device__ int __float2int_rd(float x)
 {
  return (int)x;
 }
-__device__ int float2int_rn(float x)
+__device__ int __float2int_rn(float x)
 {
  return (int)x;
 }
-__device__ int float2int_ru(float x)
+__device__ int __float2int_ru(float x)
 {
  return (int)x;
 }
-__device__ int float2int_rz(float x)
+__device__ int __float2int_rz(float x)
 {
  return (int)x;
 }
@@ -839,6 +839,11 @@ __device__ float __hip_ynf(int n, float x)
 __device__ long long int clock64() { return (long long int)hc::__cycle_u64(); };
 __device__ clock_t clock() { return (clock_t)hc::__cycle_u64(); };

+//abort
+__device__ void abort()
+{
+    return hc::abort();
+}

 //atomicAdd()
 __device__  int atomicAdd(int* address, int val)
@@ -369,12 +369,24 @@ hipError_t hipDeviceGetName(char *name,int len,hipDevice_t device)
 hipError_t hipDeviceGetPCIBusId (char *pciBusId,int len, int device)
 {
    HIP_INIT_API(pciBusId, len, device);
-    hipError_t e = hipSuccess;
-    int tempPciBusId = 0;
-    e = ihipDeviceGetAttribute( &tempPciBusId, hipDeviceAttributePciBusId, device);
-    if( e == hipSuccess) {
-        std::string tempPciStr = std::to_string(tempPciBusId);
-        memcpy( pciBusId , tempPciStr.c_str() , tempPciStr.length() );
+    hipError_t e = hipErrorInvalidValue;
+    int deviceCount = 0;
+    ihipGetDeviceCount( &deviceCount );
+    if((device > deviceCount) || (device < 0)) {
+        e = hipErrorInvalidDevice;
+    } else {
+        if((pciBusId != nullptr) && (len > 0)) {
+            int tempPciBusId = 0;
+            e = ihipDeviceGetAttribute( &tempPciBusId, hipDeviceAttributePciBusId, device);
+            if( e == hipSuccess) {
+                std::string tempPciStr = std::to_string(tempPciBusId);
+                if( len < tempPciStr.length()){
+                    e = hipErrorInvalidValue;
+                } else { 
+                    memcpy( pciBusId , tempPciStr.c_str() , tempPciStr.length() );
+                }
+            }
+        }
    }
    return ihipLogStatus(e);
 }
@@ -48,9 +48,6 @@ THE SOFTWARE.
 #include "env.h"


-// needs HCC change for hc::no_scope
-#define USE_NO_SCOPE 1
-
 //=================================================================================================
 //Global variables:
 //=================================================================================================
@@ -81,6 +78,7 @@ int HIP_FORCE_P2P_HOST = 0;
 int HIP_FAIL_SOC = 0;
 int HIP_DENY_PEER_ACCESS = 0;

+int HIP_HIDDEN_FREE_MEM = 256;
 // Force async copies to actually use the synchronous copy interface.
 int HIP_FORCE_SYNC_COPY = 0;

@@ -1207,8 +1205,8 @@ void HipReadEnv()
        tokenize(HIP_LAUNCH_BLOCKING_KERNELS, ',', &g_hipLaunchBlockingKernels);
    }
    READ_ENV_I(release, HIP_API_BLOCKING, 0, "Make HIP APIs 'host-synchronous', so they block until completed.  Impacts hipMemcpyAsync, hipMemsetAsync." );
-
-
+    
+    READ_ENV_I(release, HIP_HIDDEN_FREE_MEM, 0, "Amount of memory to hide from the free memory reported by hipMemGetInfo, specified in MB. Impacts hipMemGetInfo." );

    READ_ENV_C(release, HIP_DB, 0,  "Print debug info.  Bitmask (HIP_DB=0xff) or flags separated by '+' (HIP_DB=api+sync+mem+copy)", HIP_DB_callback);
    if ((HIP_DB & (1<<DB_API))  && (HIP_TRACE_API == 0)) {
@@ -61,7 +61,7 @@ extern int HIP_FORCE_P2P_HOST;

 extern int HIP_COHERENT_HOST_ALLOC;

-
+extern int HIP_HIDDEN_FREE_MEM;
 //---
 // Chicken bits for disabling functionality to work around potential issues:
 extern int HIP_SYNC_HOST_ALLOC;
@@ -1130,6 +1130,10 @@ hipError_t hipMemGetInfo  (size_t *free, size_t *total)
            hc::am_memtracker_sizeinfo(device->_acc, &deviceMemSize, &hostMemSize, &userMemSize);

            *free =  device->_props.totalGlobalMem - deviceMemSize;
+
+           // Deduct the amount of memory from the free memory reported from the system
+           if(HIP_HIDDEN_FREE_MEM)
+               *free -= (size_t)HIP_HIDDEN_FREE_MEM*1024*1024;
        }
        else {
             e = hipErrorInvalidValue;
@@ -1275,70 +1279,78 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){
    // Get the size of allocated pointer
    size_t psize;
    hc::accelerator acc;
-    hc::AmPointerInfo amPointerInfo( NULL , NULL , 0 , acc , 0 , 0 );
-    am_status_t status = hc::am_memtracker_getinfo( &amPointerInfo , devPtr );
-    if (status == AM_SUCCESS) {
-        psize = (size_t)amPointerInfo._sizeBytes;
-    }
-    else
+    if((handle == NULL) || (devPtr == NULL)) {
        hipStatus = hipErrorInvalidResourceHandle;
-    ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) handle;
-    // Save the size of the pointer to hipIpcMemHandle
-    iHandle->psize = psize;
+    } else {
+        hc::AmPointerInfo amPointerInfo( NULL , NULL , 0 , acc , 0 , 0 );
+        am_status_t status = hc::am_memtracker_getinfo( &amPointerInfo , devPtr );
+        if (status == AM_SUCCESS) {
+            psize = (size_t)amPointerInfo._sizeBytes;
+        } else
+            hipStatus = hipErrorInvalidResourceHandle;
+        ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) handle;
+        // Save the size of the pointer to hipIpcMemHandle
+        iHandle->psize = psize;

 #if USE_IPC
-    // Create HSA ipc memory
-    hsa_status_t hsa_status =
-        hsa_amd_ipc_memory_create(devPtr, psize, (hsa_amd_ipc_memory_t*) &(iHandle->ipc_handle));
-    if(hsa_status!= HSA_STATUS_SUCCESS)
-        hipStatus = hipErrorMemoryAllocation;
+        // Create HSA ipc memory
+        hsa_status_t hsa_status =
+            hsa_amd_ipc_memory_create(devPtr, psize, (hsa_amd_ipc_memory_t*) &(iHandle->ipc_handle));
+        if(hsa_status!= HSA_STATUS_SUCCESS)
+            hipStatus = hipErrorMemoryAllocation;
 #else
-    hipStatus = hipErrorRuntimeOther;
+        hipStatus = hipErrorRuntimeOther;
 #endif
-
+    }
    return ihipLogStatus(hipStatus);
 }

 hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned int flags){
    HIP_INIT_API ( devPtr, &handle , flags);
    hipError_t hipStatus = hipSuccess;
-
+    if(devPtr == NULL) {
+        hipStatus = hipErrorInvalidValue;
+    } else {
 #if USE_IPC
-    // Get the current device agent.
-    hc::accelerator acc;
-    hsa_agent_t *agent = static_cast<hsa_agent_t*>(acc.get_hsa_agent());
-    if(!agent)
-        return hipErrorInvalidResourceHandle;
+        // Get the current device agent.
+        hc::accelerator acc;
+        hsa_agent_t *agent = static_cast<hsa_agent_t*>(acc.get_hsa_agent());
+        if(!agent)
+            return hipErrorInvalidResourceHandle;

-    ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) &handle;
-    //Attach ipc memory
-    auto ctx= ihipGetTlsDefaultCtx();
-    {
-        LockedAccessor_CtxCrit_t crit(ctx->criticalData());
-        // the peerCnt always stores self so make sure the trace actually
-        hsa_status_t hsa_status =
-         hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, crit->peerCnt(), crit->peerAgents(), devPtr);
-        if(hsa_status != HSA_STATUS_SUCCESS)
-            hipStatus = hipErrorMapBufferObjectFailed;
-    }
+        ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) &handle;
+        //Attach ipc memory
+        auto ctx= ihipGetTlsDefaultCtx();
+        {
+            LockedAccessor_CtxCrit_t crit(ctx->criticalData());
+            // the peerCnt always stores self so make sure the trace actually
+            hsa_status_t hsa_status =
+            hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, crit->peerCnt(), crit->peerAgents(), devPtr);
+            if(hsa_status != HSA_STATUS_SUCCESS)
+                hipStatus = hipErrorMapBufferObjectFailed;
+        }
 #else
-    hipStatus = hipErrorRuntimeOther;
+        hipStatus = hipErrorRuntimeOther;
 #endif
+    }
    return ihipLogStatus(hipStatus);
 }

 hipError_t hipIpcCloseMemHandle(void *devPtr){
    HIP_INIT_API ( devPtr );
    hipError_t hipStatus = hipSuccess;
-
+    if(devPtr == NULL) {
+        hipStatus = hipErrorInvalidValue;
+    } else {
 #if USE_IPC
-    hsa_status_t hsa_status =
-        hsa_amd_ipc_memory_detach(devPtr);
-    if(hsa_status != HSA_STATUS_SUCCESS)
-        return hipErrorInvalidResourceHandle;
+        hsa_status_t hsa_status =
+            hsa_amd_ipc_memory_detach(devPtr);
+        if(hsa_status != HSA_STATUS_SUCCESS)
+            return hipErrorInvalidResourceHandle;
 #else
-    hipStatus = hipErrorRuntimeOther;
+        hipStatus = hipErrorRuntimeOther;
 #endif
+    }
    return ihipLogStatus(hipStatus);
 }

@@ -451,7 +451,13 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
        hc::completion_future cf;

        lp.av->dispatch_hsa_kernel(&aql, config[1] /* kernarg*/, kernArgSize, 
-                                  (startEvent || stopEvent) ? &cf : nullptr);
+                                  (startEvent || stopEvent) ? &cf : nullptr
+#define USE_NAMED_KERNEL 0
+#if USE_NAMED_KERNEL
+                                  , f->_name.c_str()
+#endif
+                                  );
+


        if (startEvent) {
@@ -32,7 +32,6 @@ THE SOFTWARE.

 #endif

-#define USE_AV_COPY (__hcc_workweek__ >= 16351) 

 size_t Nbytes = 0;

@@ -410,21 +409,13 @@ void thread_noise_generator(int iters, size_t numBuffers, Dir addDir, Dir remove

        if (addDir == Up) {
            for (char *p = basePtr; p<basePtr + maxSize; p+=bufferSize) {
-#if USE_AV_COPY
                hc::AmPointerInfo info(p, p, bufferSize, acc, false, false);
                hc::am_memtracker_add(p, info);
-#else
-                hc::am_memtracker_add(p, bufferSize, acc, false);
-#endif
            }
        } else if (addDir == Down) {
            for (char *p = basePtr+maxSize-bufferSize; p>=0; p-=bufferSize) {
-#if USE_AV_COPY
                hc::AmPointerInfo info(p, p, bufferSize, acc, false, false);
                hc::am_memtracker_add(p, info);
-#else
-                hc::am_memtracker_add(p, bufferSize, acc, false);
-#endif
            }
        }

@@ -36,7 +36,6 @@ THE SOFTWARE.
 #define USE_HCC_MEMTRACKER 0
 #endif

-#define USE_HSA_COPY 1

 int elementSizes[] = {16, 1024,524288};
 int nSizes  = sizeof(elementSizes) / sizeof(int);
@@ -102,11 +101,8 @@ void runTest(bool stepAIsCopy, bool hostSync, hipStream_t gpu0Stream, hipStream_
    hipStream_t stepAStream = gpu0Stream;

    if (stepAIsCopy) {
-#ifdef USE_HSA_COPY
        HIPCHECK(hipMemcpyAsync(dataGpu1, dataGpu0_0, sizeElements, hipMemcpyDeviceToDevice, stepAStream));
-#endif
    } else {
-        //assert(0); // not yet supported.
        unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
        hipLaunchKernelGGL(memcpyIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, gpu0Stream,
                       dataGpu0_0, dataGpu1, numElements);