Merge branch 'amd-develop' into amd-master

Change-Id: I28abeb5f0c81dd0315dcb1c987f22704a9c0d18b
2016-09-01 16:06:01 +05:30
commit 1bd2fa7df6
@@ -117,6 +117,10 @@ Differences or limitations of HIP APIs as compared to CUDA APIs should be clearl
    - Keyword TODO refers to a note that should be addressed in long-term.  Could be style issue, software architecture, or known bugs.
    - FIXME refers to a short-term bug that needs to be addressed.

+- HIP_INIT_API() should be placed at the start of each top-level HIP API.  This function will make sure the HIP runtime is initialized,
+  and also constructs an appropriate API string for tracing and CodeXL marker tracing.  The arguments to HIP_INIT_API should match
+  those of the parent fucntion.  
+

 #### Presubmit Testing:
 Before checking in or submitting a pull request, run all Rodinia tests and ensure pass results match starting point:
@@ -105,7 +105,7 @@ if ($HIP_PLATFORM eq "hcc") {
    }

    # Satisfy HCC dependencies
-    $HIPLDFLAGS .= " -lc++abi";
+    $HIPLDFLAGS .= " -lc++abi -lsupc++";
    $HIPLDFLAGS .= " -L$HSA_PATH/lib -L$ROCM_PATH/lib -lhsa-runtime64 -lhc_am -lhsakmt";

    # Handle ROCm target platform
@@ -151,13 +151,12 @@ if ($HIP_PLATFORM eq "hcc") {
    $HIPCC="$CUDA_PATH/bin/nvcc";
    $HIPCXXFLAGS .= " -I$CUDA_PATH/include";

-    $HIPLDFLAGS = "";
+    $HIPLDFLAGS = "-lcuda -lcudart";
 } else {
    printf ("error: unknown HIP_PLATFORM = '$HIP_PLATFORM'");
    exit (-1);
 }

-
 # Add paths to common HIP includes:
 $HIPCXXFLAGS .= " -I$HIP_PATH/include" ;

@@ -177,7 +176,41 @@ if ($verbose & 0x4) {
    print "hipcc-args: ", join (" ", @ARGV), "\n";
 }

+# Handle code object generation
+my $ISACMD="";
+if($HIP_PLATFORM eq "hcc"){
+  $ISACMD .= "$HIP_PATH/bin/hipgenisa.sh ";
+  $ISACMD .= $ROCM_PATH;
+  if($ARGV[0] eq "--gencodeobject"){
+    foreach $isaarg (@ARGV[1..$#ARGV]){
+      $ISACMD .= " ";
+      $ISACMD .= $isaarg;
+    }
+    if ($verbose & 0x1) {
+      print "hipcc-cmd: ", $ISACMD, "\n";
+    }
+    system($ISACMD) and die();
+    exit(0);
+  }
+}
+
+if($HIP_PLATFORM eq "nvcc"){
+  $ISACMD .= "$HIP_PATH/bin/hipcc -ptx ";
+  if($ARGV[0] eq "--gencodeobject"){
+    foreach $isaarg (@ARGV[1..$#ARGV]){
+      $ISACMD .= " ";
+      $ISACMD .= $isaarg;
+    }
+    if ($verbose & 0x1) {
+      print "hipcc-cmd: ", $ISACMD, "\n";
+    }
+    system($ISACMD) and die();
+    exit(0);
+  }
+}
+
 my $toolArgs = "";  # arguments to pass to the hcc or nvcc tool
+
 foreach $arg (@ARGV)
 {
    my $swallowArg = 0;
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+if [ $1 = " " ]
+then
+exit
+fi
+
+ROCM_PATH=$1
+GEN_ISA=$2
+FILE_NAMES=$3
+OUT=$4
+OUTPUT_FILE=$5
+TARGET=""
+if [ ${GEN_ISA:0:12} = "--target-isa" ]
+then
+  TARGET=${GEN_ISA:13:12}
+fi
+
+SOURCE="${BASH_SOURCE[0]}"
+HIP_PATH="$( command cd -P "$( dirname "$SOURCE" )/.." && pwd )"
+
+export KMDUMPISA=1
+export KMDUMPLLVM=1
+mkdir /tmp/hipgenisa
+$HIP_PATH/bin/hipcc $FILE_NAMES -o /tmp/hipgenisa/a.out
+mv dump.* /tmp/hipgenisa/
+$ROCM_PATH/hcc-lc/bin/llvm-mc -arch=amdgcn -mcpu=$TARGET -filetype=obj /tmp/hipgenisa/dump.isa -o /tmp/hipgenisa/dump.o
+$ROCM_PATH/llvm/bin/clang -target amdgcn--amdhsa /tmp/hipgenisa/dump.o -o $OUTPUT_FILE
+rm -r /tmp/hipgenisa
+export KMDUMPISA=0
+export KMDUMPLLVM=0
@@ -42,12 +42,12 @@
 |   **CUDA**                                                |   **HIP**                     | **CUDA description**                                                                                                           |
 |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
 | `cudaStreamAddCallback`                                   |                               | Add a callback to a compute stream.                                                                                            |
-| `cudaStreamAttachMemAsync`                                |                               | Attach memory to a stream asynchronously.                                                                                      |
+| `cudaStreamAttachMemAsync`                                |                               | Attach managed memory to a stream asynchronously.                                                                                      |
 | `cudaStreamCreate`                                        | `hipStreamCreate`             | Create an asynchronous stream.                                                                                                 |
 | `cudaStreamCreateWithFlags`                               | `hipStreamCreateWithFlags`    | Create an asynchronous stream.                                                                                                 |
 | `cudaStreamCreateWithPriority`                            |                               | Create an asynchronous stream with the specified priority.                                                                     |
 | `cudaStreamDestroy`                                       | `hipStreamDestroy`            | Destroys and cleans up an asynchronous stream.                                                                                 |
-| `cudaStreamGetFlags`                                      |                               | Query the flags of a stream.                                                                                                   |
+| `cudaStreamGetFlags`                                      | `hipStreamGetFlags`           | Query the flags of a stream.                                                                                                   |
 | `cudaStreamGetPriority`                                   |                               | Query the priority of a stream.                                                                                                |
 | `cudaStreamQuery`                                         |                               | Queries an asynchronous stream for completion status.                                                                          |
 | `cudaStreamSynchronize`                                   | `hipStreamSynchronize`        | Waits for stream tasks to complete.                                                                                            |
@@ -100,8 +100,8 @@
 | `cudaHostAlloc`                                           | `hipHostMalloc`                | Allocates page-locked memory on the host.                                                                                      |
 | `cudaHostGetDevicePointer`                                | `hipHostGetDevicePointer`    | Passes back device pointer of mapped host memory allocated by cudaHostAlloc or registered by cudaHostRegister.                 |
 | `cudaHostGetFlags`                                        | `hipHostGetFlags`           | Passes back flags used to allocate pinned host memory allocated by cudaHostAlloc.                                              |
-| `cudaHostRegister`                                        |                               | Registers an existing host memory range for use by CUDA.                                                                       |
-| `cudaHostUnregister`                                      |                               | Unregisters a memory range that was registered with cudaHostRegister.                                                          |
+| `cudaHostRegister`                                        | `hipHostRegister`             | Registers an existing host memory range for use by CUDA.                                                                       |
+| `cudaHostUnregister`                                      | `hipHostUnregister`           | Unregisters a memory range that was registered with cudaHostRegister.                                                          |
 | `cudaMalloc`                                              | `hipMalloc`                   | Allocate memory on the device.                                                                                                 |
 | `cudaMalloc3D`                                            |                               | Allocates logical 1D, 2D, or 3D memory objects on the device.                                                                  |
 | `cudaMalloc3DArray`                                       |                               | Allocate an array on the device.                                                                                               |
@@ -231,7 +231,7 @@

 |   **CUDA**                                                |   **HIP**                     | **CUDA description**                                                                                                           |
 |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
-| `cudaBindSurfaceToArra`y                                  |                               | Binds an array to a surface.                                                                                                   |
+| `cudaBindSurfaceToArray`                                  |                               | Binds an array to a surface.                                                                                                   |
 | `cudaBindTexture`                                         |                               | Binds a memory area to a texture.                                                                                              |
 | `cudaBindTexture2D`                                       |                               | Binds a 2D memory area to a texture.                                                                                           |
 | `cudaBindTextureToArray`                                  |                               | Binds an array to a texture.                                                                                                   |
@@ -32,36 +32,52 @@
 HIP provides the following:
 - Devices (hipSetDevice(), hipGetDeviceProperties(), etc.)
 - Memory management (hipMalloc(), hipMemcpy(), hipFree(), etc.)
- Streams (hipStreamCreate(), etc.)
+- Streams (hipStreamCreate(),hipStreamSynchronize(), hipStreamWaitEvent(),  etc.)
 - Events (hipEventRecord(), hipEventElapsedTime(), etc.)
 - Kernel launching (hipLaunchKernel is a standard C/C++ function that replaces <<< >>>)
+- HIP Module API to control when adn how code is loaded.
 - CUDA-style kernel coordinate functions (threadIdx, blockIdx, blockDim, gridDim)
+- Cross-lane instructions including shfl, ballot, any, all
 - Most device-side math built-ins
 - Error reporting (hipGetLastError(), hipGetErrorString())

 The HIP API documentation describes each API and its limitations, if any, compared with the equivalent CUDA API.

 ### What is not supported?
-#### Run-time features
+#### Runtime/Driver API features
+At a high-level, the following features are not supported:
 - Textures 
- MemcpyToSymbol functions
 - Dynamic parallelism (CUDA 5.0)
 - Managed memory (CUDA 6.5)
 - Graphics interoperation with OpenGL or Direct3D
+- CUDA Driver API (Under Development)
+- CUDA IPC Functions (Under Development)
+
 - CUDA array, mipmappedArray and pitched memory
- CUDA Driver API
+- MemcpyToSymbol functions
+- Queue priority controls
+
+See the [API Support Table](CUDA_Runtime_API_functions_supported_by_HIP.md) for more detailed information.

 #### Kernel language features
 - Device-side dynamic memory allocations (malloc, free, new, delete) (CUDA 4.0)
 - Virtual functions, indirect functions and try/catch (CUDA 4.0)
 - `__prof_trigger` 
- PTX assembly (CUDA 4.0)
- Several kernel features are under development.  See the [HIP Kernel Language](hip_kernel_language.md) for more information.
+- PTX assembly (CUDA 4.0).  HCC supports inline GCN assembly.
+- Several kernel features are under development.  See the [HIP Kernel Language](hip_kernel_language.md) for more information.  These include:
+  - printf
+  - assert
+  - `__restrict__`
+  - `__launch_bounds__`
+  - `__threadfence*_`, `__syncthreads*`
+  - Unbounded loop unroll
+
+

 ### Is HIP a drop-in replacement for CUDA?
 No. HIP provides porting tools which do most of the work do convert CUDA code into portable C++ code that uses the HIP APIs.
 Most developers will port their code from CUDA to HIP and then maintain the HIP version. 
-HIP code provides the same performance as coding in native CUDA, plus the benefit that the code can also run on AMD platforms.
+HIP code provides the same performance as native CUDA code, plus the benefits of running on AMD platforms.

 ### What specific version of CUDA does HIP support?
 HIP APIs and features do not map to a specific CUDA version.  HIP provides a strong subset of functionality provided in CUDA, and the hipify tools can 
@@ -156,7 +156,7 @@ The `__constant__` keyword is supported. The host writes constant memory before
 ### `__shared__` 
 The `__shared__` keyword is supported.

-`extern __shared__` allows the host to dynamically allocate shared memory and is specified as a launch parameter. This feature is under development.
+`extern __shared__` allows the host to dynamically allocate shared memory and is specified as a launch parameter.  HIP uses an alternate syntax based on the HIP_DYNAMIC_SHARED macro.

 ### `__managed__`
 Managed memory, including the `__managed__` keyword, are not supported in HIP.
@@ -537,7 +537,6 @@ HIP supports the following atomic operations.
 ### Caveats and Features Under-Development:

 - HIP enables atomic operations on 32-bit integers. Additionally, it supports an atomic float add. AMD hardware, however, implements the float add using a CAS loop, so this function may not perform efficiently.
- wrapping increment and decrement are under development.

 ## Warp Cross-Lane Functions

@@ -573,8 +572,6 @@ Applications can test whether the target platform supports the any/all instructi

 ### Warp Shuffle Functions 

-The following warp shuffle instructions are under development.
-
 Half-float shuffles are not supported. The default width is warpSize---see [Warp Cross-Lane Functions](#warp-cross-lane-functions). Applications should not assume the warpSize is 32 or 64.

 ```
@@ -670,3 +667,22 @@ The following C++ features are not supported:
 - Run-time-type information (RTTI)
 - Virtual functions
 - Try/catch
+
+## Kernel Compilation
+HIP now supports compiling C++/HIP kernels to binary. Eventhough HIP does not support fatbinary (yet), the user can specify the target for which the binary can be generated. The file format for binary is `.co` which means Code Object. The following command builds the binary using `hipcc`.
+
+`hipcc --genisa --target-isa=[TARGET GPU] [INPUT FILE] -o [OUTPUT FILE]`
+```[TARGET GPU] = fiji/hawaii
+[INPUT FILE] = Name of the file containing kernels
+[OUTPUT FILE] = Name of the generated code object file```
+
+Note that the kernel file should have `int main(){}` at the end it so that the binary is generated. This happens because HCC generates binaries at linking time rather than compilation
+
+You need 3 things to run kernel in binary.
+1. Kernel Binary
+2. Name of kernel binary
+3. Name of the kernel
+
+We already got first two of them. In order to get name of the kernel, try `objdump -x [OUTPUT FILE]`. OUTPUT FILE is file generated by hipcc during kernel compilation. The output from objdump has symbol to the kernel whose name is mangled with `grid_launch_parm`, `__functor`, `__cxxamp_trampoline`. An example of how it looks is `ZN12_GLOBAL__N_137_Z3Cpy16grid_launch_parmPfS0__functor19__cxxamp_trampolineEiiiiiiPKfPf` where `Cpy` is the name of the kernel written in C++.
+
+
@@ -0,0 +1,42 @@
+# HIP Performance Optimizations
+
+Please note that this document lists possible ways for experimenting with HIP stack to gain performance. Performance may vary from platform to platform.
+
+### Unpinned Memory Transfer Optimizations
+ 
+#### On Small BAR Setup
+
+There are two possible ways to transfer data from Host to Device (H2D) and Device to Host(D2H)
+ * Using Staging Buffers
+ * Using PinInPlace
+
+#### On Large BAR Setup
+
+There are two possible ways to transfer data from Host to Device (H2D)
+ * Using Staging Buffers
+ * Using PinInPlace
+ * Direct Memcpy
+ 
+ And there are two possible ways to transfer data from Device to Host (D2H)
+ * Using Staging Buffers
+ * Using PinInPlace
+ 
+Some GPUs may not be able to directly access host memory, and in these cases we need to
+stage the copy through an optimized pinned staging buffer, to implement H2D and D2H copies.The copy is broken into buffer-sized chunks to limit the size of the buffer and also to provide better performance by overlapping the CPU copies with the DMA copies.
+
+PinInPlace is another algorithm which pins the host memory "in-place", and copies it with the DMA
+engine.
+
+By default staging buffers are used for unpinned memory transfers, however other ways can be used by enabling few environment variables (so no need to build the code again!!!)
+
+Following environment variables can be used:
+
+- HIP_PININPLACE - This environment variable forces the use of PinInPlace logic for all unpinned memory copies
+
+- HIP_OPTIMAL_MEM_TRANSFER- This environment variable enables a hybrid memory copy logic based on thresholds. These thresholds can be managed with following environment variables:
+  -   HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE - Threshold in bytes for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic.
+  -   HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING - Threshold in bytes for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic.
+  -   HIP_D2H_MEM_TRANSFER_THRESHOLD - Threshold in bytes for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic.
+
+
+
@@ -0,0 +1,257 @@
+# Porting CUDA Driver API
+
+## Introduction to the CUDA Driver and Runtime APIs
+CUDA provides a separate CUDA Driver and Runtime APIs.  The two APis have significant overlap in functionality:  
+- Both APIs support events, streams, memory management, memory copy, and error handling.
+- Both APIs deliver similar performance.  
+- Driver APIs calls begin with the prefix `cu` while Runtime APIs begin with the prefix `cuda`.  For example, the Driver API API contains `cuEventCreate` while the Runtime API contains `cudaEventCreate`, with similar functionality.
+- The Driver API defines a different but largely overlapping error code code space than the Runtime API, and uses a different coding convention.  For example, Driver API defines `CUDA_ERROR_INVALID_VALUE` while the Runtime API defines `cudaErrorInvalidValue`
+
+
+The Driver API offers two additional pieces of functionality not provided by the Runtime API: cuModule and cuCtx APIs.
+
+### cuModule API
+The Module section of the Driver API provides additional control over how and when accelerator
+ code objects are loaded.
+For example, the driver API allows code objects to be loaded from files or memory pointers.
+Symbols for kernels or global data can be extracted from the loaded code objects.
+In contrast, the Runtime API automatically loads and (if necessary) compiles all of the 
+kernels from a executable binary when run.
+In this mode, NVCC must be used to compile kernel code so the automatic loading can function correctly.
+
+Both Driver and Runtime APIs define a function for launching kernels (called `cuLaunchKernel` or `cudaLaunchKernel`.
+The kernel arguments and the execution configuration (grid dimensions, group dimensions, dynamic shared memory, and stream) are passed as arguments to the launch function.
+The Runtime additionally provides the `<<< >>>` syntax for launching kernels, which resembles
+a special function call and is easier to use than explicit launch API (in particular with respect to
+handling of kernel arguments).
+However, this syntax is not standard C++ and is available only when NVCC is used to compile the host code.  
+
+The Module features are useful in an environment which generate the code objects directly, such as a new
+accelerator language front-end.  Here, NVCC is not used.   Instead, the environment may have a 
+different kernel language or different compilation flow.
+Other environments have many kernels and do not want them to be all loaded automatically.
+The Module functions can be used to load the generated code objects and launch kernels.
+As we will see below, HIP defines a Module API which provides similar explict control over code
+object managemenet.
+
+### cuCtx API
+The Driver API defines "Context" and "Devices" as separate entities.  
+Contexts contain a single device, and a device can theoretically have multiple contexts.  
+Each context contains a set of streams and events specific to the context.  
+Historically contexts also defined a unique address space for the GPU, though this may not longer be the case in Unified Memory platforms (since the CPU and all the devices in the same process share a single unified address space).  
+The Context APIs also provide a mechanism to switch between devices, which allowed
+a single CPU thread to send commands to different GPUs.  HIP as well as a recent versions
+of CUDA Runtime provide other mechanisms to accomplish this feat - for example using streams or
+`cudaSetDevice`.
+
+The CUDA Runtime API unifies the Context API with the Device API.  This simplifies the APIs
+and has little loss of functionality since each Context can contain a single device, 
+and the benefits of multiple contexts has been replaced with other interfaces.   HIP provides
+a context API to facilitate easy porting from existing Driver codes.  
+In HIP, the Ctx functions largely provide an alternate syntax for changing the active device. 
+Most new applications will prefer to use `hipSetDevice` or the stream APIs.
+
+## HIP Module and Ctx APIs
+
+Rather than present two separate APIs, HIP extends the HIP API with new APIs for Modules 
+and Ctx control.   
+
+### hipModule API
+
+Like the CUDA Driver API, the Module API provides additional control 
+over how code is loaded, including options to load code from files or from in-memory pointers.  
+NVCC and HCC target different architectures and use different code object formats : NVCC
+is `cubin` or `ptx` files, while the HCC path is the `hsaco` format.   The external
+compilers which generate these code objects are responsible for generating and loading
+the correct code object for each platform.  Notably, there is not a fat binary format that
+can contain code for both NVCC and HCC platforms.     The following table summarizes the
+formats used on each platform:
+
+| Format        | APIs                         | NVCC               | HCC               |
+| ---           | ---                          | ---                | ---               |
+| Code Object   | hipModuleLoad, hipModuleLoadData | .cubin or PTX text | .hsaco            |
+| Fat Binary    | hipModuleLoadFatBin          | .fatbin            | Under Development | 
+
+hipcc uses NVCC and HCC to compile host codes.  Both of these may embed code objects
+into the final executable, and these code objects will be automatically loaded when
+the application starts.
+The hipModule API can be used to load additional code objects, and in this way
+provides an extended capability to the automatically loaded code objects.  HCC allows
+both of these capabilities to be used together, if desired.  Of course it is possible
+to create a program with no kernels and thus no automatic loading.
+
+
+### hipCtx API
+HIP provides a `Ctx` API as a thin layer over the existing Device functions.  This Ctx API
+can be used to set the current context, or to query properties of the device associated with
+the context.  The current context is implicitly used by other APIs such as `hipStreamCreate`.
+
+### hipify translation of CUDA Driver API
+The hipify tool will convert CUDA Driver APIs for streams, events, memory management to 
+the equivalent HIP driver calls.   For example, `cuEventCreate` will be translated to 
+`hipEventCreate`.  Hipify also converts error code from the Driver namespace and coding
+convention to the equivalent HIP error code.  Thus, HIP unifies the APis for these common functions.
+[hipify support for translating driver API is Under Development]
+
+The memory copy APIs require additional explanation.  The CUDA driver includes the memory
+direction in the name of the API (ie `cuMemcpyH2D`) while the CUDA driver API provides
+a single memory copy API with a parameter that specifies the direction and additionally
+supports a "default" direction where the runtime determines the direction automatically.
+HIP provides APis with both styles: for example, `hipMemcpyH2D` as well as `hipMemcpy`.
+The first flavor may be faster in some cases since they avoid host overhead to detect the 
+different memory directions.
+
+HIP defines a single error space, and uses camel-case for all errors (i.e. `hipErrorInvalidValue`).  
+
+### HCC Implementation Notes
+#### .hsaco
+The .hsaco format used by HCC is described in more detail [here](https://github.com/RadeonOpenCompute/ROCm-ComputeABI-Doc).
+An example and blog that show how to use the format is [here](http://gpuopen.com/rocm-with-harmony-combining-opencl-hcc-hsa-in-a-single-program).  hsaco can be generated by hcc + extractkernel tool,
+cloc, the GCN assembler, or other tools.
+
+#### Address Spaces
+HCC defines a process-wide address space where the CPU and all devices allocate
+addresses from a single unified pool.  Thus addresses may be shared between contexts, and 
+unlike the original CUDA definition a new context does not create a new address space for
+the device.   
+
+#### Using hipModuleLaunchKernel
+`hipModuleLaunchKernel` is `cuLaunchKernel` in HIP world. It takes the same arguments as `cuLaunchKernel`. The argument `kernelParams` is not fully implemented for HCC. The workaround for it is, to use platform specific macros for each target. Or, `extra` argument can be used which works on both the platforms.
+
+#### Additional Information
+- HCC allocates staging buffers (used for unpinned copies) on a per-device basis.
+- HCC creates a primary context when the HIP API is called.  So in a pure driver API code, HIP/HCC will create a primary context while HIP/NVCC will have empty context stack.  HIP/HCC will push primary context to context stack when it is empty. This can have subtle differences on applications which mix the runtime and driver APIs.
+
+### NVCC Implementation Notes
+
+#### Interoperation between HIP and CUDA Driver
+CUDA applications may want to mix CUDA driver code with HIP code (see example below).  This
+table shows the type equivalence to enable this interaction.
+
+|**HIP Type** |**CU Driver Type**|  **CUDA Runtime Type** |
+| ----        | ----             | ---- | 
+| hipModule   |  CUmodule        |   			|
+| hipFunction |  CUfunction      | 			|
+| hipCtx_t    |  CUcontext       | 			|
+| hipDevice_t |  CUdevice        |  			|
+| hipStream_t |  CUstream        | cudaStream_t         |
+| hipEvent_t  |  CUevent         | cudaEvent_t          |
+| hipArray    |  CUarray         | cudaArray            |
+
+
+
+#### Compilation Flags
+The hipModule interface does not support the hipModuleLoadEx function, which is used to control PTX compilaton options.
+HCC does not use PTX and does not support the same compilation options.
+In fact, HCC code objects always contain fully compiled ISA and do not require additional compilation as part of the load step.
+Code which requires this functionaly should use platform-specific coding, calling `cuModuleLoadEx` 
+on the NVCC path and hipModuleLoad on the hcc path.  For example:
+
+```
+hipModule module;
+void *imagePtr = ... ;  // Somehow populate data pointer with code object
+
+#ifdef __HIP_PLATFORM_NVCC__
+// Use CUDA driver API but write to hipModule since they are same type:
+const int numOptions = 1;
+CUJit_option options[numOptions];
+void * optionValues[numOptions];
+
+options[0] = CU_JIT_MAX_REGISTERS;
+unsigned maxRegs=15;
+optionValues[0] = (void*) (&maxRegs);
+
+cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues);
+
+#else // __HIP_PLATFORM_HCC__
+
+// HCC path does not support or require JIT options, so just load the module.
+hipModuleLoadData(&module, imagePtr);
+
+#endif
+
+// Back to unified code - both paths above loaded the "module" variable.
+hipFunction k;
+hipModuleGetFunction(&k, module, "myKernel");
+```
+
+The below sample shows how to use `hipModuleGetFunction`.
+
+```
+#include<hip_runtime.h>
+#include<hip_runtime_api.h>
+#include<iostream>
+#include<fstream>
+#include<vector>
+
+#define LEN 64
+#define SIZE LEN<<2
+
+#ifdef __HIP_PLATFORM_HCC__
+#define fileName "vcpy_isa.co"
+#endif
+
+#ifdef __HIP_PLATFORM_NVCC__
+#define fileName "vcpy_isa.ptx"
+#endif
+
+#define kernel_name "hello_world"
+
+int main(){
+    float *A, *B;
+    hipDeviceptr_t Ad, Bd;
+    A = new float[LEN];
+    B = new float[LEN];
+
+    for(uint32_t i=0;i<LEN;i++){
+        A[i] = i*1.0f;
+        B[i] = 0.0f;
+        std::cout<<A[i] << " "<<B[i]<<std::endl;
+    }
+
+
+#ifdef __HIP_PLATFORM_NVCC__
+          hipInit(0);
+          hipDevice_t device;
+          hipCtx_t context;
+          hipDeviceGet(&device, 0);
+          hipCtxCreate(&context, 0, device);
+#endif
+
+    hipMalloc((void**)&Ad, SIZE);
+    hipMalloc((void**)&Bd, SIZE);
+
+    hipMemcpyHtoD(Ad, A, SIZE);
+    hipMemcpyHtoD(Bd, B, SIZE);
+    hipModule_t Module;
+    hipFunction_t Function;
+    hipModuleLoad(&Module, fileName);
+    hipModuleGetFunction(&Function, Module, kernel_name);
+
+    std::vector<void*>argBuffer(2);
+    memcpy(&argBuffer[0], &Ad, sizeof(void*));
+    memcpy(&argBuffer[1], &Bd, sizeof(void*));
+
+    size_t size = argBuffer.size()*sizeof(void*);
+
+    void *config[] = {
+      HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
+      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+      HIP_LAUNCH_PARAM_END
+    };
+
+    hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&con                                     fig);
+
+    hipMemcpyDtoH(B, Bd, SIZE);
+    for(uint32_t i=0;i<LEN;i++){
+        std::cout<<A[i]<<" - "<<B[i]<<std::endl;
+    }
+
+#ifdef __HIP_PLATFORM_NVCC__
+          hipCtxDetach(context);
+#endif
+
+    return 0;
+}
+
+```
@@ -395,13 +395,53 @@ For new projects or ports which can be re-factored, we recommend the use of the
 This indicates that the code is standard C++ code, but also provides a unique indication for make tools to
 run hipcc when appropriate.

-### Workarounds
+## Workarounds

-#### warpSize
+### warpSize
 Code should not assume a warp size of 32 or 64.  See [Warp Cross-Lane Functions](hip_kernel_language.md#warp-cross-lane-functions) for information on how to write portable wave-aware code.

+## memcpyToSymbol

-#### Textures and Cache Control
+HIP support for hipMemCpyToSymbol is under-development.  This feature allows a kernel
+to define a device-side data symbol which can be accessed on the host side.  The symbol
+can be in __constant or device space.  As a workaround, programs can pass the symbol
+as an argument to the kernel, and use standard hipMemcpy routines to initialize it.
+
+For example:
+
+Device Code:
+```
+// Cuda Device Code
+__constant__ float Array[1024];
+__global__ void Inc(float *Out){
+   Int tx = hipThreadIdx_x;
+   Out[tx] = Array[tx] + 1;
+}
+ 
+// HIP Device Code
+__global__ void Inc(hipLaunchParm lp, float *Array, float *Out){
+   Int tx = hipThreadIdx_x;
+   Out[tx] = Array[tx] + 1;
+}
+```
+ 
+Host Code:
+```
+// CUDA Host Code
+cudaMemcpyToSymbol(Array, hostArray, sizeofArray);
+ 
+// HIP Host Code
+hipMemcpy(Array, hostArray, sizeofArray);
+```
+ 
+## threadfence_system
+Threadfence_system makes all device memory writes, all writes to mapped host memory, and all writes to peer memory visible to CPU and other GPU devices.
+Some implementations can provide this behavior by flushing the GPU L2 cache.
+HIP/HCC does not provide this functionality.  As a workaround, users can set the environment variable `HSA_DISABLE_CACHE=1` to 
+disable the GPU L2 cache. This will affect all accesses and for all kernels and so may have 
+a performance impact.
+
+### Textures and Cache Control

 >Texture support is under-development and not yet supported by HIP.

@@ -0,0 +1,172 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+#ifndef HIPCOMPLEX_H
+#define HIPCOMPLEX_H
+
+typedef struct{
+    float x;
+    float y;
+}hipFloatComplex;
+
+__device__ static inline float hipCrealf(hipFloatComplex z){
+    return z.x;
+}
+
+__device__ static inline float hipCimagf(hipFloatComplex z){
+    return z.y;
+}
+
+__device__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){
+    hipFloatComplex z;
+    z.x = a;
+    z.y = b;
+    return z;
+}
+
+__device__ static inline hipFloatComplex hipConjf(hipFloatComplex z){
+    hipFloatComplex ret;
+    ret.x = z.x;
+    ret.y = -z.y;
+    return ret;
+}
+
+__device__ static inline float hipCsqabsf(hipFloatComplex z){
+    return z.x * z.x + z.y * z.y;
+}
+
+__device__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
+    return make_hipFloatComplex(p.x + q.x, p.y + q.y);
+}
+
+__device__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
+    return make_hipFloatComplex(p.x - q.x, p.y - q.y);
+}
+
+__device__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
+    return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
+}
+
+__device__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
+    float sqabs = hipCsqabsf(q);
+    hipFloatComplex ret;
+    ret.x = (p.x * q.x + p.y * q.y)/sqabs;
+    ret.y = (p.y * q.x - p.x * q.y)/sqabs;
+    return ret;
+}
+
+__device__ static inline float hipCabsf(hipFloatComplex z){
+    return sqrtf(hipCsqabsf(z));
+}
+
+
+typedef struct{
+    double x;
+    double y;
+}hipDoubleComplex;
+
+__device__ static inline double hipCreal(hipDoubleComplex z){
+    return z.x;
+}
+
+__device__ static inline double hipCimag(hipDoubleComplex z){
+    return z.y;
+}
+
+__device__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
+    hipDoubleComplex z;
+    z.x = a;
+    z.y = b;
+    return z;
+}
+
+__device__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){
+    hipDoubleComplex ret;
+    ret.x = z.x;
+    ret.y = z.y;
+    return ret;
+}
+
+__device__ static inline double hipCsqabs(hipDoubleComplex z){
+    return z.x * z.x + z.y * z.y;
+}
+
+__device__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
+    return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
+}
+
+__device__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
+    return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
+}
+
+__device__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){
+    return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
+}
+
+__device__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
+    double sqabs = hipCsqabs(q);
+    hipDoubleComplex ret;
+    ret.x = (p.x * q.x + p.y * q.y)/sqabs;
+    ret.y = (p.y * q.x - p.x * q.y)/sqabs;
+    return ret;
+}
+
+__device__ static inline double hipCabs(hipDoubleComplex z){
+    return sqrtf(hipCsqabs(z));
+}
+
+typedef hipFloatComplex hipComplex;
+
+__device__ static inline hipComplex make_hipComplex(float x,
+                                            float y){
+    return make_hipFloatComplex(x, y);
+}
+
+__device__ static inline hipFloatComplex hipComplexDoubleToFloat
+(hipDoubleComplex z){
+    return make_hipFloatComplex((float)z.x, (float)z.y);
+}
+
+__device__ static inline hipDoubleComplex hipComplexFloatToDouble
+(hipFloatComplex z){
+    return make_hipDoubleComplex((double)z.x, (double)z.y);
+}
+
+__device__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
+    float real = (p.x * q.x) + r.x;
+    float imag = (q.x * p.y) + r.y;
+
+    real = -(p.y * q.y) + real;
+    imag = (p.x * q.y) + imag;
+
+    return make_hipComplex(real, imag);
+}
+
+__device__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){
+    float real = (p.x * q.x) + r.x;
+    float imag = (q.x * p.y) + r.y;
+
+    real = -(p.y * q.y) + real;
+    imag = (p.x * q.y) + imag;
+
+    return make_hipDoubleComplex(real, imag);
+}
+
+#endif
@@ -160,12 +160,17 @@ class ihipCtx_t;
 #endif


+// Just initialize the HIP runtime, but don't log any trace information.
+#define HIP_INIT()\
+	std::call_once(hip_initialized, ihipInit);\
+    ihipCtxStackUpdate();
+

 // This macro should be called at the beginning of every HIP API.
 // It initialies the hip runtime (exactly once), and
 // generate trace string that can be output to stderr or to ATP file.
 #define HIP_INIT_API(...) \
-	std::call_once(hip_initialized, ihipInit);\
+    HIP_INIT()\
    API_TRACE(__VA_ARGS__);

 #define ihipLogStatus(hipStatus) \
@@ -396,6 +401,22 @@ public:
 typedef ihipStreamCriticalBase_t<StreamMutex> ihipStreamCritical_t;
 typedef LockedAccessor<ihipStreamCritical_t> LockedAccessor_StreamCrit_t;

+class ihipModule_t{
+public:
+  hsa_executable_t executable;
+  hsa_code_object_t object;
+  std::string fileName;
+  void *ptr;
+  size_t size;
+};
+
+
+class ihipFunction_t{
+public:
+  hsa_executable_symbol_t kernel_symbol;
+  uint64_t kernel;
+};
+
 // Internal stream structure.
 class ihipStream_t {
 public:
@@ -404,8 +425,9 @@ typedef uint64_t SeqNum_t ;
    ~ihipStream_t();

    // kind is hipMemcpyKind
-    void copySync (LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind);
-    void locked_copySync (void* dst, const void* src, size_t sizeBytes, unsigned kind);
+    void copySync (LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn = true);
+    void locked_copySync (void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn = true);
+

    void copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind);

@@ -423,7 +445,7 @@ typedef uint64_t SeqNum_t ;
    // Use this if we already have the stream critical data mutex:
    void                 wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty=false);

-
+    void launchModuleKernel(hsa_signal_t signal, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, uint32_t sharedMemBytes, void *kernarg, size_t kernSize, uint64_t kernel);

    // Non-threadsafe accessors - must be protected by high-level stream lock with accessor passed to function.
    SIGSEQNUM            lastCopySeqId (LockedAccessor_StreamCrit_t &crit) const { return crit->_last_copy_signal ? crit->_last_copy_signal->_sigId : 0; };
@@ -443,6 +465,7 @@ public:
    hc::accelerator_view        _av;
    unsigned                    _flags;

+
 private:
    void     enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal, ihipSignal_t *completionSignal);
    void     waitCopy(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *signal);
@@ -648,6 +671,8 @@ extern void ihipInit();
 extern const char *ihipErrorString(hipError_t);
 extern ihipCtx_t    *ihipGetTlsDefaultCtx();
 extern void          ihipSetTlsDefaultCtx(ihipCtx_t *ctx);
+extern hipError_t    ihipSynchronize(void);
+extern hipError_t    ihipCtxStackUpdate();

 extern ihipDevice_t *ihipGetDevice(int);
 ihipCtx_t * ihipGetPrimaryCtx(unsigned deviceIndex);
@@ -568,16 +568,66 @@ __device__ void  __threadfence_system(void);
 #define hipGridDim_y   (hc_get_num_groups(1))
 #define hipGridDim_z   (hc_get_num_groups(2))

+// loop unrolling
+__device__ static inline void* memcpy(void* dst, void* src, size_t size)
+{
+    uint64_t i = 0;
+    uint64_t totalLength = size/sizeof(uint32_t);
+    for(i=hipThreadIdx_x+hipBlockIdx_x*hipBlockDim_x;
+                  i<(totalLength/4);
+                  i = i + hipBlockDim_x * hipGridDim_x)
+    {
+        ((uint32_t*)dst)[4*i] = ((uint32_t*)src)[4*i];
+        ((uint32_t*)dst)[4*i+1] = ((uint32_t*)src)[4*i+1];
+        ((uint32_t*)dst)[4*i+2] = ((uint32_t*)src)[4*i+2];
+        ((uint32_t*)dst)[4*i+3] = ((uint32_t*)src)[4*i+3];
+    }
+    if(4*i < totalLength){
+        ((uint32_t*)dst)[4*i] = ((uint32_t*)src)[4*i];
+        ((uint32_t*)dst)[4*i+1] = ((uint32_t*)src)[4*i+1];
+        ((uint32_t*)dst)[4*i+2] = ((uint32_t*)src)[4*i+2];
+        ((uint32_t*)dst)[4*i+3] = ((uint32_t*)src)[4*i+3];
+
+    }
+    return nullptr;
+}
+
+__device__ static inline void* memset(void* ptr, uint8_t val, size_t size)
+{
+    uint32_t _val = 0;
+    _val = (val | val << 8 | val << 16 | val << 24);
+    uint64_t totalLength = size/sizeof(uint32_t);
+    uint64_t i = 0;
+    for(i=hipThreadIdx_x+hipBlockIdx_x*hipBlockDim_x;
+                  i<(totalLength/4);
+                  i = i + hipBlockDim_x * hipGridDim_x)
+    {
+        ((uint32_t*)ptr)[4*i] = _val;
+        ((uint32_t*)ptr)[4*i+1] = _val;
+        ((uint32_t*)ptr)[4*i+2] = _val;
+        ((uint32_t*)ptr)[4*i+3] = _val;
+    }
+    if(4*i < totalLength){
+        ((uint32_t*)ptr)[4*i] = _val;
+        ((uint32_t*)ptr)[4*i+1] = _val;
+        ((uint32_t*)ptr)[4*i+2] = _val;
+        ((uint32_t*)ptr)[4*i+3] = _val;
+
+    }
+    return nullptr;
+}
+
 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)

 #define HIP_KERNEL_NAME(...) __VA_ARGS__

 #ifdef __HCC_CPP__
-hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp);
-hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp);
-hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp);
-hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp);
-void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp);
+extern void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream);
+extern void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp);

 // TODO - move to common header file.
 #define KNRM  "\x1B[0m"
@@ -589,10 +639,9 @@ do {\
  grid_launch_parm lp;\
  lp.dynamic_group_mem_bytes = _groupMemBytes; \
  hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp)); \
-    if (HIP_TRACE_API) {\
-        fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:(%d,%d,%d) groupDim:(%d,%d,%d) groupMem:+%d stream=%p\n" KNRM, \
-                #_kernelName, lp.grid_dim.x, lp.grid_dim.y, lp.grid_dim.z, lp.group_dim.x, lp.group_dim.y, lp.group_dim.z, lp.dynamic_group_mem_bytes, (void*)(_stream));\
-    }\
+  if (HIP_TRACE_API) {\
+      ihipPrintKernelLaunch(#_kernelName, &lp, _stream); \
+  }\
  _kernelName (lp, ##__VA_ARGS__);\
  ihipPostLaunchKernel(trueStream, lp);\
 } while(0)
@@ -653,4 +702,6 @@ do {\
 */


+
+
 #endif
@@ -52,13 +52,13 @@ typedef struct ihipDevice_t *hipDevice_t;

 typedef struct ihipStream_t *hipStream_t;

-typedef uint64_t hipFunction;
+typedef struct ihipModule_t *hipModule_t;

-typedef uint64_t hipModule;
+typedef struct ihipFunction_t *hipFunction_t;

-typedef struct hipEvent_t {
-    struct ihipEvent_t *_handle;
-} hipEvent_t;
+typedef void* hipDeviceptr_t;
+
+typedef struct ihipEvent_t *hipEvent_t;


 /**
@@ -844,6 +844,14 @@ hipError_t hipHostFree(void* ptr);
 */
 hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);

+hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes);
+
+hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes);
+
+hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes);
+
+hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes);
+

 /**
 *  @brief Copies @p sizeBytes bytes from the memory area pointed to by @p src to the memory area pointed to by @p offset bytes from the start of symbol @p symbol.
@@ -1054,32 +1062,173 @@ hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int src
 hipError_t hipInit(unsigned int flags) ;


+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ *  @defgroup Context Management
+ *  @{
+ */

-// TODO-ctx
+/**
+ * @brief Create a context and set it as current/ default context
+ *
+ * @param [out] ctx
+ * @param [in] flags
+ * @param [in] associated device handle
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ */
 hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags,  hipDevice_t device);

 hipError_t hipCtxDestroy(hipCtx_t ctx);

+/**
+ * @brief Pop the current/default context and return the popped context.
+ *
+ * @param [out] ctx
+ *
+ * @returns #hipSuccess
+ */
+
 hipError_t hipCtxPopCurrent(hipCtx_t* ctx);

+/**
+ * @brief Push the context to be set as current/ default context
+ *
+ * @param [in] ctx
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ */
+
 hipError_t hipCtxPushCurrent(hipCtx_t ctx);

+/**
+ * @brief Set the passed context as current/default
+ *
+ * @param [in] ctx
+ *
+ * @returns #hipSuccess
+ */
+
 hipError_t hipCtxSetCurrent(hipCtx_t ctx);

+/**
+ * @brief Get the handle of the current/ default context
+ *
+ * @param [out] ctx
+ *
+ * @returns #hipSuccess
+ */
+
 hipError_t hipCtxGetCurrent(hipCtx_t* ctx);

+/**
+ * @brief Get the handle of the device associated with current/default context
+ *
+ * @param [out] device
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ */
+
 hipError_t hipCtxGetDevice(hipDevice_t *device);

+/**
+ * @brief Returns the approximate HIP api version.
+ *
+ * @warning The HIP feature set does not correspond to an exact CUDA SDK api revision.
+ * This function always set *apiVersion to 4 as an approximation though HIP supports
+ * some features which were introduced in later CUDA SDK revisions.
+ * HIP apps code should not rely on the api revision number here and should
+ * use arch feature flags to test device capabilities or conditional compilation.
+ *
+ */
 hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion);

+/**
+ * @brief Set Cache configuration for a specific function
+ *
+ * Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache.  This hint is ignored on those architectures.
+ *
+ */
 hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig );

+/**
+ * @brief Set L1/Shared cache partition.
+ *
+ * Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache.  This hint is ignored on those architectures.
+ *
+ */
 hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig );

+/**
+ * @brief Set Shared memory bank configuration.
+ *
+ * Note: AMD devices and recent Nvidia GPUS do not support shared cache banking, and the hint is ignored on those architectures.
+ *
+ */
 hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config );

+/**
+ * @brief Get Shared memory bank configuration.
+ *
+ * Note: AMD devices and recent Nvidia GPUS do not support shared cache banking, and the hint is ignored on those architectures.
+ *
+ */
 hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig );

+/**
+ * @brief Blocks until the default context has completed all preceding requested tasks.
+ *
+ * This function waits for all streams on the default context to complete execution, and then returns.
+ *
+ * @returns #hipSuccess.
+*/
+hipError_t hipCtxSynchronize ( void );
+
+/**
+ * @brief Get flags used for creating current/default context.
+ *
+ * @param [out] flags
+ *
+ * @returns #hipSuccess.
+*/
+
+hipError_t hipCtxGetFlags ( unsigned int* flags );
+
+/**
+ * @brief Enables direct access to memory allocations in a peer context.
+ *
+ * Memory which already allocated on peer device will be mapped into the address space of the current device.  In addition, all
+ * future memory allocations on peerDeviceId will be mapped into the address space of the current device when the memory is allocated.
+ * The peer memory remains accessible from the current device until a call to hipDeviceDisablePeerAccess or hipDeviceReset.
+ *
+ *
+ * @param [in] peerCtx
+ * @param [in] flags
+ *
+ * Returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue,
+ * @returns #hipErrorPeerAccessAlreadyEnabled if peer access is already enabled for this device.
+ * @warning PeerToPeer support is experimental.
+ */
+hipError_t  hipCtxEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags);
+
+/**
+ * @brief Disable direct access from current context's virtual address space to memory allocations physically located on a peer context.Disables direct access to memory allocations in a peer context and unregisters any registered allocations.
+ *
+ * Returns hipErrorPeerAccessNotEnabled if direct access to memory on peerDevice has not yet been enabled from the current device.
+ *
+ * @param [in] peerCtx
+ *
+ * @returns #hipSuccess, #hipErrorPeerAccessNotEnabled
+ * @warning PeerToPeer support is experimental.
+ */
+hipError_t  hipCtxDisablePeerAccess (hipCtx_t peerCtx);
+// doxygen end Context Management
+/**
+ * @}
+ */
+
+
 // TODO-ctx
 /**
 * @return hipSuccess, hipErrorInvalidDevice
@@ -1099,12 +1248,84 @@ hipError_t hipDeviceGetFromId(hipDevice_t *device, int deviceId);
 */
 hipError_t hipDriverGetVersion(int *driverVersion) ;

+/**
+ * @brief Loads code object from file into a hipModule_t
+ *
+ * @param [in] fname
+ * @param [out] module
+ * 
+ * @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidContext, hipErrorFileNotFound, hipErrorOutOfMemory, hipErrorSharedObjectInitFailed, hipErrorNotInitialized
+ *
+ * 
+ */
+hipError_t hipModuleLoad(hipModule_t *module, const char *fname);

-hipError_t hipModuleLoad(hipModule *module, const char *fname);
+/**
+ * @brief Freeing the module
+ *
+ * @param [in] module
+ *
+ * @returns hipSuccess, hipInvalidValue
+ * module is freed and the code objects associated with it are destroyed
+ * 
+ */

-hipError_t hipModuleGetFunction(hipFunction *function, hipModule module, const char *kname);
+hipError_t hipModuleUnload(hipModule_t module);

-hipError_t hipLaunchModuleKernel(hipFunction f,
+/**
+ * @brief Function with kname will be extracted present in module
+ *
+ * @param [in] module
+ * @param [in] kname
+ * @param [out] function
+ *
+ * @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidContext, hipErrorNotInitialized, hipErrorNotFound, 
+ */
+hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, const char *kname);
+
+/**
+ * @brief returns device memory pointer and size of the kernel present in the module with symbol - name
+ *
+ * @param [in] moodule
+ * @param [in] name
+ * @param [out] dptr
+ * @param [out[ bytes
+ *
+ * @returns hipSuccess, hipErrorInvalidValue, hipErrorNotInitialized
+ */
+hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name);
+
+/**
+ * @brief builds module from code object which resides in host memory. And image is pointer to that location.
+ *
+ * @param [in] image
+ * @param [out] module
+ *
+ * @returns hipSuccess, hipErrorNotInitialized, hipErrorOutOfMemory, hipErrorNotInitialized
+ */
+hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
+
+/**
+ * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kerneelparams or extra
+ *
+ * @param [in[ f
+ * @param [in] gridDimX
+ * @param [in] gridDimY
+ * @param [in] gridDimZ
+ * @param [in] blockDimX
+ * @param [in] blockDimY
+ * @param [in] blockDimZ
+ * @param [in] sharedMemBytes
+ * @param [in] stream
+ * @param [in] kernelParams
+ * @param [in] extraa
+ *
+ * The function takes the above arguments and run the kernel in hipFunction_t f.  with launch parameters specified in gridDimX, gridDimY, gridDimZ,  blockDimX, blockDimY and blockDimmZ. The amount of shared memory is specificed and can be used with HIP_DYNAMIC_SHARED. The arguemt extra is used to pass in the arguments for the kernel.
+ * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ *
+ * @warning kernellParams argument is not yet implemented in HIP. Please use extra instead. Please refer to hip_porting_driver_api.md for sample usage.
+ */
+hipError_t hipModuleLaunchKernel(hipFunction_t f,
                              unsigned int gridDimX,
                              unsigned int gridDimY,
                              unsigned int gridDimZ,
@@ -1114,7 +1335,7 @@ hipError_t hipLaunchModuleKernel(hipFunction f,
                              unsigned int sharedMemBytes,
                              hipStream_t stream,
                              void **kernelParams,
-                              void **extra) __attribute__((deprecated("kernelParams is not fully supported, use extra instead"))) ;
+                              void **extra) ;

 // doxygen end Version Management
 /**
@@ -0,0 +1,31 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip/hip_common.h>
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined (__HIP_PLATFORM_NVCC__)
+#include <hip/hcc_detail/hipComplex.h>
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
+#include <hip/nvcc_detail/hipComplex.h>
+#else
+#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
+
@@ -0,0 +1,108 @@
+#ifndef HIPCOMPLEX_H
+#define HIPCOMPLEX_H
+
+#include"cuComplex.h"
+
+typedef cuFloatComplex hipFloatComplex;
+
+__device__ __host__ static inline float hipCrealf(hipFloatComplex z){
+    return cuCrealf(z);
+}
+
+__device__ __host__ static inline float hipCimagf(hipFloatComplex z){
+    return cuCimagf(z);
+}
+
+__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){
+    return make_cuFloatComplex(a, b);
+}
+
+__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z){
+    return cuConjf(z);
+}
+
+__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z){
+    return cuCabsf(z) * cuCabsf(z);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){
+    return cuCaddf(p, q);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){
+    return cuCsubf(p, q);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){
+    return cuCmulf(p, q);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){
+    return cuCdivf(p, q);
+}
+
+__device__ __host__ static inline float hipCabsf(hipFloatComplex z){
+    return cuCabsf(p, q);
+}
+
+typedef cuDoubleComplex hipDoubleComplex;
+
+__device__ __host__ static inline double hipCreal(hipDoubleComplex z){
+    return cuCreal(z);
+}
+
+__device__ __host__ static inline double hipCimag(hipDoubleComplex z){
+    return cuCimag(z);
+}
+
+__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){
+    return make_cuDoubleComplex(a, b);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){
+    return cuConj(z);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCsqabs(hipDoubleComplex z){
+    return cuCabs(z) * cuCabs(z);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){
+    return cuCadd(p, q);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){
+    return cuCsub(p, q);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){
+    return cuCdiv(p, q);
+}
+
+__device__ __host__ static inline double hipCabs(hipDoubleComplex z){
+    return cuCabs(z);
+}
+
+typedef cuFloatComplex hipComplex;
+
+__device__ __host__ static inline hipComplex make_Complex(float x, float y){
+    return make_cuComplex(x, y);
+}
+
+__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z){
+    return cuComplexDoubleToFloat(z);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z){
+    return cuComplexFloatToDouble(z);
+}
+
+__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){
+    return cuCfmaf(p, q, r);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCfma(hipComplex p, hipComplex q, hipComplex r){
+    return cuCfma(p, q, r);
+}
+
+#endif
@@ -22,7 +22,7 @@ THE SOFTWARE.
 #pragma once

 #include <cuda_runtime_api.h>
-
+#include <cuda.h>

 #ifdef __cplusplus
 extern "C" {
@@ -58,8 +58,20 @@ hipMemcpyHostToHost
 #define hipHostRegisterPortable cudaHostRegisterPortable
 #define hipHostRegisterMapped cudaHostRegisterMapped

+#define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER
+#define HIP_LAUNCH_PARAM_BUFFER_SIZE     CU_LAUNCH_PARAM_BUFFER_SIZE
+#define HIP_LAUNCH_PARAM_END            CU_LAUNCH_PARAM_END
+
 typedef cudaEvent_t hipEvent_t;
 typedef cudaStream_t hipStream_t;
+typedef CUcontext hipCtx_t;
+typedef CUsharedconfig hipSharedMemConfig;
+typedef CUfunc_cache hipFuncCache;
+typedef CUdevice hipDevice_t;
+typedef CUmodule hipModule_t;
+typedef CUfunction hipFunction_t;
+typedef CUdeviceptr hipDeviceptr_t;
+
 //typedef cudaChannelFormatDesc hipChannelFormatDesc;
 #define hipChannelFormatDesc cudaChannelFormatDesc

@@ -85,6 +97,20 @@ switch(cuError) {
 };
 }

+inline static hipError_t hipCUResultTohipError(CUresult cuError) { //TODO Populate further
+switch(cuError) {
+    case CUDA_SUCCESS                            : return hipSuccess;
+    case CUDA_ERROR_OUT_OF_MEMORY                : return hipErrorMemoryAllocation            ;
+    case CUDA_ERROR_INVALID_VALUE                : return hipErrorInvalidValue                ;
+    case CUDA_ERROR_INVALID_DEVICE               : return hipErrorInvalidDevice               ;
+    case CUDA_ERROR_DEINITIALIZED                : return hipErrorDeinitialized               ;
+    case CUDA_ERROR_NO_DEVICE                    : return hipErrorNoDevice                    ;
+    case CUDA_ERROR_INVALID_CONTEXT              : return hipErrorInvalidContext              ;
+    case CUDA_ERROR_NOT_INITIALIZED              : return hipErrorNotInitialized              ;
+    default                                      : return hipErrorUnknown;  // Note - translated error.
+};
+}
+
 // TODO   match the error enum names of hip and cuda
 inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
 switch(hError) {
@@ -124,6 +150,11 @@ default:
 }
 }

+inline static hipError_t hipInit(unsigned int flags)
+{
+    return hipCUResultTohipError(cuInit(flags));
+}
+
 inline static hipError_t hipDeviceReset() {
    return hipCUDAErrorTohipError(cudaDeviceReset());
 }
@@ -182,6 +213,25 @@ inline static hipError_t hipHostFree(void* ptr)  {
 inline static hipError_t hipSetDevice(int device) {
    return hipCUDAErrorTohipError(cudaSetDevice(device));
 }
+
+inline static hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, 
+                  void* src, size_t size)
+{
+    return hipCUResultTohipError(cuMemcpyHtoD(dst, src, size));
+}
+
+inline static hipError_t hipMemcpyDtoH(void* dst, 
+                  hipDeviceptr_t src, size_t size)
+{
+    return hipCUResultTohipError(cuMemcpyDtoH(dst, src, size));
+}
+
+inline static hipError_t hipMemcpyDtoD(hipDeviceptr_t dst,
+            hipDeviceptr_t src, size_t size)
+{
+    return hipCUResultTohipError(cuMemcpyDtoD(dst, src, size));
+}
+
 inline static hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind copyKind) {
  return hipCUDAErrorTohipError(cudaMemcpy(dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind)));
 }
@@ -347,20 +397,6 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
    return hipCUDAErrorTohipError(cerror);
 }

-template<class T>
-inline static hipError_t hipOccupancyMaxPotentialBlockSize(
-        int *minGridSize,
-        int *blockSize,
-        T func,
-        size_t dynamicSMemSize = 0,
-        int blockSizeLimit = 0,
-        unsigned int flags = 0
-        ){
-    cudaError_t cerror;
-    cerror =  cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func, dynamicSMemSize, blockSizeLimit, flags);
-    return hipCUDAErrorTohipError(cerror);
-}
-
 inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
        int *numBlocks,
        const void* func,
@@ -458,7 +494,6 @@ inline static hipError_t hipDriverGetVersion(int *driverVersion)
 	return hipCUDAErrorTohipError(err);
 }

-
 inline static hipError_t hipDeviceCanAccessPeer ( int* canAccessPeer, int  device, int  peerDevice )
 {
    return hipCUDAErrorTohipError(cudaDeviceCanAccessPeer(canAccessPeer, device, peerDevice));
@@ -474,6 +509,16 @@ inline static hipError_t  hipDeviceEnablePeerAccess ( int  peerDevice, unsigned
    return hipCUDAErrorTohipError(cudaDeviceEnablePeerAccess ( peerDevice, flags ));
 }

+inline static hipError_t  hipCtxDisablePeerAccess ( hipCtx_t peerCtx )
+{
+    return hipCUResultTohipError(cuCtxDisablePeerAccess ( peerCtx ));
+}
+
+inline static hipError_t  hipCtxEnablePeerAccess ( hipCtx_t peerCtx, unsigned int  flags )
+{
+    return hipCUResultTohipError(cuCtxEnablePeerAccess ( peerCtx, flags ));
+}
+
 inline static hipError_t hipMemcpyPeer ( void* dst, int  dstDevice, const void* src, int  srcDevice, size_t count )
 {
    return hipCUDAErrorTohipError(cudaMemcpyPeer ( dst, dstDevice, src, srcDevice, count ));
@@ -499,12 +544,145 @@ inline static hipError_t hipEventQuery(hipEvent_t event)
 	return hipCUDAErrorTohipError(cudaEventQuery(event));
 }

+inline static hipError_t  hipCtxCreate(hipCtx_t *ctx, unsigned int flags,  hipDevice_t device)
+{
+    return hipCUResultTohipError(cuCtxCreate ( ctx,flags,device ));
+}
+
+inline static hipError_t  hipCtxDestroy(hipCtx_t ctx)
+{
+    return hipCUResultTohipError(cuCtxDestroy ( ctx ));
+}
+
+inline static hipError_t  hipCtxPopCurrent(hipCtx_t* ctx)
+{
+    return hipCUResultTohipError(cuCtxPopCurrent ( ctx ));
+}
+
+inline static hipError_t  hipCtxPushCurrent(hipCtx_t ctx)
+{
+    return hipCUResultTohipError(cuCtxPushCurrent ( ctx ));
+}
+
+inline static hipError_t  hipCtxSetCurrent(hipCtx_t ctx)
+{
+    return hipCUResultTohipError(cuCtxSetCurrent ( ctx ));
+}
+
+inline static hipError_t  hipCtxGetCurrent(hipCtx_t* ctx)
+{
+    return hipCUResultTohipError(cuCtxGetCurrent ( ctx ));
+}
+
+inline static hipError_t  hipCtxGetDevice(hipDevice_t *device)
+{
+    return hipCUResultTohipError(cuCtxGetDevice ( device ));
+}
+
+inline static hipError_t  hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion)
+{
+    return hipCUResultTohipError(cuCtxGetApiVersion ( ctx,(unsigned int*)apiVersion ));
+}
+
+inline static hipError_t  hipCtxGetCacheConfig ( hipFuncCache *cacheConfig )
+{
+    return hipCUResultTohipError(cuCtxGetCacheConfig ( cacheConfig ));
+}
+
+inline static hipError_t  hipCtxSetCacheConfig ( hipFuncCache cacheConfig )
+{
+    return hipCUResultTohipError(cuCtxSetCacheConfig ( cacheConfig ));
+}
+
+inline static hipError_t  hipCtxSetSharedMemConfig ( hipSharedMemConfig config )
+{
+    return hipCUResultTohipError(cuCtxSetSharedMemConfig ( config ));
+}
+
+inline static hipError_t  hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
+{
+    return hipCUResultTohipError(cuCtxGetSharedMemConfig ( pConfig ));
+}
+
+inline static hipError_t  hipCtxSynchronize ( void )
+{
+    return hipCUResultTohipError(cuCtxSynchronize ( ));
+}
+
+inline static hipError_t  hipCtxGetFlags ( unsigned int* flags )
+{
+    return hipCUResultTohipError(cuCtxGetFlags ( flags ));
+}
+
+inline static hipError_t hipCtxDetach(hipCtx_t ctx)
+{
+    return hipCUResultTohipError(cuCtxDetach(ctx));
+}
+
+inline static hipError_t hipDeviceGet(hipDevice_t *device, int ordinal)
+{
+    return hipCUResultTohipError(cuDeviceGet(device, ordinal));
+}
+
+inline static hipError_t hipModuleLoad(hipModule_t *module, const char* fname)
+{
+    return hipCUResultTohipError(cuModuleLoad(module, fname));
+}
+
+inline static hipError_t hipModuleUnload(hipModule_t hmod)
+{
+    return hipCUResultTohipError(cuModuleUnload(hmod));
+}
+
+inline static hipError_t hipModuleGetFunction(hipFunction_t *function,
+                         hipModule_t module, const char *kname)
+{
+    return hipCUResultTohipError(cuModuleGetFunction(function, module, kname));
+}
+
+inline static hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
+                         hipModule_t hmod, const char* name)
+{
+    return hipCUResultTohipError(cuModuleGetGlobal(dptr, bytes, hmod, name));
+}
+
+inline static hipError_t hipModuleLoadData(hipModule_t *module, const void *image)
+{
+    return hipCUResultTohipError(cuModuleLoadData(module, image));
+}
+
+inline static hipError_t hipModuleLaunchKernel(hipFunction_t f,
+      unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
+      unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
+      unsigned int sharedMemBytes, hipStream_t stream,
+      void **kernelParams, void **extra)
+{
+    return hipCUResultTohipError(cuLaunchKernel(f, 
+                    gridDimX, gridDimY, gridDimZ,
+                    blockDimX, blockDimY, blockDimZ,
+                    sharedMemBytes, stream, kernelParams, extra));
+}
+
 #ifdef __cplusplus
 }
 #endif

 #ifdef __CUDACC__

+template<class T>
+inline static hipError_t hipOccupancyMaxPotentialBlockSize(
+        int *minGridSize,
+        int *blockSize,
+        T func,
+        size_t dynamicSMemSize = 0,
+        int blockSizeLimit = 0,
+        unsigned int flags = 0
+        ){
+    cudaError_t cerror;
+    cerror =  cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func, dynamicSMemSize, blockSizeLimit, flags);
+    return hipCUDAErrorTohipError(cerror);
+}
+
 template <class T, int dim, enum cudaTextureReadMode readMode>
 inline static hipError_t  hipBindTexture(size_t *offset,
 			                             const struct texture<T, dim, readMode> &tex,
@@ -0,0 +1,21 @@
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+	HIP_PATH=../../..
+endif
+HIPCC=$(HIP_PATH)/bin/hipcc
+HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler)
+
+ifeq (${HIP_PLATFORM}, hcc)
+	GENCODEOBJECT_FLAGS=--target-isa-fiji
+endif
+
+all: vcpy_isa.compile runKernel.hip.out
+
+vcpy_isa.compile: vcpy_isa.cpp
+	$(HIPCC) --gencodeobject $(GENCODEOBJECT_FLAGS) vcpy_isa.cpp -o vcpy_isa.co
+
+runKernel.hip.out: runKernel.cpp
+	$(HIPCC) runKernel.cpp -o runKernel.hip.out
+
+clean:
+	rm -f *.co *.out
@@ -0,0 +1,105 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<hip_runtime.h>
+#include<hip_runtime_api.h>
+#include<iostream>
+#include<fstream>
+#include<vector>
+
+#define LEN 64
+#define SIZE LEN<<2
+
+#ifdef __HIP_PLATFORM_HCC__ 
+#define fileName "vcpy_isa.co"
+#define kernel_name "ZN12_GLOBAL__N_146_Z11hello_world16grid_launch_parmPfS0__functor19__cxxamp_trampolineEiiiiiiPKfPf"
+#endif
+
+#ifdef __HIP_PLATFORM_NVCC__
+#define fileName "vcpy_isa.ptx"
+#define kernel_name "hello_world"
+#endif
+
+int main(){
+    float *A, *B;
+    hipDeviceptr_t Ad, Bd;
+    A = new float[LEN];
+    B = new float[LEN];
+
+    for(uint32_t i=0;i<LEN;i++){
+        A[i] = i*1.0f;
+        B[i] = 0.0f;
+    }
+
+
+#ifdef __HIP_PLATFORM_NVCC__
+  	hipInit(0);
+	  hipDevice_t device;
+	  hipCtx_t context;
+	  hipDeviceGet(&device, 0);
+	  hipCtxCreate(&context, 0, device);
+#endif
+
+    hipMalloc((void**)&Ad, SIZE);
+    hipMalloc((void**)&Bd, SIZE);
+
+    hipMemcpyHtoD(Ad, A, SIZE);
+    hipMemcpyHtoD(Bd, B, SIZE);
+    hipModule_t Module;
+    hipFunction_t Function;
+    hipModuleLoad(&Module, fileName);
+    hipModuleGetFunction(&Function, Module, kernel_name);
+
+  uint32_t len = LEN;
+  uint32_t one = 1;
+
+    std::vector<void*>argBuffer(5);
+    uint32_t *ptr32_t = (uint32_t*)&argBuffer[0];
+    memcpy(ptr32_t + 0, &one, sizeof(uint32_t));
+    memcpy(ptr32_t + 1, &one, sizeof(uint32_t));
+    memcpy(ptr32_t + 2, &one, sizeof(uint32_t));
+    memcpy(ptr32_t + 3, &len, sizeof(uint32_t));
+    memcpy(ptr32_t + 4, &one, sizeof(uint32_t));
+    memcpy(ptr32_t + 5, &one, sizeof(uint32_t));
+    memcpy(&argBuffer[3], &Ad, sizeof(void*));
+    memcpy(&argBuffer[4], &Bd, sizeof(void*));
+
+
+
+    size_t size = argBuffer.size()*sizeof(void*);
+
+    void *config[] = {
+      HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
+      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+      HIP_LAUNCH_PARAM_END
+    };
+
+    hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config); 
+
+    hipMemcpyDtoH(B, Bd, SIZE);
+    for(uint32_t i=LEN-4;i<LEN;i++){
+        std::cout<<A[i]<<" - "<<B[i]<<std::endl;
+    }
+
+#ifdef __HIP_PLATFORM_NVCC__
+	  hipCtxDetach(context);
+#endif 
+
+    return 0;
+}
@@ -0,0 +1,9 @@
+#include<hip_runtime.h>
+
+__global__ void hello_world(hipLaunchParm lp, float *a, float *b)
+{
+    int tx = hipThreadIdx_x;
+    b[tx] = a[tx];
+}
+
+int main(){}
@@ -0,0 +1,6 @@
+
+extern "C" __global__ void hello_world(float *a, float *b)
+{
+	int tx = threadIdx.x;
+	b[tx] = a[tx];
+}
@@ -0,0 +1,38 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-19856038
+// Cuda compilation tools, release 7.5, V7.5.17
+// Based on LLVM 3.4svn
+//
+
+.version 4.3
+.target sm_20
+.address_size 64
+
+	// .globl	hello_world
+
+.visible .entry hello_world(
+	.param .u64 hello_world_param_0,
+	.param .u64 hello_world_param_1
+)
+{
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<2>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [hello_world_param_0];
+	ld.param.u64 	%rd2, [hello_world_param_1];
+	cvta.to.global.u64 	%rd3, %rd2;
+	cvta.to.global.u64 	%rd4, %rd1;
+	mov.u32 	%r1, %tid.x;
+	mul.wide.s32 	%rd5, %r1, 4;
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.f32 	%f1, [%rd6];
+	add.s64 	%rd7, %rd3, %rd5;
+	st.global.f32 	[%rd7], %f1;
+	ret;
+}
+
+
@@ -29,7 +29,22 @@ THE SOFTWARE.
 // Stack of contexts
 thread_local std::stack<ihipCtx_t *>  tls_ctxStack;

+hipError_t ihipCtxStackUpdate()
+{
+    //HIP_INIT_API();
+    hipError_t e = hipSuccess;

+    if(tls_ctxStack.empty()) {
+            tls_ctxStack.push(ihipGetTlsDefaultCtx());
+    }
+
+    return ihipLogStatus(e);
+}
+
+/**
+ * @return #hipSuccess, #hipErrorInvalidValue
+ */
+//---
 hipError_t hipInit(unsigned int flags)
 {
    HIP_INIT_API(flags);
@@ -44,7 +59,10 @@ hipError_t hipInit(unsigned int flags)
    return ihipLogStatus(e);
 }

-
+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags,  hipDevice_t device)
 {
    HIP_INIT_API(ctx, flags, device); // FIXME - review if we want to init
@@ -57,7 +75,10 @@ hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags,  hipDevice_t device)
    return ihipLogStatus(e);
 }

-
+/**
+ * @return #hipSuccess, #hipErrorInvalidDevice
+ */
+//---
 hipError_t hipDeviceGet(hipDevice_t *device, int deviceId)
 {
    HIP_INIT_API(device, deviceId); // FIXME - review if we want to init
@@ -88,20 +109,39 @@ hipError_t hipDriverGetVersion(int *driverVersion)
    return ihipLogStatus(hipSuccess);
 }

+/**
+ * @return #hipSuccess, #hipErrorInvalidValue
+ */
+//---
 hipError_t hipCtxDestroy(hipCtx_t ctx)
 {
+    HIP_INIT_API(ctx);
    hipError_t e = hipSuccess;
    ihipCtx_t* currentCtx= ihipGetTlsDefaultCtx();
-    if(currentCtx == ctx) {
-        //need to destroy the ctx associated with calling thread
-        tls_ctxStack.pop();
+    ihipCtx_t* primaryCtx= ((ihipDevice_t*)ctx->getDevice())->_primaryCtx;
+    if(primaryCtx== ctx)
+    {
+        e = hipErrorInvalidValue;
    }
-    delete ctx; //As per CUDA docs , attempting to access ctx from those threads which has this ctx as current, will result in the error HIP_ERROR_CONTEXT_IS_DESTROYED.
+    else
+    {
+        if(currentCtx == ctx) {
+            //need to destroy the ctx associated with calling thread
+            tls_ctxStack.pop();
+        }
+        delete ctx; //As per CUDA docs , attempting to access ctx from those threads which has this ctx as current, will result in the error HIP_ERROR_CONTEXT_IS_DESTROYED.
+    }
+
    return ihipLogStatus(e);
 }

+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxPopCurrent(hipCtx_t* ctx)
 {
+    HIP_INIT_API(ctx);
    hipError_t e = hipSuccess;
    ihipCtx_t* tempCtx;
    *ctx = ihipGetTlsDefaultCtx();
@@ -119,8 +159,13 @@ hipError_t hipCtxPopCurrent(hipCtx_t* ctx)
    return ihipLogStatus(e);
 }

+/**
+ * @return #hipSuccess, #hipErrorInvalidContext
+ */
+//---
 hipError_t hipCtxPushCurrent(hipCtx_t ctx)
 {
+    HIP_INIT_API(ctx);
    hipError_t e = hipSuccess;
    if(ctx != NULL) {    //TODO- is this check needed?
        ihipSetTlsDefaultCtx(ctx);
@@ -132,19 +177,30 @@ hipError_t hipCtxPushCurrent(hipCtx_t ctx)
    return ihipLogStatus(e);
 }

+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxGetCurrent(hipCtx_t* ctx)
 {
+    HIP_INIT_API(ctx);
    hipError_t e = hipSuccess;
-
-    *ctx = ihipGetTlsDefaultCtx();
-    if(*ctx == nullptr) {
-        *ctx = NULL;    //TODO - is it required? Can return nullptr?
+    if(!tls_ctxStack.empty()) {
+        *ctx= tls_ctxStack.top();
+    }
+    else {
+        *ctx = NULL;
    }
    return ihipLogStatus(e);
 }

+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxSetCurrent(hipCtx_t ctx)
 {
+    HIP_INIT_API(ctx);
    hipError_t e = hipSuccess;
    if(ctx == NULL) {
        tls_ctxStack.pop();
@@ -156,8 +212,13 @@ hipError_t hipCtxSetCurrent(hipCtx_t ctx)
    return ihipLogStatus(e);
 }

+/**
+ * @return #hipSuccess, #hipErrorInvalidContext
+ */
+//---
 hipError_t hipCtxGetDevice(hipDevice_t *device)
 {
+    HIP_INIT_API(device);
    hipError_t e = hipSuccess;

    ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
@@ -171,6 +232,10 @@ hipError_t hipCtxGetDevice(hipDevice_t *device)
    return ihipLogStatus(e);
 }

+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion)
 {
    HIP_INIT_API(apiVersion);
@@ -182,6 +247,10 @@ hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion)
    return ihipLogStatus(hipSuccess);
 }

+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig )
 {
    HIP_INIT_API(cacheConfig);
@@ -191,6 +260,10 @@ hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig )
    return ihipLogStatus(hipSuccess);
 }

+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig )
 {
    HIP_INIT_API(cacheConfig);
@@ -200,6 +273,10 @@ hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig )
    return ihipLogStatus(hipSuccess);
 }

+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config )
 {
    HIP_INIT_API(config);
@@ -209,6 +286,10 @@ hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config )
    return ihipLogStatus(hipSuccess);
 }

+/**
+ * @return #hipSuccess
+ */
+//---
 hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
 {
    HIP_INIT_API(pConfig);
@@ -216,4 +297,28 @@ hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
    *pConfig = hipSharedMemBankSizeFourByte;

    return ihipLogStatus(hipSuccess);
-}
+}
+
+/**
+ * @return #hipSuccess
+ */
+//---
+hipError_t hipCtxSynchronize ( void )
+{
+    HIP_INIT_API(1);
+    return ihipSynchronize(); //TODP Shall check validity of ctx?
+}
+
+/**
+ * @return #hipSuccess
+ */
+//---
+hipError_t hipCtxGetFlags ( unsigned int* flags )
+{
+    HIP_INIT_API(flags);
+    hipError_t e = hipSuccess;
+    ihipCtx_t* tempCtx;
+    tempCtx = ihipGetTlsDefaultCtx();
+    *flags = tempCtx->_ctxFlags;
+    return ihipLogStatus(e);
+}
@@ -160,11 +160,8 @@ hipError_t hipSetDevice(int deviceId)
 */
 hipError_t hipDeviceSynchronize(void)
 {
-    HIP_INIT_API();
-
-    ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish.
-
-    return ihipLogStatus(hipSuccess);
+    HIP_INIT_API(1);
+    return ihipSynchronize();
 }


@@ -33,13 +33,14 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags)

    // TODO - support hipEventDefault, hipEventBlockingSync, hipEventDisableTiming
    if (flags == 0) {
-        ihipEvent_t *eh = event->_handle = new ihipEvent_t();
+        ihipEvent_t *eh = new ihipEvent_t();

        eh->_state  = hipEventStatusCreated;
        eh->_stream = NULL;
        eh->_flags  = flags;
        eh->_timestamp  = 0;
        eh->_copySeqId  = 0;
+        *event = eh;
    } else {
        e = hipErrorInvalidValue;
    }
@@ -71,7 +72,7 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
 {
    HIP_INIT_API(event, stream);

-    ihipEvent_t *eh = event._handle;
+    ihipEvent_t *eh = event;
    if (eh && eh->_state != hipEventStatusUnitialized)   {
        eh->_stream = stream;

@@ -106,10 +107,10 @@ hipError_t hipEventDestroy(hipEvent_t event)
 {
    HIP_INIT_API(event);

-    event._handle->_state  = hipEventStatusUnitialized;
+    event->_state  = hipEventStatusUnitialized;

-    delete event._handle;
-    event._handle = NULL;
+    delete event;
+    event = NULL;

    // TODO - examine return additional error codes
    return ihipLogStatus(hipSuccess);
@@ -121,7 +122,7 @@ hipError_t hipEventSynchronize(hipEvent_t event)
 {
    HIP_INIT_API(event);

-    ihipEvent_t *eh = event._handle;
+    ihipEvent_t *eh = event;

    if (eh) {
        if (eh->_state == hipEventStatusUnitialized) {
@@ -150,8 +151,8 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop)
 {
    HIP_INIT_API(ms, start, stop);

-    ihipEvent_t *start_eh = start._handle;
-    ihipEvent_t *stop_eh = stop._handle;
+    ihipEvent_t *start_eh = start;
+    ihipEvent_t *stop_eh = stop;

    ihipSetTs(start);
    ihipSetTs(stop);
@@ -195,7 +196,7 @@ hipError_t hipEventQuery(hipEvent_t event)
 {
    HIP_INIT_API(event);

-    ihipEvent_t *eh = event._handle;
+    ihipEvent_t *eh = event;

    // TODO-stream - need to read state of signal here:  The event may have become ready after recording..
    // TODO-HCC - use get_hsa_signal here.
@@ -158,7 +158,12 @@ ihipCtx_t *ihipGetTlsDefaultCtx()
    return tls_defaultCtx;
 }

+hipError_t ihipSynchronize(void)
+{
+    ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish.

+    return ihipLogStatus(hipSuccess);
+}

 //=================================================================================================
 // ihipSignal_t:
@@ -482,6 +487,53 @@ int ihipStream_t::preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t
 }


+void ihipStream_t::launchModuleKernel(hsa_signal_t signal,
+                        uint32_t blockDimX,
+                        uint32_t blockDimY,
+                        uint32_t blockDimZ,
+                        uint32_t gridDimX,
+                        uint32_t gridDimY,
+                        uint32_t gridDimZ,
+                        uint32_t sharedMemBytes,
+                        void *kernarg,
+                        size_t kernSize,
+                        uint64_t kernel){
+    hsa_status_t status;
+    void *kern;
+    hsa_amd_memory_pool_t *pool = reinterpret_cast<hsa_amd_memory_pool_t*>(_av.get_hsa_kernarg_region());
+    status = hsa_amd_memory_pool_allocate(*pool, kernSize, 0, &kern);
+    status = hsa_amd_agents_allow_access(1, (hsa_agent_t*)_av.get_hsa_agent(), 0, kern);
+    memcpy(kern, kernarg, kernSize);
+    hsa_queue_t *Queue = (hsa_queue_t*)_av.get_hsa_queue();
+    const uint32_t queue_mask = Queue->size-1;
+    uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue);
+    hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]);
+
+    dispatch_packet->completion_signal = signal;
+    dispatch_packet->workgroup_size_x = blockDimX;
+    dispatch_packet->workgroup_size_y = blockDimY;
+    dispatch_packet->workgroup_size_z = blockDimZ;
+    dispatch_packet->grid_size_x = blockDimX * gridDimX;
+    dispatch_packet->grid_size_y = blockDimY * gridDimY;
+    dispatch_packet->grid_size_z = blockDimZ * gridDimZ;
+    dispatch_packet->group_segment_size = 0;
+    dispatch_packet->private_segment_size = sharedMemBytes;
+    dispatch_packet->kernarg_address = kern;
+    dispatch_packet->kernel_object = kernel;
+    uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
+    (1 << HSA_PACKET_HEADER_BARRIER) |
+    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
+    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
+
+    uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+    uint32_t header32 = header | (setup << 16);
+
+    __atomic_store_n((uint32_t*)(dispatch_packet), header32, __ATOMIC_RELEASE);
+
+    hsa_queue_store_write_index_relaxed(Queue, packet_index + 1);
+    hsa_signal_store_relaxed(Queue->doorbell_signal, packet_index);
+}
+

 //=============================================================================
 // Recompute the peercnt and the packed _peerAgents whenever a peer is added or deleted.
@@ -1260,13 +1312,20 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream)
    }
 }

+void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream) 
+{
+    std::string streamString = ToString(stream);
+    fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:(%d,%d,%d) groupDim:(%d,%d,%d) groupMem:+%d %s\n" KNRM, \
+            kernelName, lp->grid_dim.x, lp->grid_dim.y, lp->grid_dim.z, lp->group_dim.x, lp->group_dim.y, lp->group_dim.z, 
+            lp->dynamic_group_mem_bytes, streamString.c_str());\
+}

 // TODO - data-up to data-down:
 // Called just before a kernel is launched from hipLaunchKernel.
 // Allows runtime to track some information about the stream.
 hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp)
 {
-    HIP_INIT_API(stream, grid, block, lp);
+    HIP_INIT();
    stream = ihipSyncAndResolveStream(stream);
 #if USE_GRID_LAUNCH_20
    lp->grid_dim.x = grid.x;
@@ -1439,7 +1498,7 @@ const char *ihipErrorString(hipError_t hip_error)

 void ihipSetTs(hipEvent_t e)
 {
-    ihipEvent_t *eh = e._handle;
+    ihipEvent_t *eh = e;
    if (eh->_state == hipEventStatusRecorded) {
        // already recorded, done:
        return;
@@ -1509,7 +1568,7 @@ void ihipStream_t::setAsyncCopyAgents(unsigned kind, ihipCommand_t *commandType,
 }


-void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind)
+void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn)
 {
    ihipCtx_t *ctx = this->getCtx();
    const ihipDevice_t *device = ctx->getDevice();
@@ -1528,7 +1587,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
    bool dstInDeviceMem = dstPtrInfo._isInDeviceMem;

    // Resolve default to a specific Kind so we know which algorithm to use:
-    if (kind == hipMemcpyDefault) {
+    if (kind == hipMemcpyDefault && resolveOn) {
        kind = resolveMemcpyDirection(srcTracked, dstTracked, srcInDeviceMem, dstInDeviceMem);
    };

@@ -1699,14 +1758,12 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const


 // Sync copy that acquires lock:
-void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind)
+void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn)
 {
    LockedAccessor_StreamCrit_t crit (_criticalData);
-    copySync(crit, dst, src, sizeBytes, kind);
+    copySync(crit, dst, src, sizeBytes, kind, resolveOn);
 }

-
-
 void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind)
 {
    LockedAccessor_StreamCrit_t crit(_criticalData);
@@ -446,12 +446,96 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind
        e = ex._code;
    }

+    return ihipLogStatus(e);
+}

+hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes)
+{
+    HIP_INIT_API(dst, src, sizeBytes);
+
+    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+
+    hc::completion_future marker;
+
+    hipError_t e = hipSuccess;
+
+    try {
+
+        stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyHostToDevice, false);
+    }
+    catch (ihipException ex) {
+        e = ex._code;
+    }

    return ihipLogStatus(e);
 }


+hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes)
+{
+    HIP_INIT_API(dst, src, sizeBytes);
+
+    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+
+    hc::completion_future marker;
+
+    hipError_t e = hipSuccess;
+
+    try {
+
+        stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyDeviceToHost, false);
+    }
+    catch (ihipException ex) {
+        e = ex._code;
+    }
+
+    return ihipLogStatus(e);
+}
+
+hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes)
+{
+    HIP_INIT_API(dst, src, sizeBytes);
+
+    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+
+    hc::completion_future marker;
+
+    hipError_t e = hipSuccess;
+
+    try {
+
+        stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyDeviceToDevice, false);
+    }
+    catch (ihipException ex) {
+        e = ex._code;
+    }
+
+    return ihipLogStatus(e);
+}
+
+hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes)
+{
+    HIP_INIT_API(dst, src, sizeBytes);
+
+    hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull);
+
+    hc::completion_future marker;
+
+    hipError_t e = hipSuccess;
+
+    try {
+
+        stream->locked_copySync((void*)dst, (void*)src, sizeBytes, hipMemcpyHostToHost, false);
+    }
+    catch (ihipException ex) {
+        e = ex._code;
+    }
+
+    return ihipLogStatus(e);
+}
+
+
+
 /**
 * @result #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidMemcpyDirection,
 * @result #hipErrorInvalidValue : If dst==NULL or src==NULL, or other bad argument.
@@ -786,7 +870,6 @@ hipError_t hipMemGetInfo  (size_t *free, size_t *total)
            // TODO - replace with kernel-level for reporting free memory:
            size_t deviceMemSize, hostMemSize, userMemSize;
            hc::am_memtracker_sizeinfo(device->_acc, &deviceMemSize, &hostMemSize, &userMemSize);
-            printf ("deviceMemSize=%zu\n", deviceMemSize);

            *free =  device->_props.totalGlobalMem - deviceMemSize;
        }
@@ -20,137 +20,188 @@ THE SOFTWARE.
 #include "hip_runtime.h"
 #include "hsa/hsa.h"
 #include "hsa/hsa_ext_amd.h"
+#include "hsa/amd_hsa_kernel_code.h"
 #include "hcc_detail/hip_hcc.h"
 #include "hcc_detail/trace_helper.h"
 #include <fstream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <elf.h>

 //TODO Use Pool APIs from HCC to get memory regions.

 namespace hipdrv{
-hsa_status_t findSystemRegions(hsa_region_t region, void *data){
-  hsa_region_segment_t segment_id;
-  hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);

-  if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
-    return HSA_STATUS_SUCCESS;
-  }
+    hsa_status_t findSystemRegions(hsa_region_t region, void *data){
+        hsa_region_segment_t segment_id;
+        hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);

-  hsa_region_global_flag_t flags;
-  hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
+        if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
+            return HSA_STATUS_SUCCESS;
+        }

-  hsa_region_t *reg = (hsa_region_t*)data;
+        hsa_region_global_flag_t flags;
+        hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);

-  if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){
-    *reg = region;
-  }
+        hsa_region_t *reg = (hsa_region_t*)data;

-  return HSA_STATUS_SUCCESS;
+        if(flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED){
+            *reg = region;
+        }
+
+        return HSA_STATUS_SUCCESS;
+    }
+
+}   // End namespace hipdrv
+
+uint64_t PrintSymbolSizes(const void *emi, const char *name){
+    const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi;
+    if(NULL == ehdr || EV_CURRENT != ehdr->e_version){}
+    const Elf64_Shdr * shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff);
+    for(uint16_t i=0;i<ehdr->e_shnum;++i){
+        if(shdr[i].sh_type == SHT_SYMTAB){
+            const Elf64_Sym *syms = (const Elf64_Sym*)((char*)emi + shdr[i].sh_offset);
+            assert(syms);
+            uint64_t numSyms = shdr[i].sh_size/shdr[i].sh_entsize;
+            const char* strtab = (const char*)((char*)emi + shdr[shdr[i].sh_link].sh_offset);
+            assert(strtab);
+            for(uint64_t i=0;i<numSyms;++i){
+                const char *symname = strtab + syms[i].st_name;
+                assert(symname);
+                uint64_t size = syms[i].st_size;
+                if(strcmp(name, symname) == 0){
+                    return size;
+                }
+            }
+        }
+    }
+    return 0;
 }

-hsa_status_t findKernArgRegions(hsa_region_t region, void *data){
-  hsa_region_segment_t segment_id;
-  hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment_id);
+uint64_t ElfSize(const void *emi){
+    const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi;
+    const Elf64_Shdr *shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff);

-  if(segment_id != HSA_REGION_SEGMENT_GLOBAL){
-    return HSA_STATUS_SUCCESS;
-  }
+    uint64_t max_offset = ehdr->e_shoff;
+    uint64_t total_size = max_offset + ehdr->e_shentsize * ehdr->e_shnum;

-  hsa_region_global_flag_t flags;
-  hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
-
-  hsa_region_t *reg = (hsa_region_t*)data;
-
-  if(flags & HSA_REGION_GLOBAL_FLAG_KERNARG){
-    *reg = region;
-  }
-
-  return HSA_STATUS_SUCCESS;
+    for(uint16_t i=0;i < ehdr->e_shnum;++i){
+        uint64_t cur_offset = static_cast<uint64_t>(shdr[i].sh_offset);
+        if(max_offset < cur_offset){
+            max_offset = cur_offset;
+            total_size = max_offset;
+            if(SHT_NOBITS != shdr[i].sh_type){
+                total_size += static_cast<uint64_t>(shdr[i].sh_size);
+            }
+        }
+    }
+    return total_size;
 }

-
-}
-
-
-
-hipError_t hipModuleLoad(hipModule *module, const char *fname){
+hipError_t hipModuleLoad(hipModule_t *module, const char *fname){
    HIP_INIT_API(fname);
    hipError_t ret = hipSuccess;
+    *module = new ihipModule_t;
+
    if(module == NULL){
-        ret = hipErrorInvalidValue;
+        return hipErrorInvalidValue;
    }
+
    auto ctx = ihipGetTlsDefaultCtx();
    if(ctx == nullptr){
        ret = hipErrorInvalidContext;
+
    }else{
        int deviceId = ctx->getDevice()->_deviceId;
        ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
        std::ifstream in(fname, std::ios::binary | std::ios::ate);
+
        if(!in){
            return hipErrorFileNotFound;
+
        }else{
+
+            *module = new ihipModule_t;
            size_t size = std::string::size_type(in.tellg());
            void *p = NULL;
            hsa_agent_t agent = currentDevice->_hsaAgent;
            hsa_region_t sysRegion;
            hsa_status_t status = hsa_agent_iterate_regions(agent, hipdrv::findSystemRegions, &sysRegion);
            status = hsa_memory_allocate(sysRegion, size, (void**)&p);
+
            if(status != HSA_STATUS_SUCCESS){
                return hipErrorOutOfMemory;
            }
+
            char *ptr = (char*)p;
            if(!ptr){
                return hipErrorOutOfMemory;
-                std::cout<<"Error: failed to allocate memory for code object"<<std::endl;
            }
-            hsa_code_object_t obj;
+            (*module)->ptr = p;
+            (*module)->size = size;
            in.seekg(0, std::ios::beg);
            std::copy(std::istreambuf_iterator<char>(in),
                      std::istreambuf_iterator<char>(), ptr);
-            status = hsa_code_object_deserialize(ptr, size, NULL, &obj);
+
+            status = hsa_code_object_deserialize(ptr, size, NULL, &(*module)->object);
+
            if(status != HSA_STATUS_SUCCESS){
                return hipErrorSharedObjectInitFailed;
            }
-            *module = obj.handle;
+
+            status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &(*module)->executable);
+            if(status != HSA_STATUS_SUCCESS){
+                return hipErrorNotInitialized;
+            }
        }
    }
+
    return ret;
 }

-hipError_t hipModuleGetFunction(hipFunction *func, hipModule hmod, const char *name){
-    HIP_INIT_API(name);
+hipError_t hipModuleUnload(hipModule_t hmod){
+    hipError_t ret = hipSuccess;
+    hsa_status_t status = hsa_executable_destroy(hmod->executable);
+    if(status != HSA_STATUS_SUCCESS){ret = hipErrorInvalidValue; }
+    status = hsa_code_object_destroy(hmod->object);
+    if(status != HSA_STATUS_SUCCESS){ret = hipErrorInvalidValue; }
+    delete hmod;
+    return ret;
+}
+
+hipError_t ihipModuleGetFunction(hipFunction_t *func, hipModule_t hmod, const char *name){
    auto ctx = ihipGetTlsDefaultCtx();
    hipError_t ret = hipSuccess;
-    if(name == nullptr || hmod == 0){
+
+    if(name == nullptr){
        return hipErrorInvalidValue;
    }
+
    if(ctx == nullptr){
        ret = hipErrorInvalidContext;
+
    }else{
+        *func = new ihipFunction_t;
        int deviceId = ctx->getDevice()->_deviceId;
        ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
        hsa_agent_t gpuAgent = (hsa_agent_t)currentDevice->_hsaAgent;

        hsa_status_t status;
-        hsa_executable_symbol_t kernel_symbol;
-        hsa_executable_t executable;
-        status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &executable);
+        status = hsa_executable_load_code_object(hmod->executable, gpuAgent, hmod->object, NULL);
        if(status != HSA_STATUS_SUCCESS){
            return hipErrorNotInitialized;
        }
-        hsa_code_object_t obj;
-        obj.handle = hmod;
-        status = hsa_executable_load_code_object(executable, gpuAgent, obj, NULL);
-        if(status != HSA_STATUS_SUCCESS){
-            return hipErrorNotInitialized;
-        }
-        status = hsa_executable_freeze(executable, NULL);
-        status = hsa_executable_get_symbol(executable, NULL, name, gpuAgent, 0, &kernel_symbol);
+
+        status = hsa_executable_freeze(hmod->executable, NULL);
+        status = hsa_executable_get_symbol(hmod->executable, NULL, name, gpuAgent, 0, &(*func)->kernel_symbol);
        if(status != HSA_STATUS_SUCCESS){
            return hipErrorNotFound;
        }
-        status = hsa_executable_symbol_get_info(kernel_symbol,
+
+        status = hsa_executable_symbol_get_info((*func)->kernel_symbol,
                                   HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
-                                   func);
+                                   &(*func)->kernel);
+
        if(status != HSA_STATUS_SUCCESS){
            return hipErrorNotFound;
        }
@@ -158,16 +209,24 @@ hipError_t hipModuleGetFunction(hipFunction *func, hipModule hmod, const char *n
    return ret;
 }

-hipError_t hipLaunchModuleKernel(hipFunction f,
+hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,
+                                const char *name){
+    HIP_INIT_API(name);
+    return ihipModuleGetFunction(hfunc, hmod, name);
+}
+
+hipError_t hipModuleLaunchKernel(hipFunction_t f,
            uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
            uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
            uint32_t sharedMemBytes, hipStream_t hStream,
            void **kernelParams, void **extra){
-    HIP_INIT_API(f);
+    HIP_INIT_API(f->kernel);
    auto ctx = ihipGetTlsDefaultCtx();
    hipError_t ret = hipSuccess;
+
    if(ctx == nullptr){
        ret = hipErrorInvalidDevice;
+
    }else{
        int deviceId = ctx->getDevice()->_deviceId;
        ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
@@ -189,66 +248,100 @@ hipError_t hipLaunchModuleKernel(hipFunction f,
 /*
 Kernel argument preparation.
 */
-
-        hsa_region_t kernArg;
-        hsa_status_t status = hsa_agent_iterate_regions(gpuAgent, hipdrv::findKernArgRegions, &kernArg);
-        void *kern;
-        status = hsa_memory_allocate(kernArg, kernSize, &kern);
-        if(status != HSA_STATUS_SUCCESS){
-            return hipErrorLaunchOutOfResources;
-        }
-        memcpy(kern, config[1], kernSize);
-
+        hsa_status_t status;
+        grid_launch_parm lp;
+        hStream = ihipPreLaunchKernel(hStream, 0, 0, &lp);

 /*
-Pre kernel launch 
-
-    stream = ihipSyncAndResolveStream(stream);
-    stream->lockopen_preKernelCommand();
-    hc::accelerator_view av = &stream->_av;
-    hc::completion_future cf = new hc::completion_future;
+  Create signal
 */

-        hStream = ihipSyncAndResolveStream(hStream);
-        hc::accelerator_view *av = &hStream->_av;
-        hsa_queue_t *Queue = (hsa_queue_t*)av->get_hsa_queue();
        hsa_signal_t signal;
        status = hsa_signal_create(1, 0, NULL, &signal);

 /*
-Creating the packets
+  Launch AQL packet
+*/
+        hStream->launchModuleKernel(signal, blockDimX, blockDimY, blockDimZ,
+                  gridDimX, gridDimY, gridDimZ, sharedMemBytes, config[1], kernSize, f->kernel);
+
+/*
+  Wait for signal
 */

-        const uint32_t queue_mask = Queue->size-1;
-        uint32_t packet_index = hsa_queue_load_write_index_relaxed(Queue);
-        hsa_kernel_dispatch_packet_t *dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(Queue->base_address))[packet_index & queue_mask]);
-
-        dispatch_packet->completion_signal = signal;
-        dispatch_packet->workgroup_size_x = blockDimX;
-        dispatch_packet->workgroup_size_y = blockDimY;
-        dispatch_packet->workgroup_size_z = blockDimZ;
-        dispatch_packet->grid_size_x = blockDimX * gridDimX;
-        dispatch_packet->grid_size_y = blockDimY * gridDimY;
-        dispatch_packet->grid_size_z = blockDimZ * gridDimZ;
-
-        dispatch_packet->group_segment_size = 0;
-        dispatch_packet->private_segment_size = sharedMemBytes;
-        dispatch_packet->kernarg_address = kern;
-        dispatch_packet->kernel_object = f;
-        uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
-        (1 << HSA_PACKET_HEADER_BARRIER) |
-        (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
-        (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
-
-
-        uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
-        uint32_t header32 = header | (setup << 16);
-
-        __atomic_store_n((uint32_t*)(dispatch_packet), header32, __ATOMIC_RELEASE);
-
-        hsa_queue_store_write_index_relaxed(Queue, packet_index+1);
-        hsa_signal_store_relaxed(Queue->doorbell_signal, packet_index);
        hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
+
+
+        ihipPostLaunchKernel(hStream, lp);
+                
+    }
+
+    return ret;
+}
+
+
+hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
+                              hipModule_t hmod, const char* name){
+    HIP_INIT_API(name);
+    hipError_t ret = hipSuccess;
+    if(dptr == NULL || bytes == NULL){
+        return hipErrorInvalidValue;
+    }
+    if(name == NULL || hmod == NULL){
+        return hipErrorNotInitialized;
+    }
+    else{
+        hipFunction_t func;
+        ihipModuleGetFunction(&func, hmod, name);
+        *bytes = PrintSymbolSizes(hmod->ptr, name) + sizeof(amd_kernel_code_t);
+        *dptr = reinterpret_cast<void*>(func->kernel);
+        return ret;
+    }
+}
+
+hipError_t hipModuleLoadData(hipModule_t *module, const void *image){
+    HIP_INIT_API(image);
+    hipError_t ret = hipSuccess;
+    if(image == NULL || module == NULL){
+        return hipErrorNotInitialized;
+    }else{
+        auto ctx = ihipGetTlsDefaultCtx();
+        *module = new ihipModule_t;
+        int deviceId = ctx->getDevice()->_deviceId;
+        ihipDevice_t *currentDevice = ihipGetDevice(deviceId);
+
+        void *p;
+        uint64_t size = ElfSize(image);
+        hsa_agent_t agent = currentDevice->_hsaAgent;
+        hsa_region_t sysRegion;
+        hsa_status_t status = hsa_agent_iterate_regions(agent, hipdrv::findSystemRegions, &sysRegion);
+        status = hsa_memory_allocate(sysRegion, size, (void**)&p);
+
+        if(status != HSA_STATUS_SUCCESS){
+            return hipErrorOutOfMemory;
+        }
+
+        char *ptr = (char*)p;
+        if(!ptr){
+           return hipErrorOutOfMemory;
+        }
+        (*module)->ptr = p;
+        (*module)->size = size;
+
+        memcpy(ptr, image, size);
+
+        status = hsa_code_object_deserialize(ptr, size, NULL, &(*module)->object);
+
+        if(status != HSA_STATUS_SUCCESS){
+            return hipErrorSharedObjectInitFailed;
+        }
+
+        status = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, NULL, &(*module)->executable);
+        if(status != HSA_STATUS_SUCCESS){
+            return hipErrorNotInitialized;
+        }
    }
    return ret;
 }
+
+
@@ -67,7 +67,7 @@ hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, hipCtx_t thisCtx, hipCtx_
 //---
 // Disable visibility of this device into memory allocated on peer device.
 // Remove this device from peer device peerlist.
-hipError_t hipDeviceDisablePeerAccess (hipCtx_t peerCtx)
+hipError_t ihipDisablePeerAccess (hipCtx_t peerCtx)
 {
    HIP_INIT_API(peerCtx);

@@ -109,7 +109,7 @@ hipError_t hipDeviceDisablePeerAccess (hipCtx_t peerCtx)
 //---
 // Allow the current device to see all memory allocated on peerDevice.
 // This should add this device to the peer-device peer list.
-hipError_t hipDeviceEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags)
+hipError_t ihipEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags)
 {
    HIP_INIT_API(peerCtx, flags);

@@ -175,7 +175,7 @@ hipError_t hipDeviceDisablePeerAccess (int peerDeviceId)
 {
    HIP_INIT_API(peerDeviceId);

-    return hipDeviceDisablePeerAccess(ihipGetPrimaryCtx(peerDeviceId));
+    return ihipDisablePeerAccess(ihipGetPrimaryCtx(peerDeviceId));
 }


@@ -183,7 +183,7 @@ hipError_t hipDeviceEnablePeerAccess (int peerDeviceId, unsigned int flags)
 {
    HIP_INIT_API(peerDeviceId, flags);

-    return hipDeviceEnablePeerAccess(ihipGetPrimaryCtx(peerDeviceId), flags);
+    return ihipEnablePeerAccess(ihipGetPrimaryCtx(peerDeviceId), flags);
 }


@@ -200,6 +200,16 @@ hipError_t hipMemcpyPeerAsync (void* dst, int  dstDevice, const void* src, int
    return hipMemcpyPeerAsync(dst, ihipGetPrimaryCtx(dstDevice), src, ihipGetPrimaryCtx(srcDevice), sizeBytes, stream);
 }

+hipError_t hipCtxEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags)
+{
+    HIP_INIT_API(peerCtx, flags);

+    return ihipEnablePeerAccess(peerCtx, flags);
+}

+hipError_t hipCtxDisablePeerAccess (hipCtx_t peerCtx)
+{
+    HIP_INIT_API(peerCtx);

+    return ihipDisablePeerAccess(peerCtx);
+}
@@ -0,0 +1,53 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+#include<iostream>
+#include<hip/hip_runtime.h>
+#include<hip/hip_runtime_api.h>
+#include<hip/hcc_detail/hipComplex.h>
+
+#define LEN 64
+#define SIZE 64<<2
+
+__global__  void getSqAbs(hipLaunchParm lp, float *A, float *B, float *C){
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    C[tx] = hipCsqabsf(make_hipFloatComplex(A[tx], B[tx]));
+}
+
+int main(){
+    float *A, *Ad, *B, *Bd, *C, *Cd;
+    A = new float[LEN];
+    B = new float[LEN];
+    C = new float[LEN];
+    for(uint32_t i=0;i<LEN;i++){
+        A[i] = i*1.0f;
+        B[i] = i*1.0f;
+        C[i] = i*1.0f;
+    }
+
+    hipMalloc((void**)&Ad, SIZE);
+    hipMalloc((void**)&Bd, SIZE);
+    hipMalloc((void**)&Cd, SIZE);
+    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
+    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
+    hipLaunchKernel(getSqAbs, dim3(1), dim3(LEN), 0, 0, Ad, Bd, Cd);
+    hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
+    std::cout<<A[11]<<" "<<B[11]<<" "<<C[11]<<std::endl;
+}
@@ -0,0 +1,42 @@
+#include<iostream>
+#include<hip/hip_runtime.h>
+#include<hip/hip_runtime_api.h>
+
+#define LEN 1030
+#define SIZE LEN << 2
+
+__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In, uint32_t *Vald)
+{
+    memcpy(Out, In, SIZE, Vald);
+}
+
+__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
+{
+    memset(ptr, val, size);
+}
+
+int main()
+{
+    uint32_t *A, *Ad, *B, *Bd;
+    uint32_t *Val, *Vald;
+    A = new uint32_t[LEN];
+    B = new uint32_t[LEN];
+    Val = new uint32_t;
+    *Val = 0;
+    for(int i=0;i<LEN;i++){
+        A[i] = i *1.0f;
+        B[i] = 0.0f;
+    }
+    hipMalloc((void**)&Ad, SIZE);
+    hipMalloc((void**)&Bd, SIZE);
+    hipMalloc((void**)&Vald, sizeof(uint32_t));
+    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
+    hipLaunchKernel(cpy, dim3(1), dim3(LEN/4), 0, 0, Bd, Ad, Vald);
+    hipLaunchKernel(set, dim3(1), dim3(LEN/4), 0, 0, Bd, 0x1, SIZE);
+    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
+    hipMemcpy(Val, Vald, sizeof(uint32_t), hipMemcpyDeviceToHost);
+    for(int i=LEN-16;i<LEN;i++){
+        std::cout<<A[i]<<" "<<B[i]<<std::endl;
+    }
+    std::cout<<*Val<<std::endl;
+}
@@ -0,0 +1,26 @@
+#include<iostream>
+#include<hip/hip_runtime.h>
+#include<hip/hip_runtime_api.h>
+
+#define LEN 1024
+#define SIZE LEN<<2
+
+int main(){
+  int *A, *B, *C;
+  hipDeviceptr Ad, Bd;
+  A = new int[LEN];
+  B = new int[LEN];
+  C = new int[LEN];
+  for(int i=0;i<LEN;i++){
+    A[i] = i;
+  }
+  hipMalloc((void**)&Ad, SIZE);
+  hipMalloc((void**)&Bd, SIZE);
+  hipMemcpyHtoD(Ad, A, SIZE);
+  hipMemcpyDtoD(Bd, Ad, SIZE);
+  hipMemcpyDtoH(B, Bd, SIZE);
+  hipMemcpyHtoH(C, B,SIZE);
+  for(int i=0;i<16;i++){
+    std::cout<<A[i]<<" "<<C[i]<<std::endl;
+  }
+}
@@ -0,0 +1,85 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<hip_runtime.h>
+#include<hip_runtime_api.h>
+#include<iostream>
+#include<fstream>
+#include<vector>
+
+#define LEN 64
+#define SIZE LEN<<2
+
+#define fileName "vcpy_isa.co"
+#define kernel_name "hello_world"
+
+__global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){
+  int tx = hipThreadIdx_x;
+  Bd[tx] = Ad[tx];
+}
+
+int main(){
+  float *A, *B, *Ad, *Bd;
+  A = new float[LEN];
+  B = new float[LEN];
+
+  for(uint32_t i=0;i<LEN;i++){
+    A[i] = i*1.0f;
+    B[i] = 0.0f;
+    std::cout<<A[i] << " "<<B[i]<<std::endl;
+  }
+
+  hipMalloc((void**)&Ad, SIZE);
+  hipMalloc((void**)&Bd, SIZE);
+
+  hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
+  hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
+  hipModule_t Module;
+  hipFunction_t Function;
+  hipModuleLoad(&Module, fileName);
+  hipModuleGetFunction(&Function, Module, kernel_name);
+  hipStream_t stream;
+  hipStreamCreate(&stream);
+  void *args[2] = {&Ad, &Bd};
+
+
+    std::vector<void*>argBuffer(2);
+    memcpy(&argBuffer[0], &Ad, sizeof(void*));
+    memcpy(&argBuffer[1], &Bd, sizeof(void*));
+
+    size_t size = argBuffer.size()*sizeof(void*);
+
+    void *config[] = {
+      HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
+      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+      HIP_LAUNCH_PARAM_END
+    };
+
+    hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, stream, NULL, (void**)&config); 
+
+    hipStreamDestroy(stream);
+
+ hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
+
+  for(uint32_t i=0;i<LEN;i++){
+  std::cout<<A[i]<<" - "<<B[i]<<std::endl;
+  }
+
+  return 0;
+}
@@ -0,0 +1,31 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<hip/hip_runtime.h>
+#include<hip/hip_runtime_api.h>
+#include<iostream>
+
+#define fileName "vcpy_isa.co"
+
+int main(){
+    hipModule_t module;
+    hipModuleLoad(&module, fileName);
+    hipModuleUnload(module);
+}
+
@@ -0,0 +1,94 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<hip_runtime.h>
+#include<hip_runtime_api.h>
+#include<iostream>
+#include<fstream>
+#include<vector>
+
+#define LEN 64
+#define SIZE LEN<<2
+
+#ifdef __HIP_PLATFORM_HCC__ 
+#define fileName "vcpy_isa.co"
+#endif
+
+#ifdef __HIP_PLATFORM_NVCC__
+#define fileName "vcpy_isa.ptx"
+#endif
+
+#define kernel_name "hello_world"
+
+int main(){
+    float *A, *B;
+    hipDeviceptr_t Ad, Bd;
+    A = new float[LEN];
+    B = new float[LEN];
+
+    for(uint32_t i=0;i<LEN;i++){
+        A[i] = i*1.0f;
+        B[i] = 0.0f;
+        std::cout<<A[i] << " "<<B[i]<<std::endl;
+    }
+
+
+#ifdef __HIP_PLATFORM_NVCC__
+  	hipInit(0);
+	  hipDevice_t device;
+	  hipCtx_t context;
+	  hipDeviceGet(&device, 0);
+	  hipCtxCreate(&context, 0, device);
+#endif
+
+    hipMalloc((void**)&Ad, SIZE);
+    hipMalloc((void**)&Bd, SIZE);
+
+    hipMemcpyHtoD(Ad, A, SIZE);
+    hipMemcpyHtoD(Bd, B, SIZE);
+    hipModule_t Module;
+    hipFunction_t Function;
+    hipModuleLoad(&Module, fileName);
+    hipModuleGetFunction(&Function, Module, kernel_name);
+
+    std::vector<void*>argBuffer(2);
+    memcpy(&argBuffer[0], &Ad, sizeof(void*));
+    memcpy(&argBuffer[1], &Bd, sizeof(void*));
+
+    size_t size = argBuffer.size()*sizeof(void*);
+
+    void *config[] = {
+      HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
+      HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
+      HIP_LAUNCH_PARAM_END
+    };
+
+    hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config); 
+
+    hipMemcpyDtoH(B, Bd, SIZE);
+    for(uint32_t i=0;i<LEN;i++){
+        std::cout<<A[i]<<" - "<<B[i]<<std::endl;
+    }
+
+#ifdef __HIP_PLATFORM_NVCC__
+	  hipCtxDetach(context);
+#endif 
+
+    return 0;
+}
@@ -0,0 +1,6 @@
+
+extern "C" __global__ void hello_world(float *a, float *b)
+{
+	int tx = threadIdx.x;
+	b[tx] = a[tx];
+}
@@ -0,0 +1,38 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-19856038
+// Cuda compilation tools, release 7.5, V7.5.17
+// Based on LLVM 3.4svn
+//
+
+.version 4.3
+.target sm_20
+.address_size 64
+
+	// .globl	hello_world
+
+.visible .entry hello_world(
+	.param .u64 hello_world_param_0,
+	.param .u64 hello_world_param_1
+)
+{
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<2>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [hello_world_param_0];
+	ld.param.u64 	%rd2, [hello_world_param_1];
+	cvta.to.global.u64 	%rd3, %rd2;
+	cvta.to.global.u64 	%rd4, %rd1;
+	mov.u32 	%r1, %tid.x;
+	mul.wide.s32 	%rd5, %r1, 4;
+	add.s64 	%rd6, %rd4, %rd5;
+	ld.global.f32 	%f1, [%rd6];
+	add.s64 	%rd7, %rd3, %rd5;
+	st.global.f32 	[%rd7], %f1;
+	ret;
+}
+
+