From ba51d7f676ab830253a6ea1affe03c214c1ccc72 Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Wed, 14 Jun 2017 15:18:57 +0530
Subject: [PATCH 01/19] Validity check of input arguments in Ipc Mem APIs

Change-Id: Ia48e949d19f354f10c7e44cc2457fd4154bf6d76
---
 src/hip_memory.cpp | 88 +++++++++++++++++++++++++---------------------
 1 file changed, 48 insertions(+), 40 deletions(-)

diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp
index c04c2611c3..ce65579e34 100644
--- a/src/hip_memory.cpp
+++ b/src/hip_memory.cpp
@@ -1275,70 +1275,78 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){
     // Get the size of allocated pointer
     size_t psize;
     hc::accelerator acc;
-    hc::AmPointerInfo amPointerInfo( NULL , NULL , 0 , acc , 0 , 0 );
-    am_status_t status = hc::am_memtracker_getinfo( &amPointerInfo , devPtr );
-    if (status == AM_SUCCESS) {
-        psize = (size_t)amPointerInfo._sizeBytes;
-    }
-    else
+    if((handle == NULL) || (devPtr == NULL)) {
         hipStatus = hipErrorInvalidResourceHandle;
-    ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) handle;
-    // Save the size of the pointer to hipIpcMemHandle
-    iHandle->psize = psize;
+    } else {
+        hc::AmPointerInfo amPointerInfo( NULL , NULL , 0 , acc , 0 , 0 );
+        am_status_t status = hc::am_memtracker_getinfo( &amPointerInfo , devPtr );
+        if (status == AM_SUCCESS) {
+            psize = (size_t)amPointerInfo._sizeBytes;
+        } else
+            hipStatus = hipErrorInvalidResourceHandle;
+        ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) handle;
+        // Save the size of the pointer to hipIpcMemHandle
+        iHandle->psize = psize;
 
 #if USE_IPC
-    // Create HSA ipc memory
-    hsa_status_t hsa_status =
-        hsa_amd_ipc_memory_create(devPtr, psize, (hsa_amd_ipc_memory_t*) &(iHandle->ipc_handle));
-    if(hsa_status!= HSA_STATUS_SUCCESS)
-        hipStatus = hipErrorMemoryAllocation;
+        // Create HSA ipc memory
+        hsa_status_t hsa_status =
+            hsa_amd_ipc_memory_create(devPtr, psize, (hsa_amd_ipc_memory_t*) &(iHandle->ipc_handle));
+        if(hsa_status!= HSA_STATUS_SUCCESS)
+            hipStatus = hipErrorMemoryAllocation;
 #else
-    hipStatus = hipErrorRuntimeOther;
+        hipStatus = hipErrorRuntimeOther;
 #endif
-
+    }
     return ihipLogStatus(hipStatus);
 }
 
 hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned int flags){
     HIP_INIT_API ( devPtr, &handle , flags);
     hipError_t hipStatus = hipSuccess;
-
+    if(devPtr == NULL) {
+        hipStatus = hipErrorInvalidValue;
+    } else {
 #if USE_IPC
-    // Get the current device agent.
-    hc::accelerator acc;
-    hsa_agent_t *agent = static_cast<hsa_agent_t*>(acc.get_hsa_agent());
-    if(!agent)
-        return hipErrorInvalidResourceHandle;
+        // Get the current device agent.
+        hc::accelerator acc;
+        hsa_agent_t *agent = static_cast<hsa_agent_t*>(acc.get_hsa_agent());
+        if(!agent)
+            return hipErrorInvalidResourceHandle;
 
-    ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) &handle;
-    //Attach ipc memory
-    auto ctx= ihipGetTlsDefaultCtx();
-    {
-        LockedAccessor_CtxCrit_t crit(ctx->criticalData());
-        // the peerCnt always stores self so make sure the trace actually
-        hsa_status_t hsa_status =
-         hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, crit->peerCnt(), crit->peerAgents(), devPtr);
-        if(hsa_status != HSA_STATUS_SUCCESS)
-            hipStatus = hipErrorMapBufferObjectFailed;
-    }
+        ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) &handle;
+        //Attach ipc memory
+        auto ctx= ihipGetTlsDefaultCtx();
+        {
+            LockedAccessor_CtxCrit_t crit(ctx->criticalData());
+            // the peerCnt always stores self so make sure the trace actually
+            hsa_status_t hsa_status =
+            hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, crit->peerCnt(), crit->peerAgents(), devPtr);
+            if(hsa_status != HSA_STATUS_SUCCESS)
+                hipStatus = hipErrorMapBufferObjectFailed;
+        }
 #else
-    hipStatus = hipErrorRuntimeOther;
+        hipStatus = hipErrorRuntimeOther;
 #endif
+    }
     return ihipLogStatus(hipStatus);
 }
 
 hipError_t hipIpcCloseMemHandle(void *devPtr){
     HIP_INIT_API ( devPtr );
     hipError_t hipStatus = hipSuccess;
-
+    if(devPtr == NULL) {
+        hipStatus = hipErrorInvalidValue;
+    } else {
 #if USE_IPC
-    hsa_status_t hsa_status =
-        hsa_amd_ipc_memory_detach(devPtr);
-    if(hsa_status != HSA_STATUS_SUCCESS)
-        return hipErrorInvalidResourceHandle;
+        hsa_status_t hsa_status =
+            hsa_amd_ipc_memory_detach(devPtr);
+        if(hsa_status != HSA_STATUS_SUCCESS)
+            return hipErrorInvalidResourceHandle;
 #else
-    hipStatus = hipErrorRuntimeOther;
+        hipStatus = hipErrorRuntimeOther;
 #endif
+    }
     return ihipLogStatus(hipStatus);
 }
 

From fd36303c24fcbfd21ac9bfba56162a51317f783e Mon Sep 17 00:00:00 2001
From: "Sun, Peng" <peng.sun@amd.com>
Date: Wed, 14 Jun 2017 11:10:52 -0500
Subject: [PATCH 02/19] Additional GGL make_kernel_functor_* macros,
 contributed by Alex

Change-Id: I01aabb7d2b5418fcefb1bbf78eb5d1888dbc5c96
---
 include/hip/hcc_detail/grid_launch_GGL.hpp | 122 +++++++++++++++++++++
 1 file changed, 122 insertions(+)

diff --git a/include/hip/hcc_detail/grid_launch_GGL.hpp b/include/hip/hcc_detail/grid_launch_GGL.hpp
index 8e3dab8482..eac48b595e 100644
--- a/include/hip/hcc_detail/grid_launch_GGL.hpp
+++ b/include/hip/hcc_detail/grid_launch_GGL.hpp
@@ -245,6 +245,128 @@ namespace hip_impl
         HIP_kernel_functor_name_begin ## _ ## k ## _ ## \
         HIP_kernel_functor_name_end ## _ ## n
 
+    #define make_kernel_functor_hip_30(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
+        p24, p25, p26, p27)\
+        struct make_kernel_name_hip(function_name, 28) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            std::decay_t<decltype(p13)> _p13_;\
+            std::decay_t<decltype(p14)> _p14_;\
+            std::decay_t<decltype(p15)> _p15_;\
+            std::decay_t<decltype(p16)> _p16_;\
+            std::decay_t<decltype(p17)> _p17_;\
+            std::decay_t<decltype(p18)> _p18_;\
+            std::decay_t<decltype(p19)> _p19_;\
+            std::decay_t<decltype(p20)> _p20_;\
+            std::decay_t<decltype(p21)> _p21_;\
+            std::decay_t<decltype(p22)> _p22_;\
+            std::decay_t<decltype(p23)> _p23_;\
+            std::decay_t<decltype(p24)> _p24_;\
+            std::decay_t<decltype(p25)> _p25_;\
+            std::decay_t<decltype(p26)> _p26_;\
+            std::decay_t<decltype(p27)> _p27_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
+                    _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\
+                    _p26_, _p27_);\
+            }\
+        }
+    #define make_kernel_functor_hip_29(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
+        p24, p25, p26)\
+        struct make_kernel_name_hip(function_name, 27) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            std::decay_t<decltype(p13)> _p13_;\
+            std::decay_t<decltype(p14)> _p14_;\
+            std::decay_t<decltype(p15)> _p15_;\
+            std::decay_t<decltype(p16)> _p16_;\
+            std::decay_t<decltype(p17)> _p17_;\
+            std::decay_t<decltype(p18)> _p18_;\
+            std::decay_t<decltype(p19)> _p19_;\
+            std::decay_t<decltype(p20)> _p20_;\
+            std::decay_t<decltype(p21)> _p21_;\
+            std::decay_t<decltype(p22)> _p22_;\
+            std::decay_t<decltype(p23)> _p23_;\
+            std::decay_t<decltype(p24)> _p24_;\
+            std::decay_t<decltype(p25)> _p25_;\
+            std::decay_t<decltype(p26)> _p26_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
+                    _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\
+                    _p26_);\
+            }\
+        }
+    #define make_kernel_functor_hip_28(\
+        function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
+        p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\
+        p24, p25)\
+        struct make_kernel_name_hip(function_name, 26) {\
+            std::decay_t<decltype(p0)> _p0_;\
+            std::decay_t<decltype(p1)> _p1_;\
+            std::decay_t<decltype(p2)> _p2_;\
+            std::decay_t<decltype(p3)> _p3_;\
+            std::decay_t<decltype(p4)> _p4_;\
+            std::decay_t<decltype(p5)> _p5_;\
+            std::decay_t<decltype(p6)> _p6_;\
+            std::decay_t<decltype(p7)> _p7_;\
+            std::decay_t<decltype(p8)> _p8_;\
+            std::decay_t<decltype(p9)> _p9_;\
+            std::decay_t<decltype(p10)> _p10_;\
+            std::decay_t<decltype(p11)> _p11_;\
+            std::decay_t<decltype(p12)> _p12_;\
+            std::decay_t<decltype(p13)> _p13_;\
+            std::decay_t<decltype(p14)> _p14_;\
+            std::decay_t<decltype(p15)> _p15_;\
+            std::decay_t<decltype(p16)> _p16_;\
+            std::decay_t<decltype(p17)> _p17_;\
+            std::decay_t<decltype(p18)> _p18_;\
+            std::decay_t<decltype(p19)> _p19_;\
+            std::decay_t<decltype(p20)> _p20_;\
+            std::decay_t<decltype(p21)> _p21_;\
+            std::decay_t<decltype(p22)> _p22_;\
+            std::decay_t<decltype(p23)> _p23_;\
+            std::decay_t<decltype(p24)> _p24_;\
+            std::decay_t<decltype(p25)> _p25_;\
+            void operator()(const hc::tiled_index<3>&) const [[hc]]\
+            {\
+                kernel_name(\
+                    _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\
+                    _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\
+                    _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_);\
+            }\
+        }
     #define make_kernel_functor_hip_27(\
         function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\
         p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\

From 0208fa4e70899cbd2af550e65425ccbb9c7c8414 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Wed, 14 Jun 2017 19:55:55 +0300
Subject: [PATCH 03/19] [HIPIFY] Sync HIPIFY with HIP by CUDA Driver API
 functions.

+ 4.12. Unified Addressing
+ 4.13. Stream Management

ToDo: 4.14 - 4.31 modules of CUDA Driver API.
---
 ...A_Driver_API_functions_supported_by_HIP.md | 22 +++++++-
 hipify-clang/src/Cuda2Hip.cpp                 | 54 ++++++++++---------
 2 files changed, 50 insertions(+), 26 deletions(-)

diff --git a/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md
index d4b54438bb..0b3bb540bf 100644
--- a/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md
+++ b/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md
@@ -553,13 +553,31 @@
 
 |   **CUDA**                                                |   **HIP**                     | **CUDA description**                                                                                                           |
 |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
-
+| `cuMemAdvise`                                             |                               | Advise about the usage of a given memory range.                                                                                |
+| `cuMemPrefetchAsync`                                      |                               | Prefetches memory to the specified destination device.                                                                         |
+| `cuMemRangeGetAttribute`                                  |                               | Query an attribute of a given memory range.                                                                                    |
+| `cuMemRangeGetAttributes`                                 |                               | Query attributes of a given memory range.                                                                                      |
+| `cuPointerGetAttribute`                                   |                               | Returns information about a pointer.                                                                                           |
+| `cuPointerGetAttributes`                                  |                               | Returns information about a pointer.                                                                                           |
+| `cuPointerSetAttribute`                                   |                               | Set attributes on a previously allocated memory region.                                                                        |
 
 ## **13. Stream Management**
 
 |   **CUDA**                                                |   **HIP**                     | **CUDA description**                                                                                                           |
 |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
-
+| `cuStreamAddCallback`                                     |                               | Add a callback to a compute stream.                                                                                            |
+| `cuStreamAttachMemAsync`                                  |                               | Attach memory to a stream asynchronously.                                                                                      |
+| `cuStreamCreate`                                          |                               | Create a stream.                                                                                                               |
+| `cuStreamCreateWithPriority`                              |                               | Create a stream with the given priority.                                                                                       |
+| `cuStreamDestroy`                                         | `hipStreamDestroy`            | Destroys a stream.                                                                                                             |
+| `cuStreamGetFlags`                                        | `hipStreamGetFlags`           | Query the flags of a given stream.                                                                                             |
+| `cuStreamGetPriority`                                     | `hipStreamGetPriority`        | Query the priority of a given stream.                                                                                          |
+| `cuStreamQuery`                                           | `hipStreamQuery`              | Determine status of a compute stream.                                                                                          |
+| `cuStreamSynchronize`                                     | `hipStreamSynchronize`        | Wait until a stream's tasks are completed.                                                                                     |
+| `cuStreamWaitEvent`                                       | `hipStreamWaitEvent`          | Make a compute stream wait on an event.                                                                                        |
+| `cuStreamBatchMemOp`                                      |                               | Batch operations to synchronize the stream via memory operations.                                                              |
+| `cuStreamWaitValue32`                                     |                               | Wait on a memory location.                                                                                                     |
+| `cuStreamWriteValue32`                                    |                               | Write a value to memory.                                                                                                       |
 
 ## **14. Event Management**
 
diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp
index de4da78451..7f9fefa7f9 100644
--- a/hipify-clang/src/Cuda2Hip.cpp
+++ b/hipify-clang/src/Cuda2Hip.cpp
@@ -990,17 +990,19 @@ struct cuda2hipMap {
     cuda2hipRename["cuLaunchKernel"]                            = {"hipModuleLaunchKernel", CONV_MODULE, API_DRIVER};
 
     // Streams
-    // unsupported yet by HIP
     cuda2hipRename["cuStreamAddCallback"]                       = {"hipStreamAddCallback", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["cuStreamWaitValue32"]                       = {"hipStreamWaitValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE
-    cuda2hipRename["cuStreamWriteValue32"]                      = {"hipStreamWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE
-    cuda2hipRename["cuStreamBatchMemOp"]                        = {"hipStreamBatchMemOp", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE
-
-    cuda2hipRename["cuStreamCreate"]                            = {"hipStreamCreate", CONV_STREAM, API_DRIVER};
+    cuda2hipRename["cuStreamAttachMemAsync"]                    = {"hipStreamAttachMemAsync", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["cuStreamCreate"]                            = {"hipStreamCreate__", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};      // Not equal to cudaStreamCreate due to different signatures
+    cuda2hipRename["cuStreamCreateWithPriority"]                = {"hipStreamCreateWithPriority", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["cuStreamDestroy_v2"]                        = {"hipStreamDestroy", CONV_STREAM, API_DRIVER};
+    cuda2hipRename["cuStreamGetFlags"]                          = {"hipStreamGetFlags", CONV_STREAM, API_DRIVER};
+    cuda2hipRename["cuStreamGetPriority"]                       = {"hipStreamGetPriority", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["cuStreamQuery"]                             = {"hipStreamQuery", CONV_STREAM, API_DRIVER};
     cuda2hipRename["cuStreamSynchronize"]                       = {"hipStreamSynchronize", CONV_STREAM, API_DRIVER};
     cuda2hipRename["cuStreamWaitEvent"]                         = {"hipStreamWaitEvent", CONV_STREAM, API_DRIVER};
+    cuda2hipRename["cuStreamWaitValue32"]                       = {"hipStreamWaitValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};   // [CUDA 8.0.44] // no API_Runtime ANALOGUE
+    cuda2hipRename["cuStreamWriteValue32"]                      = {"hipStreamWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};  // [CUDA 8.0.44] // no API_Runtime ANALOGUE
+    cuda2hipRename["cuStreamBatchMemOp"]                        = {"hipStreamBatchMemOp", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};    // [CUDA 8.0.44] // no API_Runtime ANALOGUE
 
     // Memory management
     cuda2hipRename["cuArray3DCreate"]                           = {"hipArray3DCreate", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
@@ -1016,16 +1018,16 @@ struct cuda2hipMap {
     cuda2hipRename["cuMemAlloc_v2"]                             = {"hipMalloc", CONV_MEM, API_DRIVER};
     cuda2hipRename["cuMemAllocHost"]                            = {"hipMemAllocHost", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["cuMemAllocManaged"]                         = {"hipMemAllocManaged", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["cuMemAllocPitch"]                           = {"hipMemAllocPitch__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};      // Not equal to cudaMemAllocPitch due to different signatures
-    cuda2hipRename["cuMemcpy"]                                  = {"hipMemcpy__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};             // Not equal to cudaMemcpy due to different signatures
-    cuda2hipRename["cuMemcpy2D"]                                = {"hipMemcpy2D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};           // Not equal to cudaMemcpy2D due to different signatures
-    cuda2hipRename["cuMemcpy2DAsync"]                           = {"hipMemcpy2DAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};      // Not equal to cudaMemcpy2DAsync due to different signatures
+    cuda2hipRename["cuMemAllocPitch"]                           = {"hipMemAllocPitch__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};        // Not equal to cudaMemAllocPitch due to different signatures
+    cuda2hipRename["cuMemcpy"]                                  = {"hipMemcpy__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};               // Not equal to cudaMemcpy due to different signatures
+    cuda2hipRename["cuMemcpy2D"]                                = {"hipMemcpy2D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};             // Not equal to cudaMemcpy2D due to different signatures
+    cuda2hipRename["cuMemcpy2DAsync"]                           = {"hipMemcpy2DAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};        // Not equal to cudaMemcpy2DAsync due to different signatures
     cuda2hipRename["cuMemcpy2DUnaligned"]                       = {"hipMemcpy2DUnaligned", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["cuMemcpy3D"]                                = {"hipMemcpy3D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};           // Not equal to cudaMemcpy3D due to different signatures
-    cuda2hipRename["cuMemcpy3DAsync"]                           = {"hipMemcpy3DAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};     // Not equal to cudaMemcpy3DAsync due to different signatures
-    cuda2hipRename["cuMemcpy3DPeer"]                            = {"hipMemcpy3DPeer__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};      // Not equal to cudaMemcpy3DPeer due to different signatures
-    cuda2hipRename["cuMemcpy3DPeerAsync"]                       = {"hipMemcpy3DPeerAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DPeerAsync due to different signatures
-    cuda2hipRename["cuMemcpyAsync"]                             = {"hipMemcpyAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};       // Not equal to cudaMemcpyAsync due to different signatures
+    cuda2hipRename["cuMemcpy3D"]                                = {"hipMemcpy3D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};             // Not equal to cudaMemcpy3D due to different signatures
+    cuda2hipRename["cuMemcpy3DAsync"]                           = {"hipMemcpy3DAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};       // Not equal to cudaMemcpy3DAsync due to different signatures
+    cuda2hipRename["cuMemcpy3DPeer"]                            = {"hipMemcpy3DPeer__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};        // Not equal to cudaMemcpy3DPeer due to different signatures
+    cuda2hipRename["cuMemcpy3DPeerAsync"]                       = {"hipMemcpy3DPeerAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};   // Not equal to cudaMemcpy3DPeerAsync due to different signatures
+    cuda2hipRename["cuMemcpyAsync"]                             = {"hipMemcpyAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};         // Not equal to cudaMemcpyAsync due to different signatures
     cuda2hipRename["cuMemcpyAtoA"]                              = {"hipMemcpyAtoA", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["cuMemcpyAtoD"]                              = {"hipMemcpyAtoD", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["cuMemcpyAtoH"]                              = {"hipMemcpyAtoH", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
@@ -1039,17 +1041,17 @@ struct cuda2hipMap {
     cuda2hipRename["cuMemcpyHtoAAsync"]                         = {"hipMemcpyHtoAAsync", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["cuMemcpyHtoD_v2"]                           = {"hipMemcpyHtoD", CONV_MEM, API_DRIVER};
     cuda2hipRename["cuMemcpyHtoDAsync_v2"]                      = {"hipMemcpyHtoDAsync", CONV_MEM, API_DRIVER};
-    cuda2hipRename["cuMemcpyPeerAsync"]                         = {"hipMemcpyPeerAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};    // Not equal to cudaMemcpyPeerAsync due to different signatures
-    cuda2hipRename["cuMemcpyPeer"]                              = {"hipMemcpyPeer__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};         // Not equal to cudaMemcpyPeer due to different signatures
+    cuda2hipRename["cuMemcpyPeerAsync"]                         = {"hipMemcpyPeerAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};      // Not equal to cudaMemcpyPeerAsync due to different signatures
+    cuda2hipRename["cuMemcpyPeer"]                              = {"hipMemcpyPeer__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};           // Not equal to cudaMemcpyPeer due to different signatures
     cuda2hipRename["cuMemFree_v2"]                              = {"hipFree", CONV_MEM, API_DRIVER};
     cuda2hipRename["cuMemFreeHost"]                             = {"hipHostFree", CONV_MEM, API_DRIVER};
     cuda2hipRename["cuMemGetAddressRange"]                      = {"hipMemGetAddressRange", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["cuMemGetInfo_v2"]                           = {"hipMemGetInfo", CONV_MEM, API_DRIVER};
-    cuda2hipRename["cuMemHostAlloc"]                            = {"hipHostMalloc", CONV_MEM, API_DRIVER};                            // API_Runtime ANALOGUE (cudaHostAlloc)
+    cuda2hipRename["cuMemHostAlloc"]                            = {"hipHostMalloc", CONV_MEM, API_DRIVER};                              // API_Runtime ANALOGUE (cudaHostAlloc)
     cuda2hipRename["cuMemHostGetDevicePointer"]                 = {"hipMemHostGetDevicePointer", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["cuMemHostGetFlags"]                         = {"hipMemHostGetFlags", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["cuMemHostRegister_v2"]                      = {"hipHostRegister", CONV_MEM, API_DRIVER};                          // API_Runtime ANALOGUE (cudaHostAlloc)
-    cuda2hipRename["cuMemHostUnregister"]                       = {"hipHostUnregister", CONV_MEM, API_DRIVER};                        // API_Runtime ANALOGUE (cudaHostUnregister)
+    cuda2hipRename["cuMemHostRegister_v2"]                      = {"hipHostRegister", CONV_MEM, API_DRIVER};                            // API_Runtime ANALOGUE (cudaHostAlloc)
+    cuda2hipRename["cuMemHostUnregister"]                       = {"hipHostUnregister", CONV_MEM, API_DRIVER};                          // API_Runtime ANALOGUE (cudaHostUnregister)
     cuda2hipRename["cuMemsetD16_v2"]                            = {"hipMemsetD16", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["cuMemsetD16Async"]                          = {"hipMemsetD16Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["cuMemsetD2D16_v2"]                          = {"hipMemsetD2D16", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
@@ -1058,18 +1060,22 @@ struct cuda2hipMap {
     cuda2hipRename["cuMemsetD2D32Async"]                        = {"hipMemsetD2D32Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["cuMemsetD2D8_v2"]                           = {"hipMemsetD2D8", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["cuMemsetD2D8Async"]                         = {"hipMemsetD2D8Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["cuMemsetD32_v2"]                            = {"hipMemset", CONV_MEM, API_DRIVER};                                // API_Runtime ANALOGUE (cudaMemset)
-    cuda2hipRename["cuMemsetD32Async"]                          = {"hipMemsetAsync", CONV_MEM, API_DRIVER};                           // API_Runtime ANALOGUE (cudaMemsetAsync)
+    cuda2hipRename["cuMemsetD32_v2"]                            = {"hipMemset", CONV_MEM, API_DRIVER};                                  // API_Runtime ANALOGUE (cudaMemset)
+    cuda2hipRename["cuMemsetD32Async"]                          = {"hipMemsetAsync", CONV_MEM, API_DRIVER};                             // API_Runtime ANALOGUE (cudaMemsetAsync)
     cuda2hipRename["cuMemsetD8_v2"]                             = {"hipMemsetD8", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["cuMemsetD8Async"]                           = {"hipMemsetD8Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["cuMipmappedArrayCreate"]                    = {"hipMipmappedArrayCreate", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["cuMipmappedArrayDestroy"]                   = {"hipMipmappedArrayDestroy", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["cuMipmappedArrayGetLevel"]                  = {"hipMipmappedArrayGetLevel", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED};
-    // unsupported yet by HIP [CUDA 8.0.44]
-    cuda2hipRename["cuMemPrefetchAsync"]                        = {"hipMemPrefetchAsync___", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};   // [CUDA 8.0.44] // no API_Runtime ANALOGUE (cudaMemPrefetchAsync has different signature)
+
+    // Unified Addressing
+    cuda2hipRename["cuMemPrefetchAsync"]                        = {"hipMemPrefetchAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};    // [CUDA 8.0.44] // no API_Runtime ANALOGUE (cudaMemPrefetchAsync has different signature)
     cuda2hipRename["cuMemAdvise"]                               = {"hipMemAdvise", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};             // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemAdvise)
     cuda2hipRename["cuMemRangeGetAttribute"]                    = {"hipMemRangeGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};  // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemRangeGetAttribute)
     cuda2hipRename["cuMemRangeGetAttributes"]                   = {"hipMemRangeGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemRangeGetAttributes)
+    cuda2hipRename["cuPointerGetAttribute"]                     = {"hipPointerGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cuPointerGetAttributes"]                    = {"hipPointerGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cuPointerSetAttribute"]                     = {"hipPointerSetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED};
 
     // Texture Reference Mngmnt
     // Texture reference filtering modes

From d24818bff6e2f202b031e92d0b0f2c27a31757e5 Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Thu, 15 Jun 2017 00:21:47 +0530
Subject: [PATCH 04/19] Arguments validation in hipDeviceGetPCIBusId

Change-Id: I89770517c3ac94e4bf476344d27c18f03cfcde08
---
 src/hip_device.cpp | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/hip_device.cpp b/src/hip_device.cpp
index 05db4c2b30..2bb9970d35 100644
--- a/src/hip_device.cpp
+++ b/src/hip_device.cpp
@@ -369,12 +369,24 @@ hipError_t hipDeviceGetName(char *name,int len,hipDevice_t device)
 hipError_t hipDeviceGetPCIBusId (char *pciBusId,int len, int device)
 {
     HIP_INIT_API(pciBusId, len, device);
-    hipError_t e = hipSuccess;
-    int tempPciBusId = 0;
-    e = ihipDeviceGetAttribute( &tempPciBusId, hipDeviceAttributePciBusId, device);
-    if( e == hipSuccess) {
-        std::string tempPciStr = std::to_string(tempPciBusId);
-        memcpy( pciBusId , tempPciStr.c_str() , tempPciStr.length() );
+    hipError_t e = hipErrorInvalidValue;
+    int deviceCount = 0;
+    ihipGetDeviceCount( &deviceCount );
+    if((device > deviceCount) || (device < 0)) {
+        e = hipErrorInvalidDevice;
+    } else {
+        if((pciBusId != nullptr) && (len > 0)) {
+            int tempPciBusId = 0;
+            e = ihipDeviceGetAttribute( &tempPciBusId, hipDeviceAttributePciBusId, device);
+            if( e == hipSuccess) {
+                std::string tempPciStr = std::to_string(tempPciBusId);
+                if( len < tempPciStr.length()){
+                    e = hipErrorInvalidValue;
+                } else { 
+                    memcpy( pciBusId , tempPciStr.c_str() , tempPciStr.length() );
+                }
+            }
+        }
     }
     return ihipLogStatus(e);
 }

From 64bb8d154a9ca093a93b8c945a469d1ac1686c66 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Fri, 16 Jun 2017 09:02:26 -0500
Subject: [PATCH 05/19] removed bad copy constructor

Change-Id: I661991d9d43941a61848b0b8e9879c0bfa811b40
---
 include/hip/hcc_detail/hip_vector_types.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/include/hip/hcc_detail/hip_vector_types.h b/include/hip/hcc_detail/hip_vector_types.h
index 9da34d9f32..93c82cc0cb 100644
--- a/include/hip/hcc_detail/hip_vector_types.h
+++ b/include/hip/hcc_detail/hip_vector_types.h
@@ -36,25 +36,21 @@ THE SOFTWARE.
 
 #define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \
 __device__ __host__ type() {} \
-__device__ __host__ type(type& val) : x(val.x) { } \
 __device__ __host__ type(const type& val) : x(val.x) { } \
 __device__ __host__ ~type() {} 
 
 #define MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(type) \
 __device__ __host__ type() {} \
-__device__ __host__ type(type& val) : x(val.x), y(val.y) { } \
 __device__ __host__ type(const type& val) : x(val.x), y(val.y) { } \
 __device__ __host__ ~type() {}
 
 #define MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(type) \
 __device__ __host__ type() {} \
-__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z) { } \
 __device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { } \
 __device__ __host__ ~type() {} 
 
 #define MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(type) \
 __device__ __host__ type() {} \
-__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \
 __device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \
 __device__ __host__ ~type() {}
 

From d1e28df22601a0db2e345e11c68cdc92a38da063 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Fri, 16 Jun 2017 09:07:06 -0500
Subject: [PATCH 06/19] fixed float2int functions

Change-Id: I67be79149f06daacf0f0d131bdedabf294126248
---
 src/device_functions.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/device_functions.cpp b/src/device_functions.cpp
index 10d8d3ab89..615ae4d0b7 100644
--- a/src/device_functions.cpp
+++ b/src/device_functions.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
@@ -149,19 +149,19 @@ __device__ long long int __double_as_longlong(double x)
   return hold64.sli;
 }
 
-__device__ int float2int_rd(float x)
+__device__ int __float2int_rd(float x)
 {
   return (int)x;
 }
-__device__ int float2int_rn(float x)
+__device__ int __float2int_rn(float x)
 {
   return (int)x;
 }
-__device__ int float2int_ru(float x)
+__device__ int __float2int_ru(float x)
 {
   return (int)x;
 }
-__device__ int float2int_rz(float x)
+__device__ int __float2int_rz(float x)
 {
   return (int)x;
 }

From 3c73229916787bfb7e0ab9549d2fb877b20729bd Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Sun, 18 Jun 2017 12:31:31 +0530
Subject: [PATCH 07/19] Abort device function in HIP/HCC, need new HCC

Change-Id: I4195ab75e9b7b48c8b8128d6925ddc0fa5e9e009
---
 include/hip/hcc_detail/hip_runtime.h | 3 +++
 src/device_util.cpp                  | 5 +++++
 2 files changed, 8 insertions(+)

diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h
index 95826f9b60..da3b7ba50e 100644
--- a/include/hip/hcc_detail/hip_runtime.h
+++ b/include/hip/hcc_detail/hip_runtime.h
@@ -174,6 +174,9 @@ static constexpr int warpSize = 64;
 __device__ long long int clock64();
 __device__ clock_t clock();
 
+//abort
+__device__ void abort();
+
 //atomicAdd()
 __device__ int atomicAdd(int* address, int val);
 __device__ unsigned int atomicAdd(unsigned int* address,
diff --git a/src/device_util.cpp b/src/device_util.cpp
index 062372f0f4..1efda02933 100644
--- a/src/device_util.cpp
+++ b/src/device_util.cpp
@@ -839,6 +839,11 @@ __device__ float __hip_ynf(int n, float x)
 __device__ long long int clock64() { return (long long int)hc::__cycle_u64(); };
 __device__ clock_t clock() { return (clock_t)hc::__cycle_u64(); };
 
+//abort
+__device__ void abort()
+{
+    return hc::abort();
+}
 
 //atomicAdd()
 __device__  int atomicAdd(int* address, int val)

From e6e4fe613c89fc0b3ae0d4fd0b23f59a4e69a042 Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Tue, 20 Jun 2017 09:38:56 +0530
Subject: [PATCH 08/19] Added device side abort function in HIP/NVCC

Change-Id: I6ae35a72a8b9c34852619f02da1a046c8d3b2ed3
---
 include/hip/nvcc_detail/hip_runtime.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/hip/nvcc_detail/hip_runtime.h b/include/hip/nvcc_detail/hip_runtime.h
index 80da388007..8c08f3d151 100644
--- a/include/hip/nvcc_detail/hip_runtime.h
+++ b/include/hip/nvcc_detail/hip_runtime.h
@@ -109,6 +109,10 @@ kernelName<<<numblocks,numthreads,memperblock,streamId>>>(__VA_ARGS__);\
 
 #define HIP_DYNAMIC_SHARED_ATTRIBUTE
 
+#ifdef __HIP_DEVICE_COMPILE__
+#define abort() {asm("trap;");}
+#endif
+
 #endif
 
 #endif

From 871c2fc8d6b8854394e2486515b91a1ba5bc8232 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Tue, 20 Jun 2017 11:35:52 -0500
Subject: [PATCH 09/19] removed rm for /opt/rocm/hip/src in inline asm sample

Change-Id: I0c02bccd4cd35e01a8e889ea1e586ea8baf0ab90
---
 samples/2_Cookbook/10_inline_asm/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/2_Cookbook/10_inline_asm/Makefile b/samples/2_Cookbook/10_inline_asm/Makefile
index 77a7699635..6ad3c201bd 100644
--- a/samples/2_Cookbook/10_inline_asm/Makefile
+++ b/samples/2_Cookbook/10_inline_asm/Makefile
@@ -32,4 +32,4 @@ test: $(EXECUTABLE)
 clean:
 	rm -f $(EXECUTABLE)
 	rm -f $(OBJECTS)
-	rm -f $(HIP_PATH)/src/*.o
+

From b4a39664f0359c7769dca51664fb0b173a1a10a4 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Thu, 22 Jun 2017 21:53:32 +0300
Subject: [PATCH 10/19] [HIPIFY] Sync more CUDA Driver API functions.

+ 4.14. Event Management
+ 4.15. Execution Control

ToDo: 4.16 - 4.31 modules of CUDA Driver API.
---
 .../CUDA_Driver_API_functions_supported_by_HIP.md     | 11 ++++++++++-
 hipify-clang/src/Cuda2Hip.cpp                         |  4 ++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md
index 0b3bb540bf..d797b31832 100644
--- a/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md
+++ b/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md
@@ -583,12 +583,21 @@
 
 |   **CUDA**                                                |   **HIP**                     | **CUDA description**                                                                                                           |
 |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
-
+| `cuEventCreate`                                           | `hipEventCreate`              | Creates an event.                                                                                                              |
+| `cuEventDestroy`                                          | `hipEventDestroy`             | Destroys an event.                                                                                                             |
+| `cuEventElapsedTime`                                      | `hipEventElapsedTime`         | Computes the elapsed time between two events.                                                                                  |
+| `cuEventQuery`                                            | `hipEventQuery`               | Queries an event's status.                                                                                                     |
+| `cuEventRecord`                                           | `hipEventRecord`              | Records an event.                                                                                                              |
+| `cuEventSynchronize`                                      | `hipEventSynchronize`         | Waits for an event to complete.                                                                                                |
 
 ## **15. Execution Control**
 
 |   **CUDA**                                                |   **HIP**                     | **CUDA description**                                                                                                           |
 |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
+| `cuFuncGetAttribute`                                      |                               | Returns information about a function.                                                                                          |
+| `cuFuncSetCacheConfig`                                    | `hipFuncSetCacheConfig`       | Sets the preferred cache configuration for a device function.                                                                  |
+| `cuFuncSetSharedMemConfig`                                |                               | Sets the shared memory configuration for a device function.                                                                    |
+| `cuLaunchKernel`                                          | `hipModuleLaunchKernel`       | Launches a CUDA function.                                                                                                      |
 
 
 ## **16. Execution Control [DEPRECATED]**
diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp
index 7f9fefa7f9..9b58173899 100644
--- a/hipify-clang/src/Cuda2Hip.cpp
+++ b/hipify-clang/src/Cuda2Hip.cpp
@@ -987,6 +987,10 @@ struct cuda2hipMap {
     cuda2hipRename["cuEventRecord"]                             = {"hipEventRecord", CONV_EVENT, API_DRIVER};
     cuda2hipRename["cuEventSynchronize"]                        = {"hipEventSynchronize", CONV_EVENT, API_DRIVER};
 
+    // Execution Control
+    cuda2hipRename["cuFuncGetAttribute"]                        = {"hipFuncGetAttribute", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["cuFuncSetCacheConfig"]                      = {"hipFuncSetCacheConfig", CONV_MODULE, API_DRIVER};
+    cuda2hipRename["cuFuncSetSharedMemConfig"]                  = {"hipFuncSetSharedMemConfig", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["cuLaunchKernel"]                            = {"hipModuleLaunchKernel", CONV_MODULE, API_DRIVER};
 
     // Streams

From d239b1a3fc2ac7d2c47cc91d72bcc349efde9134 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Fri, 23 Jun 2017 21:59:24 +0300
Subject: [PATCH 11/19] [HIPIFY] [DOC] Fix typo.

---
 hipify-clang/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hipify-clang/README.md b/hipify-clang/README.md
index c0d74dbe48..d74c53f187 100644
--- a/hipify-clang/README.md
+++ b/hipify-clang/README.md
@@ -70,7 +70,7 @@ To set additional options like Language Selection (only "-x cuda" is supported),
 
 Delimiter "--" is used to separate hipify-clang options (before the delimiter) from clang options (after the delimiter). It is strongly recommended to always specify the delimiter, even if there are no clang specific options at all, in order to avoid possible errors regarding compilation database; in such case delimeter should be the last option in hipify-clang's command line.
 
-Option "-x clang" is also worth specifying in order to convert source CUDA files with extensions other than standard extensions (*.cu, *.cuh).
+Option "-x cuda" is also worth specifying in order to convert source CUDA files with extensions other than standard extensions (*.cu, *.cuh).
 
 ## Disclaimer
 

From 7912e615022de55874fbd09305fcf4e355ffa182 Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Fri, 23 Jun 2017 10:38:29 -0500
Subject: [PATCH 12/19] Clean up old USE_* and RELEASE.md notes.

---
 RELEASE.md                                         | 6 ------
 include/hip/hcc_detail/host_defines.h              | 5 -----
 src/hip_hcc.cpp                                    | 3 ---
 tests/src/hipPointerAttrib.cpp                     | 9 ---------
 tests/src/runtimeApi/memory/p2p_copy_coherency.cpp | 4 ----
 5 files changed, 27 deletions(-)

diff --git a/RELEASE.md b/RELEASE.md
index 5787c59881..d6f3ec594c 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -2,12 +2,6 @@
 
 We have attempted to document known bugs and limitations - in particular the [HIP Kernel Language](docs/markdown/hip_kernel_language.md) document uses the phrase "Under Development", and the [HIP Runtime API bug list](http://gpuopen-professionalcompute-tools.github.io/HIP/bug.html) lists known bugs. 
 
-Upcoming:
-- Stability: Enforce periodic host synchronization to reclaim resources if the application has launched a large
-  number of commands (>1K) without synchronizing.  
-- Register keyword now silently ignored on HCC (previously would emit warning).
-- Doc updates: Add some more frequently asked questions to FAQ, fix TOC in some files, review.
-- Cookbook.
 
 ===================================================================================================
 
diff --git a/include/hip/hcc_detail/host_defines.h b/include/hip/hcc_detail/host_defines.h
index 140cbb0678..212fd650a3 100644
--- a/include/hip/hcc_detail/host_defines.h
+++ b/include/hip/hcc_detail/host_defines.h
@@ -28,7 +28,6 @@ THE SOFTWARE.
 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HOST_DEFINES_H
 #define HIP_INCLUDE_HIP_HCC_DETAIL_HOST_DEFINES_H
 
-#define USE_PROMOTE_FREE_HCC 1
 
 // Add guard to Generic Grid Launch method
 #ifndef GENERIC_GRID_LAUNCH
@@ -61,11 +60,7 @@ THE SOFTWARE.
  */
 // _restrict is supported by the compiler
 #define __shared__     tile_static
-#if USE_PROMOTE_FREE_HCC==1
 #define __constant__   __attribute__((hc))
-#else
-#define __constant__   ADDRESS_SPACE_1
-#endif
 
 #else
 // Non-HCC compiler
diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index d826a0cec3..061714070e 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -48,9 +48,6 @@ THE SOFTWARE.
 #include "env.h"
 
 
-// needs HCC change for hc::no_scope
-#define USE_NO_SCOPE 1
-
 //=================================================================================================
 //Global variables:
 //=================================================================================================
diff --git a/tests/src/hipPointerAttrib.cpp b/tests/src/hipPointerAttrib.cpp
index 7a2ab64bea..bddbff5ce0 100644
--- a/tests/src/hipPointerAttrib.cpp
+++ b/tests/src/hipPointerAttrib.cpp
@@ -32,7 +32,6 @@ THE SOFTWARE.
 
 #endif
 
-#define USE_AV_COPY (__hcc_workweek__ >= 16351) 
 
 size_t Nbytes = 0;
 
@@ -410,21 +409,13 @@ void thread_noise_generator(int iters, size_t numBuffers, Dir addDir, Dir remove
 
         if (addDir == Up) {
             for (char *p = basePtr; p<basePtr + maxSize; p+=bufferSize) {
-#if USE_AV_COPY
                 hc::AmPointerInfo info(p, p, bufferSize, acc, false, false);
                 hc::am_memtracker_add(p, info);
-#else
-                hc::am_memtracker_add(p, bufferSize, acc, false);
-#endif
             }
         } else if (addDir == Down) {
             for (char *p = basePtr+maxSize-bufferSize; p>=0; p-=bufferSize) {
-#if USE_AV_COPY
                 hc::AmPointerInfo info(p, p, bufferSize, acc, false, false);
                 hc::am_memtracker_add(p, info);
-#else
-                hc::am_memtracker_add(p, bufferSize, acc, false);
-#endif
             }
         }
 
diff --git a/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp b/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp
index a5d79464d0..9fadebea1e 100644
--- a/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp
+++ b/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp
@@ -36,7 +36,6 @@ THE SOFTWARE.
 #define USE_HCC_MEMTRACKER 0
 #endif
 
-#define USE_HSA_COPY 1
 
 int elementSizes[] = {16, 1024,524288};
 int nSizes  = sizeof(elementSizes) / sizeof(int);
@@ -102,11 +101,8 @@ void runTest(bool stepAIsCopy, bool hostSync, hipStream_t gpu0Stream, hipStream_
     hipStream_t stepAStream = gpu0Stream;
 
     if (stepAIsCopy) {
-#ifdef USE_HSA_COPY
         HIPCHECK(hipMemcpyAsync(dataGpu1, dataGpu0_0, sizeElements, hipMemcpyDeviceToDevice, stepAStream));
-#endif
     } else {
-        //assert(0); // not yet supported.
         unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
         hipLaunchKernelGGL(memcpyIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, gpu0Stream,
                        dataGpu0_0, dataGpu1, numElements);

From 176ff824d1142afaad1e26702d043ffae40cb763 Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Fri, 23 Jun 2017 10:39:16 -0500
Subject: [PATCH 13/19] Add option to pass names to HCC dispatch API (for
 debug)

---
 src/hip_module.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/hip_module.cpp b/src/hip_module.cpp
index 2a3bfabc28..b8c032da27 100644
--- a/src/hip_module.cpp
+++ b/src/hip_module.cpp
@@ -451,7 +451,13 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
         hc::completion_future cf;
 
         lp.av->dispatch_hsa_kernel(&aql, config[1] /* kernarg*/, kernArgSize, 
-                                  (startEvent || stopEvent) ? &cf : nullptr);
+                                  (startEvent || stopEvent) ? &cf : nullptr
+#define USE_NAMED_KERNEL 0
+#if USE_NAMED_KERNEL
+                                  , f->_name.c_str()
+#endif
+                                  );
+
 
 
         if (startEvent) {

From dff260de7eb44011c8e5795fac5823af0192718d Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Fri, 23 Jun 2017 17:12:04 -0500
Subject: [PATCH 14/19] Add docs for launch_bounds.

---
 docs/markdown/hip_faq.md             |  1 -
 docs/markdown/hip_kernel_language.md | 55 +++++++++++++++++++++-------
 2 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/docs/markdown/hip_faq.md b/docs/markdown/hip_faq.md
index 07ec5f1d8b..ddf70f2875 100644
--- a/docs/markdown/hip_faq.md
+++ b/docs/markdown/hip_faq.md
@@ -70,7 +70,6 @@ See the [API Support Table](CUDA_Runtime_API_functions_supported_by_HIP.md) for
   - printf
   - assert
   - `__restrict__`
-  - `__launch_bounds__`
   - `__threadfence*_`, `__syncthreads*`
   - Unbounded loop unroll
 
diff --git a/docs/markdown/hip_kernel_language.md b/docs/markdown/hip_kernel_language.md
index 3cb7b17a0c..0485188a1f 100644
--- a/docs/markdown/hip_kernel_language.md
+++ b/docs/markdown/hip_kernel_language.md
@@ -610,30 +610,59 @@ Device-side dynamic global memory allocation is under development.  HIP now incl
 implementation of malloc and free that can be called from device functions.
 
 ## `__launch_bounds__`
-GPU multiprocessors have a fixed pool of resources (primarily registers and shared memory) that are shared among the active warps. Using more resources can increase the kernel’s IPC, but it reduces the resources available for other warps and limits the number of warps that can run simultaneously. Thus, GPUs exhibit a complex relationship between resource usage and performance. `__launch_bounds__` allows the application to provide usage hints that influence the resources (primarily registers) employed by the generated code. It’s a function attribute that must be attached to a `__global__` function:
+
+
+GPU multiprocessors have a fixed pool of resources (primarily registers and shared memory) which are shared by the actively running warps. Using more resources can increase IPC of the kernel but reduces the resources available for other warps and limits the number of warps that can be simulaneously running. Thus GPUs have a complex relationship between resource usage and performance.  
+
+__hip_launch_bounds__ allows the application to provide usage hints that influence the resources (primarily registers) used by the generated code.
+__hip_launch_bounds__ is a function attribute that must be attached to a __global__ function:
 
 ```
-__global__ void
-`__launch_bounds__`(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)  
+__global__ void `__launch_bounds__`(MAX_THREADS_PER_BLOCK, MIN_WARPS_PER_EU) MyKernel(...) ...
 MyKernel(hipGridLaunch lp, ...) 
 ...
 ```
 
-`__launch_bounds__` supports two parameters:
+__launch_bounds__ supports two parameters:
+- MAX_THREADS_PER_BLOCK - The programmers guarantees that kernel will be launched with threads less than MAX_THREADS_PER_BLOCK. (On NVCC this maps to the .maxntid PTX directive). If no launch_bounds is specified, MAX_THREADS_PER_BLOCK is the maximum block size supported by the device (typically 1024 or larger). Specifying MAX_THREADS_PER_BLOCK less than the maximum effectively allows the compiler to use more resources than a default unconstrained compilation that supports all possible block sizes at launch time.
+The threads-per-block is the product of (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z).
+- MIN_WARPS_PER_EU - directs the compiler to minimize resource usage so that the requested number of warps can be simultaneously active on a multi-processor. Since active warps compete for the same fixed pool of resources, the compiler must reduce resources required by each warp(primarily registers). MIN_WARPS_PER_EU is optional and defaults to 1 if not specified. Specifying a MIN_WARPS_PER_EU greater than the default 1 effectively constrains the compiler's resource usage.
 
-- **requiredMaxThreadsPerBlock**---the programmer guarantees that the kernel will launch with threadsPerBlock less than requiredMaxThreadsPerBlock. (In nvcc, this parameter maps to the _.maxntid_ PTX directive; in hcc, it maps to the HSAIL _requiredworkgroupsize_ directive.) If launch_bounds is unspecified, requiredMaxThreadsPerBlock is the maximum block size that the device supports (typically 1,024 or larger). Specifying requiredMaxThreadsPerBlock less than the maximum effectively allows the compiler to use more resources than a default unconstrained compilation supporting all possible block sizes at launch time. The threadsPerBlock value is the product hipBlockDim_x * hipBlockDim_y * hipBlockDim_z.
-- **minBlocksPerMultiprocessor**---directs the compiler to minimize resource usage so that the requested number of blocks can be simultaneously active on a multiprocessor. Because active blocks compete for the same fixed resource pool, the compiler must reduce the resource requirements of each block (primarily registers). minBlocksPerMultiprocessor is optional and defaults to 1 if unspecified. Selecting a minBlocksPerMultiprocessor value greater than 1 effectively constrains the compiler's resource usage. 
+### Compiler Impact
+The compiler uses these parameters as follows:
+- The compiler uses the hints only to manage register usage, and does not automatically reduce shared memory or other resources.
+- Compilation fails if compiler cannot generate a kernel which meets the requirements of the specified launch bounds.
+- From MAX_THREADS_PER_BLOCK, the compiler derives the maximum number of warps/block that can be used at launch time.
+Values of MAX_THREADS_PER_BLOCK less than the default allows the compiler to use a larger pool of registers : each warp uses registers, and this hint constains the launch to a warps/block size which is less than maximum.
+- From MIN_WARPS_PER_EU, the compiler derives a maximum number of registers that can be used by the kernel (to meet the required #simultaneous active blocks).
+If MIN_WARPS_PER_EU is 1, then the kernel can use all registers supported by the multiprocessor.
+- The compiler ensures that the registers used in the kernel is less than both allowed maximums, typically by spilling registers (to shared or global memory), or by using more instructions.
+- The compiler may use hueristics to increase register usage, or may simply be able to avoid spilling. The MAX_THREADS_PER_BLOCK is particularly useful in this cases, since it allows the compiler to use more registers and avoid situations where the compiler constrains the register usage (potentially spilling) to meet the requirements of a large block size that is never used at launch time.
 
-The compiler uses these two parameters as follows:
-- It employs the hints only to manage register usage and does not automatically reduce shared memory or other resources.
-- Compilation fails if the compiler cannot generate a kernel that meets the requirements of the specified launch bounds.
-- From requiredMaxThreadsPerBlock, the compiler derives the maximum number of warps per block that are usable at launch time. Values less than the default allow the compiler to use a larger register pool: each warp uses registers, and this hint constrains the launch to a warps-per-block size less than maximum. 
-- From minBlocksPerMultiprocessor, the compiler derives a maximum number of registers that the kernel can use (to meet the required number of simultaneously active blocks). If the value is 1, the kernel can use all registers supported by the multiprocessor.  
 
-The compiler ensures that the kernel uses fewer registers than both allowed maxima specify, typically by spilling to shared memory or using more instructions. It may use heuristics to increase register usage or may simply be able to avoid spilling. The requiredMaxThreadsPerBlock parameter is particularly useful in this case, since it allows the compiler to use more registers---avoiding situations where the compiler constrains the register usage (potentially spilling) to meet the requirements of a large block size never sees use at launch time.  
+### CU and EU Definitions
+A compute unit (CU) is responsible for executing the waves of a work-group. It is composed of one or more execution units (EU) which are responsible for executing waves. An EU can have enough resources to maintain the state of more than one executing wave. This allows an EU to hide latency by switching between waves in a similar way to symmetric multithreading on a CPU. In order to allow the state for multiple waves to fit on an EU, the resources used by a single wave have to be limited. Limiting such resources can allow greater latency hiding, but can result in having to spill some register state to memory. This attribute allows an advanced developer to tune the number of waves that are capable of fitting within the resources of an EU. It can be used to ensure at least a certain number will fit to help hide latency, and can also be used to ensure no more than a certain number will fit to limit cache thrashing.
+ 
+### Porting from CUDA __launch_bounds
+CUDA defines a __launch_bounds which is also designed to control occupancy:
+```
+__launch_bounds(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MULTIPROCESSOR)
+```
 
-HIP/hcc will parse the `launch_bounds` attribute but silently ignores the performance hint. Full support is under development.
+- The second parameter __launch_bounds parameters must be converted to the format used __hip_launch_bounds, which uses warps and execution-units rather than blocks and multi-processors  ( This conversion is performed automatically by the clang hipify tools.)
+```
+MIN_WARPS_PER_EXECUTION_UNIT = (MIN_BLOCKS_PER_MULTIPROCESSOR * MAX_THREADS_PER_BLOCK) / 32
+```
 
+The key differences in the interface are:
+- Warps (rather than blocks):
+The developer is trying to tell the compiler to control resource utilization to guarantee some amount of active Warps/EU for latency hiding.  Specifying active warps in terms of blocks appears to hide the micro-architectural details of the warp size, but makes the interface more confusing since the developer ultimately needs to compute the number of warps to obtain the desired level of control. 
+- Execution Units  (rather than multiProcessor):
+The use of execution units rather than multiprocessors provides support for architectures with multiple execution units/multi-processor. For example, the AMD GCN architecture has 4 execution units per multiProcessor.  The hipDeviceProps has a field executionUnitsPerMultiprocessor.
+Platform-specific coding techniques such as #ifdef can be used to specify different launch_bounds for NVCC and HCC platforms, if desired. 
+
+
+### maxregcount
 Unlike nvcc, hcc does not support the "--maxregcount" option.  Instead, users are encouraged to use the hip_launch_bounds directive since the parameters are more intuitive and portable than
 micro-architecture details like registers, and also the directive allows per-kernel control rather than an entire file.  hip_launch_bounds works on both hcc and nvcc targets.
 

From 522e059a79bc0c68fde0234f572a6691f220fb66 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Fri, 23 Jun 2017 19:05:34 -0500
Subject: [PATCH 15/19] fixed default args for symbol memcpy apis

Change-Id: Ie0b63f8b9c5535eb3946bd6af3f30fe71a015244
---
 include/hip/hcc_detail/hip_runtime_api.h  | 8 ++++----
 include/hip/nvcc_detail/hip_runtime_api.h | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h
index fde38c8395..724bf09b21 100644
--- a/include/hip/hcc_detail/hip_runtime_api.h
+++ b/include/hip/hcc_detail/hip_runtime_api.h
@@ -1194,7 +1194,7 @@ hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t siz
  *
  *  @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpy2DFromArray, hipMemcpyArrayToArray, hipMemcpy2DArrayToArray, hipMemcpyFromSymbol, hipMemcpyAsync, hipMemcpy2DAsync, hipMemcpyToArrayAsync, hipMemcpy2DToArrayAsync, hipMemcpyFromArrayAsync, hipMemcpy2DFromArrayAsync, hipMemcpyToSymbolAsync, hipMemcpyFromSymbolAsync
  */
-hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind);
+hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t sizeBytes, size_t offset = 0, hipMemcpyKind kind = hipMemcpyHostToDevice);
 
 
 /**
@@ -1214,11 +1214,11 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t siz
  *
  *  @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpy2DFromArray, hipMemcpyArrayToArray, hipMemcpy2DArrayToArray, hipMemcpyFromSymbol, hipMemcpyAsync, hipMemcpy2DAsync, hipMemcpyToArrayAsync, hipMemcpy2DToArrayAsync, hipMemcpyFromArrayAsync, hipMemcpy2DFromArrayAsync, hipMemcpyToSymbolAsync, hipMemcpyFromSymbolAsync
  */
-hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream);
+hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream = 0);
 
-hipError_t hipMemcpyFromSymbol(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind);
+hipError_t hipMemcpyFromSymbol(void *dst, const void* symbolName, size_t sizeBytes, size_t offset = 0, hipMemcpyKind kind = hipMemcpyDeviceToHost);
 
-hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream);
+hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream = 0);
 
 /**
  *  @brief Copy data from src to dst asynchronously.
diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h
index f92523a3e3..b1011aac6c 100644
--- a/include/hip/nvcc_detail/hip_runtime_api.h
+++ b/include/hip/nvcc_detail/hip_runtime_api.h
@@ -360,16 +360,16 @@ inline static hipError_t hipMemcpyToSymbol(const void* symbol, const void* src,
 	return hipCUDAErrorTohipError(cudaMemcpyToSymbol(symbol, src, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(copyType)));
 }
 
-inline static hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src, size_t sizeBytes, size_t offset, hipMemcpyKind copyType, hipStream_t stream) {
-    return hipCUDAErrorTohipError(cudaMemcpyToSymbolAsync(symbol, src, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(copyType)));
+inline static hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src, size_t sizeBytes, size_t offset, hipMemcpyKind copyType, hipStream_t stream = 0) {
+    return hipCUDAErrorTohipError(cudaMemcpyToSymbolAsync(symbol, src, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(copyType), stream));
 }
 
-inline static hipError_t hipMemcpyFromSymbol(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind)
+inline static hipError_t hipMemcpyFromSymbol(void *dst, const void* symbolName, size_t sizeBytes, size_t offset = 0, hipMemcpyKind kind = hipMemcpyDeviceToHost)
 {
     return hipCUDAErrorTohipError(cudaMemcpyFromSymbol(dst, symbolName, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(kind)));
 }
 
-inline static hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream)
+inline static hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream = 0)
 {
     return hipCUDAErrorTohipError(cudaMemcpyFromSymbolAsync(dst, symbolName, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(kind), stream));
 }

From 1df08626c896d6a89beedc098a1fa7bfda00be27 Mon Sep 17 00:00:00 2001
From: "Sun, Peng" <peng.sun@amd.com>
Date: Mon, 26 Jun 2017 15:29:38 -0500
Subject: [PATCH 16/19] Add support of HIP_HIDDEN_FREE_MEM, to deduct the
 returned available memory from hipMemGetInfo API, measured in MB.

Change-Id: I7a8260c12e032e04e26611db4c38c893a29f2653
---
 src/hip_hcc.cpp        | 5 +++--
 src/hip_hcc_internal.h | 2 +-
 src/hip_memory.cpp     | 4 ++++
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index 061714070e..364db80537 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -78,6 +78,7 @@ int HIP_FORCE_P2P_HOST = 0;
 int HIP_FAIL_SOC = 0;
 int HIP_DENY_PEER_ACCESS = 0;
 
+int HIP_HIDDEN_FREE_MEM = 0;
 // Force async copies to actually use the synchronous copy interface.
 int HIP_FORCE_SYNC_COPY = 0;
 
@@ -1204,8 +1205,8 @@ void HipReadEnv()
         tokenize(HIP_LAUNCH_BLOCKING_KERNELS, ',', &g_hipLaunchBlockingKernels);
     }
     READ_ENV_I(release, HIP_API_BLOCKING, 0, "Make HIP APIs 'host-synchronous', so they block until completed.  Impacts hipMemcpyAsync, hipMemsetAsync." );
-
-
+    
+    READ_ENV_I(release, HIP_HIDDEN_FREE_MEM, 0, "Amount of memory to hide from the free memory reported by hipMemGetInfo, specified in MB. Impacts hipMemGetInfo." );
 
     READ_ENV_C(release, HIP_DB, 0,  "Print debug info.  Bitmask (HIP_DB=0xff) or flags separated by '+' (HIP_DB=api+sync+mem+copy)", HIP_DB_callback);
     if ((HIP_DB & (1<<DB_API))  && (HIP_TRACE_API == 0)) {
diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h
index c3f8b72311..1499406780 100644
--- a/src/hip_hcc_internal.h
+++ b/src/hip_hcc_internal.h
@@ -61,7 +61,7 @@ extern int HIP_FORCE_P2P_HOST;
 
 extern int HIP_COHERENT_HOST_ALLOC;
 
-
+extern int HIP_HIDDEN_FREE_MEM;
 //---
 // Chicken bits for disabling functionality to work around potential issues:
 extern int HIP_SYNC_HOST_ALLOC;
diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp
index ce65579e34..6728ae42d3 100644
--- a/src/hip_memory.cpp
+++ b/src/hip_memory.cpp
@@ -1130,6 +1130,10 @@ hipError_t hipMemGetInfo  (size_t *free, size_t *total)
             hc::am_memtracker_sizeinfo(device->_acc, &deviceMemSize, &hostMemSize, &userMemSize);
 
             *free =  device->_props.totalGlobalMem - deviceMemSize;
+
+           // Deduct the amount of memory from the free memory reported from the system
+           if(HIP_HIDDEN_FREE_MEM)
+               *free -= (size_t)HIP_HIDDEN_FREE_MEM*1024*1024;
         }
         else {
              e = hipErrorInvalidValue;

From 1c3a8b256469428fbbb8cfa80c0be1bfed405def Mon Sep 17 00:00:00 2001
From: sunway513 <peng.sun@amd.com>
Date: Mon, 26 Jun 2017 22:47:22 +0000
Subject: [PATCH 17/19] Fix docs for HIP_TRACE_API bit masks.

---
 docs/markdown/hip_profiling.md | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/docs/markdown/hip_profiling.md b/docs/markdown/hip_profiling.md
index ef349ef2a5..ac277c8433 100644
--- a/docs/markdown/hip_profiling.md
+++ b/docs/markdown/hip_profiling.md
@@ -268,9 +268,12 @@ PASSED!
 ```
 
 HIP_TRACE_API supports multiple levels of debug information:
-   - 0x1 = print all HIP APIs
-   - 0x2 = print HIP APIs which initiate GPU kernels, copies, or memsets.  Includes hipLaunchKernel, hipMemcpy*, hipMemset*.
-   - 0x4 = print HIP APIs which allocate or free memory.  Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree.
+   - 0x1 = print all HIP APIs.  This is the most verbose setting; the flags below allow selecting a subset.
+   - 0x2 = print HIP APIs which initiate GPU kernel commands.  Includes hipLaunchKernel, hipLaunchModuleKernel
+   - 0x4 = print HIP APIs which initiate GPU memory commands.  Includes hipMemcpy*, hipMemset*.
+   - 0x8 = print HIP APIs which allocate or free memory.  Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree.
+
+These can be combined.  For example, HIP_TRACE_API=6 shows a concise view of the HIP commands (both kernel and memory) that are sent to the GPU.
 
 
 #### Color

From 148dbc1027b908bfc32eca7231c2b250878d9661 Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Tue, 27 Jun 2017 12:17:12 -0500
Subject: [PATCH 18/19] Set default HIP_HIDDEN_FREE_MEM

---
 src/hip_hcc.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index 364db80537..be591f2f04 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -78,7 +78,7 @@ int HIP_FORCE_P2P_HOST = 0;
 int HIP_FAIL_SOC = 0;
 int HIP_DENY_PEER_ACCESS = 0;
 
-int HIP_HIDDEN_FREE_MEM = 0;
+int HIP_HIDDEN_FREE_MEM = 256;
 // Force async copies to actually use the synchronous copy interface.
 int HIP_FORCE_SYNC_COPY = 0;
 

From 1e1654c225290793269f505cb67880f68c764407 Mon Sep 17 00:00:00 2001
From: "Sun, Peng" <peng.sun@amd.com>
Date: Tue, 27 Jun 2017 14:15:16 -0500
Subject: [PATCH 19/19] Remove some warning debug info and add weak attribute
 back to GGL __global__ define

Change-Id: I2021b107dda697b1262d44fa1506465e94a3916b
---
 include/hip/hcc_detail/host_defines.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/include/hip/hcc_detail/host_defines.h b/include/hip/hcc_detail/host_defines.h
index 212fd650a3..b2e7ac2617 100644
--- a/include/hip/hcc_detail/host_defines.h
+++ b/include/hip/hcc_detail/host_defines.h
@@ -41,13 +41,10 @@ THE SOFTWARE.
 #define __host__     __attribute__((cpu))
 #define __device__   __attribute__((hc))
 
-//#warning "HOST DEFINE header included"
 #if GENERIC_GRID_LAUNCH == 0
-//#warning "original global define reached"
 #define __global__  __attribute__((hc_grid_launch)) __attribute__((used))
 #else
-//#warning "GGL global define reached"
-#define __global__ __attribute__((annotate("hip__global__"), hc, used))
+#define __global__ __attribute__((annotate("hip__global__"), hc, used, weak))
 #endif //GENERIC_GRID_LAUNCH
 
 #define __noinline__      __attribute__((noinline))