From 73f88cb3bc9abfcda7a1d446841c23f8b71a9ba0 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Tue, 21 Jun 2016 11:11:20 -0500
Subject: [PATCH 1/5] added device functions to docs

Change-Id: I11ad1af86274620782986e227888b2d5f0544d8f
---
 docs/markdown/hip_kernel_language.md | 70 ++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
diff --git a/docs/markdown/hip_kernel_language.md b/docs/markdown/hip_kernel_language.md
index e382ede4b3..23cde7fee6 100644
--- a/docs/markdown/hip_kernel_language.md
+++ b/docs/markdown/hip_kernel_language.md
@@ -299,6 +299,42 @@ Following is the list of supported single precision mathematical functions.
 | float tanhf ( float  x ) <br><sub>Calculate the hyperbolic tangent of the input argument.</sub> | ✓ | ✓ |
 | float truncf ( float  x ) <br><sub>Truncate input argument to the integral part.</sub> | ✓ | ✓ |
 | float tgammaf ( float  x ) <br><sub>Calculate the gamma function of the input argument.</sub> | ✓ | ✓ |
+| float erfcinvf ( float  y ) <br><sub>Calculate the inverse complementary function of the input argument.</sub> | ✓ | ✓ |
+| float erfcxf ( float  x ) <br><sub>Calculate the scaled complementary error function of the input argument.</sub> | ✓ | ✓ |
+| float erfinvf ( float  y ) <br><sub>Calculate the inverse error function of the input argument.</sub> | ✓ | ✓ |
+| float fdividef ( float x, float  y ) <br><sub>Divide two floating point values.</sub> | ✓ | ✓ |
+| float frexpf ( float  x, int *nptr ) <br><sub>Extract mantissa and exponent of a floating-point value.</sub> | ✓ | ✓ |
+| float j0f ( float  x ) <br><sub>Calculate the value of the Bessel function of the first kind of order 0 for the input argument.</sub> | ✓ | ✓ |
+| float j1f ( float  x ) <br><sub>Calculate the value of the Bessel function of the first kind of order 1 for the input argument.</sub> | ✓ | ✓ |
+| float jnf ( int n, float  x ) <br><sub>Calculate the value of the Bessel function of the first kind of order n for the input argument.</sub> | ✓ | ✓ |
+| float lgammaf ( float  x ) <br><sub>Calculate the natural logarithm of the absolute value of the gamma function of the input argument.</sub> | ✓ | ✓ |
+| long long int llrintf ( float  x ) <br><sub>Round input to nearest integer value.</sub> | ✓ | ✓ |
+| long long int llroundf ( float  x ) <br><sub>Round to nearest integer value.</sub> | ✓ | ✓ |
+| long int lrintf ( float  x ) <br><sub>Round input to nearest integer value.</sub> | ✓ | ✓ |
+| long int lroundf ( float  x ) <br><sub>Round to nearest integer value.</sub> | ✓ | ✓ |
+| float modff ( float  x, float *iptr ) <br><sub>Break down the input argument into fractional and integral parts.</sub> | ✓ | ✓ |
+| float nextafterf ( float  x, float y ) <br><sub>Returns next representable single-precision floating-point value after argument.</sub> | ✓ | ✓ |
+| float norm3df ( float  a, float b, float c ) <br><sub>Calculate the square root of the sum of squares of three coordinates of the argument.</sub> | ✓ | ✓ |
+| float norm4df ( float  a, float b, float c, float d ) <br><sub>Calculate the square root of the sum of squares of four coordinates of the argument.</sub> | ✓ | ✓ |
+| float normcdff ( float  y ) <br><sub>Calculate the standard normal cumulative distribution function.</sub> | ✓ | ✓ |
+| float normcdfinvf ( float  y ) <br><sub>Calculate the inverse of the standard normal cumulative distribution function.</sub> | ✓ | ✓ |
+| float normf ( int dim, const float *a ) <br><sub>Calculate the square root of the sum of squares of any number of coordinates.</sub> | ✓ | ✓ |
+| float rcbrtf ( float x ) <br><sub>Calculate the reciprocal cube root function.</sub> | ✓ | ✓ |
+| float remquof ( float x, float y, int *quo ) <br><sub>Compute single-precision floating-point remainder and part of quotient.</sub> | ✓ | ✓ |
+| float rhypotf ( float x, float y ) <br><sub>Calculate one over the square root of the sum of squares of two arguments.</sub> | ✓ | ✓ |
+| float rintf ( float x ) <br><sub>Round input to nearest integer value in floating-point.</sub> | ✓ | ✓ |
+| float rnorm3df ( float  a, float b, float c ) <br><sub>Calculate one over the square root of the sum of squares of three coordinates of the argument.</sub> | ✓ | ✓ |
+| float rnorm4df ( float  a, float b, float c, float d ) <br><sub>Calculate one over the square root of the sum of squares of four coordinates of the argument.</sub> | ✓ | ✓ |
+| float rnormf ( int dim, const float *a ) <br><sub>Calculate the reciprocal of square root of the sum of squares of any number of coordinates.</sub> | ✓ | ✓ |
+| float scalblnf ( float x, long int n ) <br><sub>Scale floating-point input by integer power of two.</sub> | ✓ | ✓ |
+| void sincosf ( float x, float *sptr, float *cptr ) <br><sub>Calculate the sine and cosine of the first input argument.</sub> | ✓ | ✓ |
+| void sincospif ( float x, float *sptr, float *cptr ) <br><sub>Calculate the sine and cosine of the first input argument multiplied by PI.</sub> | ✓ | ✓ |
+| float y0f ( float  x ) <br><sub>Calculate the value of the Bessel function of the second kind of order 0 for the input argument.</sub> | ✓ | ✓ |
+| float y1f ( float  x ) <br><sub>Calculate the value of the Bessel function of the second kind of order 1 for the input argument.</sub> | ✓ | ✓ |
+| float ynf ( int n, float  x ) <br><sub>Calculate the value of the Bessel function of the second kind of order n for the input argument.</sub> | ✓ | ✓ |
+
+
+
 <sub><b id="f1"><sup>[1]</sup></b> __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers.</sub> [↩](#a1)
 
 ### Double Precision Mathematical Functions
@@ -360,6 +396,40 @@ Following is the list of supported double precision mathematical functions.
 | double tanh ( double  x ) <br><sub>Calculate the hyperbolic tangent of the input argument.</sub> | ✓ | ✓ |
 | double tgamma ( double  x ) <br><sub>Calculate the gamma function of the input argument.</sub> | ✓ | ✓ |
 | double trunc ( double  x ) <br><sub>Truncate input argument to the integral part.</sub> | ✓ | ✓ |
+| double erfcinv ( double  y ) <br><sub>Calculate the inverse complementary function of the input argument.</sub> | ✓ | ✓ |
+| double erfcx ( double  x ) <br><sub>Calculate the scaled complementary error function of the input argument.</sub> | ✓ | ✓ |
+| double erfinv ( double  y ) <br><sub>Calculate the inverse error function of the input argument.</sub> | ✓ | ✓ |
+| double frexp ( float  x, int *nptr ) <br><sub>Extract mantissa and exponent of a floating-point value.</sub> | ✓ | ✓ |
+| double j0 ( double  x ) <br><sub>Calculate the value of the Bessel function of the first kind of order 0 for the input argument.</sub> | ✓ | ✓ |
+| double j1 ( double  x ) <br><sub>Calculate the value of the Bessel function of the first kind of order 1 for the input argument.</sub> | ✓ | ✓ |
+| double jn ( int n, double  x ) <br><sub>Calculate the value of the Bessel function of the first kind of order n for the input argument.</sub> | ✓ | ✓ |
+| double lgamma ( double  x ) <br><sub>Calculate the natural logarithm of the absolute value of the gamma function of the input argument.</sub> | ✓ | ✓ |
+| long long int llrint ( double  x ) <br><sub>Round input to nearest integer value.</sub> | ✓ | ✓ |
+| long long int llround ( double  x ) <br><sub>Round to nearest integer value.</sub> | ✓ | ✓ |
+| long int lrint ( double  x ) <br><sub>Round input to nearest integer value.</sub> | ✓ | ✓ |
+| long int lround ( double  x ) <br><sub>Round to nearest integer value.</sub> | ✓ | ✓ |
+| double modf ( double  x, double *iptr ) <br><sub>Break down the input argument into fractional and integral parts.</sub> | ✓ | ✓ |
+| double nextafter ( double  x, double y ) <br><sub>Returns next representable single-precision floating-point value after argument.</sub> | ✓ | ✓ |
+| double norm3d ( double  a, double b, double c ) <br><sub>Calculate the square root of the sum of squares of three coordinates of the argument.</sub> | ✓ | ✓ |
+| float norm4d ( double  a, double b, double c, double d ) <br><sub>Calculate the square root of the sum of squares of four coordinates of the argument.</sub> | ✓ | ✓ |
+| double normcdf ( double  y ) <br><sub>Calculate the standard normal cumulative distribution function.</sub> | ✓ | ✓ |
+| double normcdfinv ( double  y ) <br><sub>Calculate the inverse of the standard normal cumulative distribution function.</sub> | ✓ | ✓ |
+| double rcbrt ( double x ) <br><sub>Calculate the reciprocal cube root function.</sub> | ✓ | ✓ |
+| double remquo ( double x, double y, int *quo ) <br><sub>Compute single-precision floating-point remainder and part of quotient.</sub> | ✓ | ✓ |
+| double rhypot ( double x, double y ) <br><sub>Calculate one over the square root of the sum of squares of two arguments.</sub> | ✓ | ✓ |
+| double rint ( double x ) <br><sub>Round input to nearest integer value in floating-point.</sub> | ✓ | ✓ |
+| double rnorm3d ( double a, double b, double c ) <br><sub>Calculate one over the square root of the sum of squares of three coordinates of the argument.</sub> | ✓ | ✓ |
+| double rnorm4d ( double a, double b, double c, double d ) <br><sub>Calculate one over the square root of the sum of squares of four coordinates of the argument.</sub> | ✓ | ✓ |
+| double rnorm ( int dim, const double *a ) <br><sub>Calculate the reciprocal of square root of the sum of squares of any number of coordinates.</sub> | ✓ | ✓ |
+| double scalbln ( double x, long int n ) <br><sub>Scale floating-point input by integer power of two.</sub> | ✓ | ✓ |
+| void sincos ( double x, double *sptr, double *cptr ) <br><sub>Calculate the sine and cosine of the first input argument.</sub> | ✓ | ✓ |
+| void sincospi ( double x, double *sptr, double *cptr ) <br><sub>Calculate the sine and cosine of the first input argument multiplied by PI.</sub> | ✓ | ✓ |
+| double y0f ( double  x ) <br><sub>Calculate the value of the Bessel function of the second kind of order 0 for the input argument.</sub> | ✓ | ✓ |
+| double y1 ( double  x ) <br><sub>Calculate the value of the Bessel function of the second kind of order 1 for the input argument.</sub> | ✓ | ✓ |
+| double yn ( int n, double  x ) <br><sub>Calculate the value of the Bessel function of the second kind of order n for the input argument.</sub> | ✓ | ✓ |
+
+
+
 <sub><b id="f2"><sup>[1]</sup></b> __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers.</sub> [↩](#a2)
 
 ### Integer Intrinsics

From 100a744d926ee6b0ace439c0aa07b2803d17ded4 Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Mon, 20 Jun 2016 23:28:45 -0500
Subject: [PATCH 2/5] Switch to hc_* coordinate builtins (replace amp_)

Change-Id: I0a8871f0c9f047eb45a7391fd032100af2bbd4e0
---
 include/hcc_detail/hip_runtime.h | 34 ++++++--------------------------
 1 file changed, 6 insertions(+), 28 deletions(-)

diff --git a/include/hcc_detail/hip_runtime.h b/include/hcc_detail/hip_runtime.h
index 3d0bd7b1f2..066db4bbec 100644
--- a/include/hcc_detail/hip_runtime.h
+++ b/include/hcc_detail/hip_runtime.h
@@ -485,45 +485,23 @@ __device__ float __dsqrt_rz(double x);
  * Kernel launching
  */
 
-// Choose correct polarity of xyz/zyx ordering:
-#if __hcc_workweek__ >= 16123
 
-#define hipThreadIdx_x (amp_get_local_id(0))
-#define hipThreadIdx_y (amp_get_local_id(1))
-#define hipThreadIdx_z (amp_get_local_id(2))
+#define hipThreadIdx_x (hc_get_workitem_id(0))
+#define hipThreadIdx_y (hc_get_workitem_id(1))
+#define hipThreadIdx_z (hc_get_workitem_id(2))
 
 #define hipBlockIdx_x  (hc_get_group_id(0))
 #define hipBlockIdx_y  (hc_get_group_id(1))
 #define hipBlockIdx_z  (hc_get_group_id(2))
 
-#define hipBlockDim_x  (amp_get_local_size(0))
-#define hipBlockDim_y  (amp_get_local_size(1))
-#define hipBlockDim_z  (amp_get_local_size(2))
+#define hipBlockDim_x  (hc_get_group_size(0))
+#define hipBlockDim_y  (hc_get_group_size(1))
+#define hipBlockDim_z  (hc_get_group_size(2))
 
 #define hipGridDim_x   (hc_get_num_groups(0))
 #define hipGridDim_y   (hc_get_num_groups(1))
 #define hipGridDim_z   (hc_get_num_groups(2))
 
-#else
-
-#define hipThreadIdx_x (amp_get_local_id(2))
-#define hipThreadIdx_y (amp_get_local_id(1))
-#define hipThreadIdx_z (amp_get_local_id(0))
-
-#define hipBlockIdx_x  (hc_get_group_id(2))
-#define hipBlockIdx_y  (hc_get_group_id(1))
-#define hipBlockIdx_z  (hc_get_group_id(0))
-
-#define hipBlockDim_x  (amp_get_local_size(2))
-#define hipBlockDim_y  (amp_get_local_size(1))
-#define hipBlockDim_z  (amp_get_local_size(0))
-
-#define hipGridDim_x   (hc_get_num_groups(2))
-#define hipGridDim_y   (hc_get_num_groups(1))
-#define hipGridDim_z   (hc_get_num_groups(0))
-
-#endif // __hcc_workweek__ check
-
 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
 
 #define HIP_KERNEL_NAME(...) __VA_ARGS__

From 9f29cc698914d2c7b4bebc6113180361dddf06f8 Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Mon, 20 Jun 2016 23:46:42 -0500
Subject: [PATCH 3/5] Grid-launch updates to 2.0 and cleanup of old.

_ Use fields from GRID_LAUNCH_20 structure
  (See USE_GRID_LAUNCH_20 define, currently set to 0)
  "1" will require HCC support.
- Remove old DISABLE_GRID_LAUNCH support.

Change-Id: I584ce648d217251789a6283cf27feb24cb7dc8d1
---
 docs/markdown/hip_porting_guide.md | 31 --------------------
 include/hcc_detail/hip_runtime.h   | 46 +++++++++++++-----------------
 include/hcc_detail/host_defines.h  |  4 ---
 include/hip_common.h               |  3 --
 src/hip_hcc.cpp                    | 44 ++++++++++++++++++++++++++++
 tests/src/kernel/hipGridLaunch.cpp |  1 -
 6 files changed, 64 insertions(+), 65 deletions(-)

diff --git a/docs/markdown/hip_porting_guide.md b/docs/markdown/hip_porting_guide.md
index de5c590e12..6ae0a6d31e 100644
--- a/docs/markdown/hip_porting_guide.md
+++ b/docs/markdown/hip_porting_guide.md
@@ -464,37 +464,6 @@ hipcc-cmd: /opt/hcc/bin/hcc  -hc -I/opt/hcc/include -stdlib=libc++ -I../../../..
 If you pass a ".cu" file, hcc will attempt to compile it as a Cuda language file. You must tell hcc that its in fact a C++ file: use the "-x c++" option.
 
 
-#### grid_launch kernel dispatch - fallback
-HIP uses an hcc language feature called "grid_launch". The [[hc_grid_launch]] attribute that can be attached to a function definition, and the first parameter is of type grid_launch_parm.
-When a [[hc_grid_launch]] function is called, hcc runtime uses the grid_launch_parm to control the execution configuration of the kernel 
-(including the grid and group dimensions, the queue, and dynamic group memory allocations).   By default, the hipLaunchKernel macro creates a grid_launch_parm structure and launches a
-[[hc_grid_launch]] kernel.  grid_launch is a relatively new addition to hcc so this section describes how to fall back to a traditional calling sequence which invokes a standard host function
-which calls a hc::parallel_for_each to launch the kernel.  
-
-First, set DISABLE_GRID_LAUNCH:
-include/hip_common.h
-```
-// Set this define to disable GRID_LAUNCH
-#define DISABLE_GRID_LAUNCH
-```
-
-Inside any kernel use the KERNELBEGIN as the first line in the kernel function, and KERNELEND as the last line.  For example:
-```
-__global__ void
-MyKernel(hipLaunchParm lp, float *C, const float *A, size_t N)
-{
-    KERNELBEGIN; // Required if hc_grid_launch is disabled
-
-	int tid = hipBlockIdx_x*MAX_THREADS_PER_BLOCK + hipThreadIdx_x;
-
-    if (tid < N) {
-        C[tid] = A[tid];
-    }
-
-    KERNELEND; // Required if hc_grid_launch is disabled
-}
-```
-
 #### HIP Environment Variables
 
 On the HCC path, HIP provides a number of environment variables that control the behavior of HIP.  Some of these are useful for appliction development (for example HIP_VISIBLE_DEVICES, HIP_LAUNCH_BLOCKING),
diff --git a/include/hcc_detail/hip_runtime.h b/include/hcc_detail/hip_runtime.h
index 066db4bbec..8594954e75 100644
--- a/include/hcc_detail/hip_runtime.h
+++ b/include/hcc_detail/hip_runtime.h
@@ -38,6 +38,10 @@ THE SOFTWARE.
 #include <stddef.h>
 
 
+// Use field names for grid_launch 2.0 structure:
+#define USE_GRID_LAUNCH_20 0
+
+
 #define CUDA_SUCCESS hipSuccess
 
 #include <hip/hip_runtime_api.h>
@@ -517,7 +521,20 @@ void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp);
 #define KNRM  "\x1B[0m"
 #define KGRN  "\x1B[32m"
 
-#if not defined(DISABLE_GRID_LAUNCH)
+#if USE_GRID_LAUNCH_20
+#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
+do {\
+  grid_launch_parm lp;\
+  lp.dynamic_group_mem_bytes = _groupMemBytes; \
+  hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp)); \
+    if (HIP_TRACE_API) {\
+        fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:(%d,%d,%d) groupDim:(%d,%d,%d) groupMem:+%d stream=%p\n" KNRM, \
+                #_kernelName, lp.grid_dim.x, lp.grid_dim.y, lp.grid_dim.z, lp.group_dim.x, lp.group_dim.y, lp.group_dim.z, lp.groupMemBytes, (void*)(_stream));\
+    }\
+  _kernelName (lp, ##__VA_ARGS__);\
+  ihipPostLaunchKernel(trueStream, lp);\
+} while(0)
+#else
 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
 do {\
   grid_launch_parm lp;\
@@ -528,35 +545,12 @@ do {\
                 #_kernelName, lp.gridDim.x, lp.gridDim.y, lp.gridDim.z, lp.groupDim.x, lp.groupDim.y, lp.groupDim.z, lp.groupMemBytes, (void*)(_stream));\
     }\
   _kernelName (lp, ##__VA_ARGS__);\
-  ihipPostLaunchKernel(trueStream, lp);\
-} while(0)
-
-#else
-#warning(DISABLE_GRID_LAUNCH set)
-
-#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
-do {\
-  grid_launch_parm lp;\
-  lp.gridDim.x = _numBlocks3D.x * _blockDim3D.x;/*Convert from #blocks to #threads*/ \
-  lp.gridDim.y = _numBlocks3D.y * _blockDim3D.y;/*Convert from #blocks to #threads*/ \
-  lp.gridDim.z = _numBlocks3D.z * _blockDim3D.z;/*Convert from #blocks to #threads*/ \
-  lp.groupDim.x = _blockDim3D.x; \
-  lp.groupDim.y = _blockDim3D.y; \
-  lp.groupDim.z = _blockDim3D.z; \
-  lp.groupMemBytes = _groupMemBytes;\
-  hc::completion_future cf;\
-  lp.cf = &cf;  \
-  hipStream_t trueStream = (ihipPreLaunchKernel(_stream, &lp.av)); \
-    if (HIP_TRACE_API) {\
-        fprintf(stderr, "==hip-api: launch '%s' gridDim:[%d.%d.%d] groupDim:[%d.%d.%d] groupMem:+%d stream=%p\n", \
-                #_kernelName, lp.gridDim.z, lp.gridDim.y, lp.gridDim.x, lp.groupDim.z, lp.groupDim.y, lp.groupDim.x, lp.groupMemBytes, (void*)(_stream));\
-    }\
-  _kernelName (lp, ##__VA_ARGS__);\
   ihipPostLaunchKernel(trueStream, cf);\
 } while(0)
-/*end hipLaunchKernel */
+
 #endif
 
+
 #elif defined (__HCC_C__)
 
 //TODO - develop C interface.
diff --git a/include/hcc_detail/host_defines.h b/include/hcc_detail/host_defines.h
index dd9d60fc27..cc22b39ea7 100644
--- a/include/hcc_detail/host_defines.h
+++ b/include/hcc_detail/host_defines.h
@@ -35,11 +35,7 @@ THE SOFTWARE.
 #define __host__     __attribute__((cpu))
 #define __device__   __attribute__((hc))
 
-#ifndef DISABLE_GRID_LAUNCH
 #define __global__  __attribute__((hc_grid_launch))
-#else
-#define __global__
-#endif
 
 #define __noinline__      __attribute__((noinline))
 #define __forceinline__   __attribute__((always_inline))
diff --git a/include/hip_common.h b/include/hip_common.h
index 919105fef5..4c75568f4d 100644
--- a/include/hip_common.h
+++ b/include/hip_common.h
@@ -22,9 +22,6 @@ THE SOFTWARE.
 
 #pragma once
 
-// Disable use of grid_launch feature in HCC compiler.
-//#define DISABLE_GRID_LAUNCH
-
 // Common code included at start of every hip file.
 // Auto enable __HIP_PLATFORM_HCC__ if compiling with HCC
 // Other compiler (GCC,ICC,etc) need to set one of these macros explicitly
diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index b24eed918a..28f83d9c0b 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -1091,12 +1091,23 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_
 {
 	std::call_once(hip_initialized, ihipInit);
     stream = ihipSyncAndResolveStream(stream);
+#if USE_GRID_LAUNCH_20 
+    lp->grid_dim.x = grid.x;
+    lp->grid_dim.y = grid.y;
+    lp->grid_dim.z = grid.z;
+    lp->group_dim.x = block.x;
+    lp->group_dim.y = block.y;
+    lp->group_dim.z = block.z;
+    lp->barrier_bit = barrier_bit_queue_default;
+    lp->launch_fence = -1;
+#else
     lp->gridDim.x = grid.x;
     lp->gridDim.y = grid.y;
     lp->gridDim.z = grid.z;
     lp->groupDim.x = block.x;
     lp->groupDim.y = block.y;
     lp->groupDim.z = block.z;
+#endif
     stream->lockopen_preKernelCommand();
 //    *av = &stream->_av;
     lp->av = &stream->_av;
@@ -1109,12 +1120,23 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, gri
 {
 	std::call_once(hip_initialized, ihipInit);
     stream = ihipSyncAndResolveStream(stream);
+#if USE_GRID_LAUNCH_20 
+    lp->grid_dim.x = grid;
+    lp->grid_dim.y = 1;
+    lp->grid_dim.z = 1;
+    lp->group_dim.x = block.x;
+    lp->group_dim.y = block.y;
+    lp->group_dim.z = block.z;
+    lp->barrier_bit = barrier_bit_queue_default;
+    lp->launch_fence = -1;
+#else
     lp->gridDim.x = grid;
     lp->gridDim.y = 1;
     lp->gridDim.z = 1;
     lp->groupDim.x = block.x;
     lp->groupDim.y = block.y;
     lp->groupDim.z = block.z;
+#endif
     stream->lockopen_preKernelCommand();
 //    *av = &stream->_av;
     lp->av = &stream->_av;
@@ -1128,12 +1150,23 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, gri
 {
 	std::call_once(hip_initialized, ihipInit);
     stream = ihipSyncAndResolveStream(stream);
+#if USE_GRID_LAUNCH_20 
+    lp->grid_dim.x = grid.x;
+    lp->grid_dim.y = grid.y;
+    lp->grid_dim.z = grid.z;
+    lp->group_dim.x = block;
+    lp->group_dim.y = 1;
+    lp->group_dim.z = 1;
+    lp->barrier_bit = barrier_bit_queue_default;
+    lp->launch_fence = -1;
+#else
     lp->gridDim.x = grid.x;
     lp->gridDim.y = grid.y;
     lp->gridDim.z = grid.z;
     lp->groupDim.x = block;
     lp->groupDim.y = 1;
     lp->groupDim.z = 1;
+#endif
     stream->lockopen_preKernelCommand();
 //    *av = &stream->_av;
     lp->av = &stream->_av;
@@ -1147,12 +1180,23 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, g
 {
 	std::call_once(hip_initialized, ihipInit);
     stream = ihipSyncAndResolveStream(stream);
+#if USE_GRID_LAUNCH_20 
+    lp->grid_dim.x = grid;
+    lp->grid_dim.y = 1;
+    lp->grid_dim.z = 1;
+    lp->group_dim.x = block;
+    lp->group_dim.y = 1;
+    lp->group_dim.z = 1;
+    lp->barrier_bit = barrier_bit_queue_default;
+    lp->launch_fence = -1;
+#else
     lp->gridDim.x = grid;
     lp->gridDim.y = 1;
     lp->gridDim.z = 1;
     lp->groupDim.x = block;
     lp->groupDim.y = 1;
     lp->groupDim.z = 1;
+#endif
     stream->lockopen_preKernelCommand();
 //    *av = &stream->_av;
     lp->av = &stream->_av;
diff --git a/tests/src/kernel/hipGridLaunch.cpp b/tests/src/kernel/hipGridLaunch.cpp
index f13781362e..b195a0171d 100644
--- a/tests/src/kernel/hipGridLaunch.cpp
+++ b/tests/src/kernel/hipGridLaunch.cpp
@@ -21,7 +21,6 @@ THE SOFTWARE.
 */
 // Test the Grid_Launch syntax.
 
-#undef DISABLE_GRID_LAUNCH /* Tell hip_*.h to compile in GL mode */
 #include "hip_runtime.h"
 #include "test_common.h"
 

From 702b9eff1e4af40169e85084e8516d22d0d3078f Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Tue, 21 Jun 2016 16:04:04 -0500
Subject: [PATCH 4/5] Use GRID_LAUNCH_VERSION define provided by HCC
 grid_launch.h

Change-Id: I5a7b31f13c4055cdca3a8854728d59db7dc5a7bb
---
 include/hcc_detail/hip_runtime.h | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/include/hcc_detail/hip_runtime.h b/include/hcc_detail/hip_runtime.h
index 8594954e75..21da60631a 100644
--- a/include/hcc_detail/hip_runtime.h
+++ b/include/hcc_detail/hip_runtime.h
@@ -38,8 +38,6 @@ THE SOFTWARE.
 #include <stddef.h>
 
 
-// Use field names for grid_launch 2.0 structure:
-#define USE_GRID_LAUNCH_20 0
 
 
 #define CUDA_SUCCESS hipSuccess
@@ -50,6 +48,16 @@ THE SOFTWARE.
 // Remainder of this file only compiles with HCC
 #ifdef __HCC__
 #include <grid_launch.h>
+
+#if defined (GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20) 
+// Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0.
+#define USE_GRID_LAUNCH_20 1
+#else
+#define USE_GRID_LAUNCH_20 0
+#endif
+
+
+
 extern int HIP_TRACE_API;
 
 //TODO-HCC-GL - change this to typedef.
@@ -529,7 +537,7 @@ do {\
   hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp)); \
     if (HIP_TRACE_API) {\
         fprintf(stderr, KGRN "<<hip-api: hipLaunchKernel '%s' gridDim:(%d,%d,%d) groupDim:(%d,%d,%d) groupMem:+%d stream=%p\n" KNRM, \
-                #_kernelName, lp.grid_dim.x, lp.grid_dim.y, lp.grid_dim.z, lp.group_dim.x, lp.group_dim.y, lp.group_dim.z, lp.groupMemBytes, (void*)(_stream));\
+                #_kernelName, lp.grid_dim.x, lp.grid_dim.y, lp.grid_dim.z, lp.group_dim.x, lp.group_dim.y, lp.group_dim.z, lp.dynamic_group_mem_bytes, (void*)(_stream));\
     }\
   _kernelName (lp, ##__VA_ARGS__);\
   ihipPostLaunchKernel(trueStream, lp);\
@@ -545,7 +553,7 @@ do {\
                 #_kernelName, lp.gridDim.x, lp.gridDim.y, lp.gridDim.z, lp.groupDim.x, lp.groupDim.y, lp.groupDim.z, lp.groupMemBytes, (void*)(_stream));\
     }\
   _kernelName (lp, ##__VA_ARGS__);\
-  ihipPostLaunchKernel(trueStream, cf);\
+  ihipPostLaunchKernel(trueStream, lp);\
 } while(0)
 
 #endif

From 4efa7e217ec93905e227fde96667f99b7df6e772 Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Fri, 24 Jun 2016 21:10:37 +0530
Subject: [PATCH 5/5] hip_samples package now contains uncompressed samples

Change-Id: I69e773ebeff59733e016abd5a90bd1637798b1f5
---
 packaging/hip_samples.txt | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/packaging/hip_samples.txt b/packaging/hip_samples.txt
index 6adf9160bf..f289f2a8e5 100644
--- a/packaging/hip_samples.txt
+++ b/packaging/hip_samples.txt
@@ -1,17 +1,13 @@
 cmake_minimum_required(VERSION 2.8.3)
 project(hip_samples)
 
-add_custom_target(create_installer_script ALL
-                  COMMAND tar cvzf ${PROJECT_BINARY_DIR}/samples.tgz --exclude='*.o' .
-                  COMMAND @hip_SOURCE_DIR@/packaging/create_hip_samples_installer.sh ${PROJECT_BINARY_DIR}/samples.tgz ${PROJECT_BINARY_DIR}/unpack_hip_samples.sh
-                  WORKING_DIRECTORY @hip_SOURCE_DIR@/samples)
-install(PROGRAMS unpack_hip_samples.sh DESTINATION bin)
+install(DIRECTORY @hip_SOURCE_DIR@/samples DESTINATION .)
 
 #############################
 # Packaging steps
 #############################
 set(CPACK_SET_DESTDIR TRUE)
-set(CPACK_INSTALL_PREFIX "/opt/rocm")
+set(CPACK_INSTALL_PREFIX "/opt/rocm/hip")
 set(CPACK_PACKAGE_NAME "hip_samples")
 set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "HIP: Heterogenous-computing Interface for Portability [SAMPLES]")
 set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.")