From db7e460626301bd16ea25464aeaef5013badb994 Mon Sep 17 00:00:00 2001
From: sandeep kumar <sandeep.kumar@amd.com>
Date: Wed, 7 Sep 2016 17:16:12 +0530
Subject: [PATCH] Add 2_Cookbook

Change-Id: I10bbbd4bcb80a5900fe6af466c8f4c94ea5efe9a
---
 samples/2_Cookbook/0_MatrixTranspose/Makefile |  36 ++++
 .../0_MatrixTranspose/MatrixTranspose.cpp     | 137 ++++++++++++++
 .../2_Cookbook/0_MatrixTranspose/Readme.md    | 100 ++++++++++
 samples/2_Cookbook/1_hipEvent/Makefile        |  36 ++++
 samples/2_Cookbook/1_hipEvent/Readme.md       |  74 ++++++++
 samples/2_Cookbook/1_hipEvent/hipEvent.cpp    | 174 ++++++++++++++++++
 samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile  |  36 ++++
 .../2_HIP_ATP_MARKER/MatrixTranspose.cpp      | 174 ++++++++++++++++++
 samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md |  51 +++++
 samples/2_Cookbook/3_shared_memory/Makefile   |  36 ++++
 .../3_shared_memory/sharedMemory.cpp          | 144 +++++++++++++++
 samples/2_Cookbook/4_shfl/Makefile            |  36 ++++
 samples/2_Cookbook/4_shfl/shfl.cpp            | 143 ++++++++++++++
 samples/2_Cookbook/5_2dshfl/2dshfl.cpp        | 139 ++++++++++++++
 samples/2_Cookbook/5_2dshfl/Makefile          |  36 ++++
 15 files changed, 1352 insertions(+)
 create mode 100644 samples/2_Cookbook/0_MatrixTranspose/Makefile
 create mode 100644 samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp
 create mode 100644 samples/2_Cookbook/0_MatrixTranspose/Readme.md
 create mode 100644 samples/2_Cookbook/1_hipEvent/Makefile
 create mode 100644 samples/2_Cookbook/1_hipEvent/Readme.md
 create mode 100644 samples/2_Cookbook/1_hipEvent/hipEvent.cpp
 create mode 100644 samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile
 create mode 100644 samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp
 create mode 100644 samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md
 create mode 100644 samples/2_Cookbook/3_shared_memory/Makefile
 create mode 100644 samples/2_Cookbook/3_shared_memory/sharedMemory.cpp
 create mode 100644 samples/2_Cookbook/4_shfl/Makefile
 create mode 100644 samples/2_Cookbook/4_shfl/shfl.cpp
 create mode 100644 samples/2_Cookbook/5_2dshfl/2dshfl.cpp
 create mode 100644 samples/2_Cookbook/5_2dshfl/Makefile

diff --git a/samples/2_Cookbook/0_MatrixTranspose/Makefile b/samples/2_Cookbook/0_MatrixTranspose/Makefile
new file mode 100644
index 0000000000..ffb442e443
--- /dev/null
+++ b/samples/2_Cookbook/0_MatrixTranspose/Makefile
@@ -0,0 +1,36 @@
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+	HIP_PATH=../../..
+endif
+
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+TARGET=hcc
+
+SOURCES = MatrixTranspose.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+EXECUTABLE=./exe
+
+.PHONY: test
+
+
+all: $(EXECUTABLE) test
+
+CXXFLAGS =-g
+CXX=$(HIPCC)
+
+
+$(EXECUTABLE): $(OBJECTS)
+	$(HIPCC) $(OBJECTS) -o $@
+
+
+test: $(EXECUTABLE)
+	$(EXECUTABLE)
+
+
+clean:
+	rm -f $(EXECUTABLE)
+	rm -f $(OBJECTS)
+	rm -f $(HIP_PATH)/src/*.o
+
diff --git a/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp b/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp
new file mode 100644
index 0000000000..c43785f5c9
--- /dev/null
+++ b/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp
@@ -0,0 +1,137 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include<iostream>
+
+// hip header file
+#include "hip_runtime.h"
+
+
+#define WIDTH     1024
+#define HEIGHT    1024
+
+#define NUM       (WIDTH*HEIGHT)
+
+#define THREADS_PER_BLOCK_X  16
+#define THREADS_PER_BLOCK_Y  16
+#define THREADS_PER_BLOCK_Z  1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width,
+                                const int height)
+{
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+
+    out[y * width + x] = in[x * height + y];
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+    float * output,
+    float * input,
+    const unsigned int width,
+    const unsigned int height)
+{
+    for(unsigned int j=0; j < height; j++)
+    {
+        for(unsigned int i=0; i < width; i++)
+        {
+            output[i*height + j] = input[j*width + i];
+        }
+    }
+}
+
+int main() {
+
+  float* Matrix;
+  float* TransposeMatrix;
+  float* cpuTransposeMatrix;
+
+  float* gpuMatrix;
+  float* gpuTransposeMatrix;
+
+  hipDeviceProp_t devProp;
+  hipGetDeviceProperties(&devProp, 0);
+
+  std::cout << "Device name " << devProp.name << std::endl;
+
+  int i;
+  int errors;
+
+  Matrix = (float*)malloc(NUM * sizeof(float));
+  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    Matrix[i] = (float)i*10.0f;
+  }
+
+  // allocate the memory on the device side
+  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+  // Memory transfer from host to device
+  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+  // Lauching kernel from host
+  hipLaunchKernel(matrixTranspose,
+                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
+                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                  0, 0,
+                  gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT);
+
+  // Memory transfer from device to host
+  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT);
+
+  // verify the results
+  errors = 0;
+  double eps = 1.0E-6;
+  for (i = 0; i < NUM; i++) {
+    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) {
+      errors++;
+    }
+  }
+  if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+  } else {
+    printf ("PASSED!\n");
+  }
+
+  //free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);
+
+  //free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+
+  return errors;
+}
diff --git a/samples/2_Cookbook/0_MatrixTranspose/Readme.md b/samples/2_Cookbook/0_MatrixTranspose/Readme.md
new file mode 100644
index 0000000000..b1c0b261b9
--- /dev/null
+++ b/samples/2_Cookbook/0_MatrixTranspose/Readme.md
@@ -0,0 +1,100 @@
+## Writing first HIP program ###
+
+This tutorial shows how to get write simple HIP application. We will write the simplest Matrix Transpose program.
+
+## HIP Introduction:
+
+HIP is a C++ runtime API and kernel language that allows developers to create portable applications that can run on AMD and other GPU’s. Our goal was to rise above the lowest-common-denominator paths and deliver a solution that allows you, the developer, to use essential hardware features and maximize your application’s performance on GPU hardware.
+
+## Requirement:
+For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) 
+
+## prerequiste knowledge:
+
+Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming.
+
+## Simple Matrix Transpose 
+
+Here is simple example showing how to write your first program in HIP.
+In order to use the HIP framework, we need to add the "hip_runtime.h" header file. SInce its c++ api you can add any header file you have been using earlier while writing your c/c++ program. For gpgpu programming, we have host(microprocessor) and the device(gpu). 
+
+## Device-side code
+We will work on device side code first, Here is simple example showing a snippet of HIP device side code:
+
+`__global__ void matrixTranspose(hipLaunchParm lp,            `
+`                                float *out,                  `
+`                                float *in,                   `
+`                                const int width,             `
+`                                const int height)            `
+`{                                                            `
+`    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;  `
+`    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;  `
+`                                                             `
+`    out[y * width + x] = in[x * height + y];                 `
+`}                                                            `
+
+`__global__` keyword is the Function-Type Qualifiers, it is used with functions that are executed on device and are called/launched from the hosts.
+other function-type qualifiers are:
+`__device__` functions are Executed on the device and Called from the device only
+`__host__` functions are Executed on the host and Called from the host
+
+`__host__` can combine with `__device__`, in which case the function compiles for both the host and device. These functions cannot use the HIP grid coordinate functions (for example, "hipThreadIdx_x", will talk about it latter). A possible workaround is to pass the necessary coordinate info as an argument to the function.
+`__host__` cannot combine with `__global__`.
+
+`__global__` functions are often referred to as *kernels, and calling one is termed *launching the kernel*. 
+
+Next keyword is `void`. HIP `__global__` functions must have a `void` return type, and the first parameter to a HIP `__global__` function must have the type `hipLaunchParm`, which is for execution configuration. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute.
+
+After `hipLaunchParm`, Kernel arguments follows next(i.e.,  `float *out, float *in, const int width, const int height`).
+
+The kernel function begins with 
+`    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;`
+`    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;`
+here the keyword hipBlockIdx_x, hipBlockIdx_y and hipBlockIdx_z(not used here) are the built-in functions to identify the threads in a block. The keyword hipBlockDim_x, hipBlockDim_y and hipBlockDim_z(not used here) are to identify the dimensions of the block.
+
+We are familiar with rest of the code on device-side.
+
+## Host-side code
+
+Now, we'll see how to call the kernel from the host. Inside the main() function, we first defined the pointers(for both, the host-side as well as device). The declaration of device pointer is similar to that of the host. Next, we have `hipDeviceProp_t`, it is the pre-defined struct for hip device properties. This is followed by `hipGetDeviceProperties(&devProp, 0)` It is used to extract the device information. The first parameter is the struct, second parameter is the device number to get properties for. Next line print the name of the device.
+
+We allocated memory to the Matrix on host side by using malloc and initiallized it. While in order to allocate memory on device side we will be using `hipMalloc`, it's quiet similar to that of malloc instruction. After this, we will copy the data to the allocated memory on device-side using `hipMemcpy`.
+`  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);`
+here the first parameter is the destination pointer, second is the source pointer, third is the size of memory copy and the last specify the direction on memory copy(which is in this case froom host to device). While in order to transfer memory from device to host, use `hipMemcpyDeviceToHost` and for device to device memory copy use `hipMemcpyDeviceToDevice`.
+
+Now, we'll see how to launch the kernel.
+`  hipLaunchKernel(matrixTranspose,                                                         `
+`                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),             `
+`                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),                          `
+`                  0, 0,                                                                    `
+`                  gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT);                          `
+
+HIP introduces a standard C++ calling convention to pass the execution configuration to the kernel (this convention replaces the `Cuda <<< >>>` syntax). In HIP,
+- Kernels launch with the `"hipLaunchKernel"` function
+- The first five parameters to hipLaunchKernel are the following:
+   - **symbol kernelName**: the name of the kernel to launch.  To support template kernels which contains "," use the HIP_KERNEL_NAME macro. In current application it's "matrixTranspose".
+   - **dim3 gridDim**: 3D-grid dimensions specifying the number of blocks to launch. In MatrixTranspose sample, it's "dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y)". 
+   - **dim3 blockDim**: 3D-block dimensions specifying the number of threads in each block.In MatrixTranspose sample, it's "dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y)". 
+   - **size_t dynamicShared**: amount of additional shared memory to allocate when launching the kernel. In MatrixTranspose sample, it's '0'.
+   - **hipStream_t**: stream where the kernel should execute. A value of 0 corresponds to the NULL stream.In MatrixTranspose sample, it's '0'.
+- Kernel arguments follow these first five parameters. Here, these are "gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT".
+
+Next, we'll copy the computed values/data back to the device using the `hipMemcpy`. Here the last parameter will be `hipMemcpyDeviceToHost`
+
+After, copying the data from device to memory, we will verify it with the one we computed with the cpu reference funtion.
+
+Finally, we will free the memory allocated earlier by using free() for host while for devices we will use `hipFree`.
+
+## How to build and run:
+Use the make command and execute it using ./exe
+Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia.
+
+## More Info:
+- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md)
+- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md)
+- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP)
+- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md)
+- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL)
+- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md)
+- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md)
+- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md)
diff --git a/samples/2_Cookbook/1_hipEvent/Makefile b/samples/2_Cookbook/1_hipEvent/Makefile
new file mode 100644
index 0000000000..dc0f7db2e6
--- /dev/null
+++ b/samples/2_Cookbook/1_hipEvent/Makefile
@@ -0,0 +1,36 @@
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+	HIP_PATH=../../..
+endif
+
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+TARGET=hcc
+
+SOURCES = hipEvent.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+EXECUTABLE=./exe
+
+.PHONY: test
+
+
+all: $(EXECUTABLE) test
+
+CXXFLAGS =-g
+CXX=$(HIPCC)
+
+
+$(EXECUTABLE): $(OBJECTS)
+	$(HIPCC) $(OBJECTS) -o $@
+
+
+test: $(EXECUTABLE)
+	$(EXECUTABLE)
+
+
+clean:
+	rm -f $(EXECUTABLE)
+	rm -f $(OBJECTS)
+	rm -f $(HIP_PATH)/src/*.o
+
diff --git a/samples/2_Cookbook/1_hipEvent/Readme.md b/samples/2_Cookbook/1_hipEvent/Readme.md
new file mode 100644
index 0000000000..16285120fa
--- /dev/null
+++ b/samples/2_Cookbook/1_hipEvent/Readme.md
@@ -0,0 +1,74 @@
+## Using hipEvents to measure performance ###
+
+This tutorial is follow-up of the previous one where we learn how to write our first hip program, in which we compute Matrix Transpose. In this tutorial, we'll explain how to use the hipEvent to get the performance score for memory transfer and kernel execution time.
+
+## Introduction:
+
+Memory transfer and kernel execution are the most important parameter in parallel computing (specially HPC and machine learning). Memory bottlenecks is the main problem why we are not able to get the highest performance, therefore obtaining the memory transfer timing and kernel execution timing plays key role in application optimization.
+
+## Requirement:
+For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) 
+
+## prerequiste knowledge:
+
+Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming.
+
+## Simple Matrix Transpose 
+
+We will be using the Simple Matrix Transpose application from the previous tutorial and modify it to learn how to get the performance score for memory transfer and kernel execution time.
+
+## hipEnvent_t
+
+We'll learn how to use the event management functionality of HIP runtime api. In the same sourcecode, we used for MatrixTranspose we will declare the following events as follows:
+
+`  hipEvent_t start, stop;`
+
+We'll create the event with the help of following code:
+
+`  hipEventCreate(&start);`
+`  hipEventCreate(&stop);`
+
+We'll use the "eventMs" variable to store the time taken value:
+`  float eventMs = 1.0f;`
+
+## Time taken measurement by using hipEvents:
+
+We'll start the timer by calling:
+`  hipEventRecord(start, NULL);`
+in this, the first parameter is the hipEvent_t, will will mark the start of the time from which the measurement has to be performed, while the second parameter has to be of the type hipStream_t. In current situation, we have passed NULL (the default stream). We will learn about the `hipStream_t` in more detail latter.
+
+Now, we'll have the operation for which we need to compute the time taken. For the case of memory transfer, we'll place the `hipMemcpy`:
+`  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);`
+
+and for kernel execution time we'll use `hipKernelLaunch`:
+`  hipLaunchKernel(matrixTranspose,                                                    `
+`                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),        `
+`                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),                     `
+`                  0, 0,                                                               `
+`                  gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT);                     `
+
+Now to mark the end of the eventRecord, we will again use the hipEventRecord by passing the stop event:
+`  hipEventRecord(stop, NULL);`
+
+Will synchronize the event with the help of:
+`  hipEventSynchronize(stop);`
+
+In order to calculate the time taken by measuring the difference of occurance marked by the start and stop event, we'll use:
+`  hipEventElapsedTime(&eventMs, start, stop);`
+Here the first parameter will store the time taken value, second parameter is the starting marker for the event while the third one is marking the end.
+
+We can print the value of time take comfortably since eventMs is float variable.
+
+## How to build and run:
+Use the make command and execute it using ./exe
+Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia.
+
+## More Info:
+- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md)
+- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md)
+- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP)
+- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md)
+- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL)
+- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md)
+- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md)
+- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md)
diff --git a/samples/2_Cookbook/1_hipEvent/hipEvent.cpp b/samples/2_Cookbook/1_hipEvent/hipEvent.cpp
new file mode 100644
index 0000000000..b6bc4d1db1
--- /dev/null
+++ b/samples/2_Cookbook/1_hipEvent/hipEvent.cpp
@@ -0,0 +1,174 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include<iostream>
+
+// hip header file
+#include "hip_runtime.h"
+
+#define WIDTH     1024
+#define HEIGHT    1024
+
+#define NUM       (WIDTH*HEIGHT)
+
+#define THREADS_PER_BLOCK_X  16
+#define THREADS_PER_BLOCK_Y  16
+#define THREADS_PER_BLOCK_Z  1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width,
+                                const int height)
+{
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+
+    out[y * width + x] = in[x * height + y];
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+    float * output,
+    float * input,
+    const unsigned int width,
+    const unsigned int height)
+{
+    for(unsigned int j=0; j < height; j++)
+    {
+        for(unsigned int i=0; i < width; i++)
+        {
+            output[i*height + j] = input[j*width + i];
+        }
+    }
+}
+
+int main() {
+
+  float* Matrix;
+  float* TransposeMatrix;
+  float* cpuTransposeMatrix;
+
+  float* gpuMatrix;
+  float* gpuTransposeMatrix;
+
+  hipDeviceProp_t devProp;
+  hipGetDeviceProperties(&devProp, 0);
+
+  std::cout << "Device name " << devProp.name << std::endl;
+
+  hipEvent_t start, stop;
+  hipEventCreate(&start);
+  hipEventCreate(&stop);
+  float eventMs = 1.0f;
+
+  int i;
+  int errors;
+
+  Matrix = (float*)malloc(NUM * sizeof(float));
+  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    Matrix[i] = (float)i*10.0f;
+  }
+
+  // allocate the memory on the device side
+  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+  // Record the start event
+  hipEventRecord(start, NULL);
+
+  // Memory transfer from host to device
+  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+  // Record the stop event
+  hipEventRecord(stop, NULL);
+  hipEventSynchronize(stop);
+
+  hipEventElapsedTime(&eventMs, start, stop);
+
+  printf ("hipMemcpyHostToDevice time taken  = %6.3fms\n", eventMs);
+
+  // Record the start event
+  hipEventRecord(start, NULL);
+
+  // Lauching kernel from host
+  hipLaunchKernel(matrixTranspose,
+                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
+                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                  0, 0,
+                  gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT);
+
+  // Record the stop event
+  hipEventRecord(stop, NULL);
+  hipEventSynchronize(stop);
+
+  hipEventElapsedTime(&eventMs, start, stop);
+
+  printf ("kernel Execution time             = %6.3fms\n", eventMs);
+
+  // Record the start event
+  hipEventRecord(start, NULL);
+
+  // Memory transfer from device to host
+  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+  // Record the stop event
+  hipEventRecord(stop, NULL);
+  hipEventSynchronize(stop);
+
+  hipEventElapsedTime(&eventMs, start, stop);
+
+  printf ("hipMemcpyDeviceToHost time taken  = %6.3fms\n", eventMs);
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT);
+
+  // verify the results
+  errors = 0;
+  double eps = 1.0E-6;
+  for (i = 0; i < NUM; i++) {
+    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) {
+      errors++;
+    }
+  }
+  if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+  } else {
+    printf ("PASSED!\n");
+  }
+
+  //free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);
+
+  //free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+
+  return errors;
+}
diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile b/samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile
new file mode 100644
index 0000000000..ffb442e443
--- /dev/null
+++ b/samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile
@@ -0,0 +1,36 @@
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+	HIP_PATH=../../..
+endif
+
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+TARGET=hcc
+
+SOURCES = MatrixTranspose.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+EXECUTABLE=./exe
+
+.PHONY: test
+
+
+all: $(EXECUTABLE) test
+
+CXXFLAGS =-g
+CXX=$(HIPCC)
+
+
+$(EXECUTABLE): $(OBJECTS)
+	$(HIPCC) $(OBJECTS) -o $@
+
+
+test: $(EXECUTABLE)
+	$(EXECUTABLE)
+
+
+clean:
+	rm -f $(EXECUTABLE)
+	rm -f $(OBJECTS)
+	rm -f $(HIP_PATH)/src/*.o
+
diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp b/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp
new file mode 100644
index 0000000000..b6bc4d1db1
--- /dev/null
+++ b/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp
@@ -0,0 +1,174 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include<iostream>
+
+// hip header file
+#include "hip_runtime.h"
+
+#define WIDTH     1024
+#define HEIGHT    1024
+
+#define NUM       (WIDTH*HEIGHT)
+
+#define THREADS_PER_BLOCK_X  16
+#define THREADS_PER_BLOCK_Y  16
+#define THREADS_PER_BLOCK_Z  1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width,
+                                const int height)
+{
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+
+    out[y * width + x] = in[x * height + y];
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+    float * output,
+    float * input,
+    const unsigned int width,
+    const unsigned int height)
+{
+    for(unsigned int j=0; j < height; j++)
+    {
+        for(unsigned int i=0; i < width; i++)
+        {
+            output[i*height + j] = input[j*width + i];
+        }
+    }
+}
+
+int main() {
+
+  float* Matrix;
+  float* TransposeMatrix;
+  float* cpuTransposeMatrix;
+
+  float* gpuMatrix;
+  float* gpuTransposeMatrix;
+
+  hipDeviceProp_t devProp;
+  hipGetDeviceProperties(&devProp, 0);
+
+  std::cout << "Device name " << devProp.name << std::endl;
+
+  hipEvent_t start, stop;
+  hipEventCreate(&start);
+  hipEventCreate(&stop);
+  float eventMs = 1.0f;
+
+  int i;
+  int errors;
+
+  Matrix = (float*)malloc(NUM * sizeof(float));
+  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    Matrix[i] = (float)i*10.0f;
+  }
+
+  // allocate the memory on the device side
+  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+  // Record the start event
+  hipEventRecord(start, NULL);
+
+  // Memory transfer from host to device
+  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+  // Record the stop event
+  hipEventRecord(stop, NULL);
+  hipEventSynchronize(stop);
+
+  hipEventElapsedTime(&eventMs, start, stop);
+
+  printf ("hipMemcpyHostToDevice time taken  = %6.3fms\n", eventMs);
+
+  // Record the start event
+  hipEventRecord(start, NULL);
+
+  // Lauching kernel from host
+  hipLaunchKernel(matrixTranspose,
+                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
+                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                  0, 0,
+                  gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT);
+
+  // Record the stop event
+  hipEventRecord(stop, NULL);
+  hipEventSynchronize(stop);
+
+  hipEventElapsedTime(&eventMs, start, stop);
+
+  printf ("kernel Execution time             = %6.3fms\n", eventMs);
+
+  // Record the start event
+  hipEventRecord(start, NULL);
+
+  // Memory transfer from device to host
+  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+  // Record the stop event
+  hipEventRecord(stop, NULL);
+  hipEventSynchronize(stop);
+
+  hipEventElapsedTime(&eventMs, start, stop);
+
+  printf ("hipMemcpyDeviceToHost time taken  = %6.3fms\n", eventMs);
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT);
+
+  // verify the results
+  errors = 0;
+  double eps = 1.0E-6;
+  for (i = 0; i < NUM; i++) {
+    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) {
+      errors++;
+    }
+  }
+  if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+  } else {
+    printf ("PASSED!\n");
+  }
+
+  //free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);
+
+  //free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+
+  return errors;
+}
diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md b/samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md
new file mode 100644
index 0000000000..2bba31d349
--- /dev/null
+++ b/samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md
@@ -0,0 +1,51 @@
+## Using hipEvents to measure performance ###
+
+This tutorial is follow-up of the previous two tutorial where we learn how to write our first hip program, in which we compute Matrix Transpose and in second one, we added feature to measure time taken for memory transfer and kernel execution. In this tutorial, we won't make amy changes to the source code. We'll explain how to use the  codexl/rocm-profiler for hip timeline tracing.
+
+
+## Introduction:
+
+CodeXL and rocm-profiler are the tool used for profiling the application, which is of prominent use in optimizing the application by means of finding the memory bottlenecks and etc.
+
+## Requirement:
+[CodeXL Installation](http://gpuopen.com/compute-product/codexl/) 
+
+## prerequiste knowledge:
+
+Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming.
+
+## Simple Matrix Transpose 
+
+We will be using the Simple Matrix Transpose source code from the previous tutorial as it is.
+
+## Using CodeXL markers for HIP Functions
+
+HIP can generate markers at function being/end which are displayed on the CodeXL timeline view. To do this, you need to install ROCm-Profiler and enable HIP to generate the markers:
+
+1. Install ROCm-Profiler Installing HIP from the rocm pre-built packages, installs the ROCm-Profiler as well. Alternatively, you can build ROCm-Profiler using the instructions given below.
+
+2. Build HIP with ATP markers enabled HIP pre-built packages are enabled with ATP marker support by default. To enable ATP marker support when building HIP from source, use the option -DCOMPILE_HIP_ATP_MARKER=1 during the cmake configure step.
+
+3. Set HIP_ATP_MARKER
+`export HIP_ATP_MARKER=1`
+
+4. Recompile the target application
+
+5. Run with profiler enabled to generate ATP file.
+`/opt/rocm/bin/rocm-profiler -o <outputATPFileName> -A <applicationName> <applicationArguments>`
+
+##Using HIP_TRACE_API
+
+You can also print the HIP function strings to stderr using HIP_TRACE_API environment variable. This can also be combined with the more detailed debug information provided by the HIP_DB switch. For example:
+`HIP_TRACE_API=1 HIP_DB=0x2 ./myHipApp`
+Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors.
+
+## More Info:
+- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md)
+- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md)
+- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP)
+- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md)
+- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL)
+- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md)
+- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md)
+- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md)
diff --git a/samples/2_Cookbook/3_shared_memory/Makefile b/samples/2_Cookbook/3_shared_memory/Makefile
new file mode 100644
index 0000000000..5e9ce47211
--- /dev/null
+++ b/samples/2_Cookbook/3_shared_memory/Makefile
@@ -0,0 +1,36 @@
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+	HIP_PATH=../../..
+endif
+
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+TARGET=hcc
+
+SOURCES = sharedMemory.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+EXECUTABLE=./exe
+
+.PHONY: test
+
+
+all: $(EXECUTABLE) test
+
+CXXFLAGS =-g
+CXX=$(HIPCC)
+
+
+$(EXECUTABLE): $(OBJECTS)
+	$(HIPCC) $(OBJECTS) -o $@
+
+
+test: $(EXECUTABLE)
+	$(EXECUTABLE)
+
+
+clean:
+	rm -f $(EXECUTABLE)
+	rm -f $(OBJECTS)
+	rm -f $(HIP_PATH)/src/*.o
+
diff --git a/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp b/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp
new file mode 100644
index 0000000000..1106d454f2
--- /dev/null
+++ b/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp
@@ -0,0 +1,144 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include<iostream>
+
+// hip header file
+#include "hip_runtime.h"
+
+
+#define WIDTH     1024
+#define HEIGHT    1024
+
+#define NUM       (WIDTH*HEIGHT)
+
+#define THREADS_PER_BLOCK_X  16
+#define THREADS_PER_BLOCK_Y  16
+#define THREADS_PER_BLOCK_Z  1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width,
+                                const int height)
+{
+    __shared__ float sharedMem[16*16];
+
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+
+    sharedMem[y * width + x] = in[x * height + y];
+
+    __syncthreads();
+
+    out[y * width + x] = sharedMem[y * width + x];
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+    float * output,
+    float * input,
+    const unsigned int width,
+    const unsigned int height)
+{
+    for(unsigned int j=0; j < height; j++)
+    {
+        for(unsigned int i=0; i < width; i++)
+        {
+            output[i*height + j] = input[j*width + i];
+        }
+    }
+}
+
+int main() {
+
+  float* Matrix;
+  float* TransposeMatrix;
+  float* cpuTransposeMatrix;
+
+  float* gpuMatrix;
+  float* gpuTransposeMatrix;
+
+  hipDeviceProp_t devProp;
+  hipGetDeviceProperties(&devProp, 0);
+
+  std::cout << "Device name " << devProp.name << std::endl;
+
+  int i;
+  int errors;
+
+  Matrix = (float*)malloc(NUM * sizeof(float));
+  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    Matrix[i] = (float)i*10.0f;
+  }
+
+  // allocate the memory on the device side
+  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+  // Memory transfer from host to device
+  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+  // Lauching kernel from host
+  hipLaunchKernel(matrixTranspose,
+                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
+                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
+                  0, 0,
+                  gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT);
+
+  // Memory transfer from device to host
+  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT);
+
+  // verify the results
+  errors = 0;
+  double eps = 1.0E-6;
+  for (i = 0; i < NUM; i++) {
+    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+      errors++;
+    }
+  }
+  if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+  } else {
+    printf ("PASSED!\n");
+  }
+
+  //free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);
+
+  //free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+
+  return errors;
+}
diff --git a/samples/2_Cookbook/4_shfl/Makefile b/samples/2_Cookbook/4_shfl/Makefile
new file mode 100644
index 0000000000..1d30c78749
--- /dev/null
+++ b/samples/2_Cookbook/4_shfl/Makefile
@@ -0,0 +1,36 @@
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+	HIP_PATH=../../..
+endif
+
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+TARGET=hcc
+
+SOURCES = shfl.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+EXECUTABLE=./exe
+
+.PHONY: test
+
+
+all: $(EXECUTABLE) test
+
+CXXFLAGS =-g
+CXX=$(HIPCC)
+
+
+$(EXECUTABLE): $(OBJECTS)
+	$(HIPCC) $(OBJECTS) -o $@
+
+
+test: $(EXECUTABLE)
+	$(EXECUTABLE)
+
+
+clean:
+	rm -f $(EXECUTABLE)
+	rm -f $(OBJECTS)
+	rm -f $(HIP_PATH)/src/*.o
+
diff --git a/samples/2_Cookbook/4_shfl/shfl.cpp b/samples/2_Cookbook/4_shfl/shfl.cpp
new file mode 100644
index 0000000000..f43809b017
--- /dev/null
+++ b/samples/2_Cookbook/4_shfl/shfl.cpp
@@ -0,0 +1,143 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include<iostream>
+
+// hip header file
+#include "hip_runtime.h"
+
+
+#define WIDTH     4
+#define HEIGHT    4
+
+#define NUM       (WIDTH*HEIGHT)
+
+#define THREADS_PER_BLOCK_X  4
+#define THREADS_PER_BLOCK_Y  4
+#define THREADS_PER_BLOCK_Z  1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width,
+                                const int height)
+{
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    //int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+    float val = in[x];
+
+    for(int i=0;i<width;i++)
+    {
+        for(int j=0;j<width;j++)
+            out[i*height + j] = __shfl(val,j*width + i);
+    }
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+    float * output,
+    float * input,
+    const unsigned int width,
+    const unsigned int height)
+{
+    for(unsigned int j=0; j < height; j++)
+    {
+        for(unsigned int i=0; i < width; i++)
+        {
+            output[i*height + j] = input[j*width + i];
+        }
+    }
+}
+
+int main() {
+
+  float* Matrix;
+  float* TransposeMatrix;
+  float* cpuTransposeMatrix;
+
+  float* gpuMatrix;
+  float* gpuTransposeMatrix;
+
+  hipDeviceProp_t devProp;
+  hipGetDeviceProperties(&devProp, 0);
+
+  std::cout << "Device name " << devProp.name << std::endl;
+
+  int i;
+  int errors;
+
+  Matrix = (float*)malloc(NUM * sizeof(float));
+  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    Matrix[i] = (float)i*10.0f;
+  }
+
+  // allocate the memory on the device side
+  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+  // Memory transfer from host to device
+  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+  // Lauching kernel from host
+  hipLaunchKernel(matrixTranspose,
+                  dim3(1),
+                  dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
+                  0, 0,
+                  gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT);
+
+  // Memory transfer from device to host
+  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT);
+
+  // verify the results
+  errors = 0;
+  double eps = 1.0E-6;
+  for (i = 0; i < NUM; i++) {
+    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+      errors++;
+    }
+  }
+  if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+  } else {
+    printf ("PASSED!\n");
+  }
+
+  //free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);
+
+  //free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+
+  return errors;
+}
diff --git a/samples/2_Cookbook/5_2dshfl/2dshfl.cpp b/samples/2_Cookbook/5_2dshfl/2dshfl.cpp
new file mode 100644
index 0000000000..85bc3be2ae
--- /dev/null
+++ b/samples/2_Cookbook/5_2dshfl/2dshfl.cpp
@@ -0,0 +1,139 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include<iostream>
+
+// hip header file
+#include "hip_runtime.h"
+
+
+#define WIDTH     4
+#define HEIGHT    4
+
+#define NUM       (WIDTH*HEIGHT)
+
+#define THREADS_PER_BLOCK_X  4
+#define THREADS_PER_BLOCK_Y  4
+#define THREADS_PER_BLOCK_Z  1
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void matrixTranspose(hipLaunchParm lp,
+                                float *out,
+                                float *in,
+                                const int width,
+                                const int height)
+{
+    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+    float val = in[y*width + x];
+
+    out[x*height + y] = __shfl(val,y*width + x);
+}
+
+// CPU implementation of matrix transpose
+void matrixTransposeCPUReference(
+    float * output,
+    float * input,
+    const unsigned int width,
+    const unsigned int height)
+{
+    for(unsigned int j=0; j < height; j++)
+    {
+        for(unsigned int i=0; i < width; i++)
+        {
+            output[i*height + j] = input[j*width + i];
+        }
+    }
+}
+
+int main() {
+
+  float* Matrix;
+  float* TransposeMatrix;
+  float* cpuTransposeMatrix;
+
+  float* gpuMatrix;
+  float* gpuTransposeMatrix;
+
+  hipDeviceProp_t devProp;
+  hipGetDeviceProperties(&devProp, 0);
+
+  std::cout << "Device name " << devProp.name << std::endl;
+
+  int i;
+  int errors;
+
+  Matrix = (float*)malloc(NUM * sizeof(float));
+  TransposeMatrix = (float*)malloc(NUM * sizeof(float));
+  cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
+
+  // initialize the input data
+  for (i = 0; i < NUM; i++) {
+    Matrix[i] = (float)i*10.0f;
+  }
+
+  // allocate the memory on the device side
+  hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
+  hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
+
+  // Memory transfer from host to device
+  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+  // Lauching kernel from host
+  hipLaunchKernel(matrixTranspose,
+                  dim3(1),
+                  dim3(THREADS_PER_BLOCK_X , THREADS_PER_BLOCK_Y),
+                  0, 0,
+                  gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT);
+
+  // Memory transfer from device to host
+  hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT);
+
+  // verify the results
+  errors = 0;
+  double eps = 1.0E-6;
+  for (i = 0; i < NUM; i++) {
+    if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) {
+    printf("%d cpu: %f gpu  %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]);
+      errors++;
+    }
+  }
+  if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+  } else {
+    printf ("PASSED!\n");
+  }
+
+  //free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);
+
+  //free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+
+  return errors;
+}
diff --git a/samples/2_Cookbook/5_2dshfl/Makefile b/samples/2_Cookbook/5_2dshfl/Makefile
new file mode 100644
index 0000000000..502d2948b0
--- /dev/null
+++ b/samples/2_Cookbook/5_2dshfl/Makefile
@@ -0,0 +1,36 @@
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+	HIP_PATH=../../..
+endif
+
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+TARGET=hcc
+
+SOURCES = 2dshfl.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+EXECUTABLE=./exe
+
+.PHONY: test
+
+
+all: $(EXECUTABLE) test
+
+CXXFLAGS =-g
+CXX=$(HIPCC)
+
+
+$(EXECUTABLE): $(OBJECTS)
+	$(HIPCC) $(OBJECTS) -o $@
+
+
+test: $(EXECUTABLE)
+	$(EXECUTABLE)
+
+
+clean:
+	rm -f $(EXECUTABLE)
+	rm -f $(OBJECTS)
+	rm -f $(HIP_PATH)/src/*.o
+