Adding new unroll example (#1187)

This commit is contained in:
Jatin Chaudhary
2019-07-03 14:21:19 +05:30
zatwierdzone przez Maneesh Gupta
rodzic 0218f95c32
commit 3cb87cdc46
2 zmienionych plików z 64 dodań i 68 usunięć
@@ -16,17 +16,15 @@ Programmers familiar with CUDA, OpenCL will be able to quickly learn and start c
## Simple Matrix Transpose
For this tutorial we will be using MatrixTranspose with shfl operation i.e., our 4_shfl tutorial since it is the only examples where we used loops inside the kernel.
For this tutorial we will be using an example which sums up the row of a 2D matrix and writes it in a 1D array.
In this tutorial, we'll use `#pragma unroll`. In the same sourcecode, we used for MatrixTranspose. We'll add it just before the for loop as following:
In this tutorial, we'll use `#pragma unroll`. In the same sourcecode, we used for gpuMatrixRowSum. We'll add it just before the for loop as following:
```
#pragma unroll
for(int i=0;i<width;i++)
{
for(int j=0;j<width;j++)
out[i*width + j] = __shfl(val,j*width + i);
}
for (int i = 0; i < width; i++) {
output[index] += input[index * width + i]
}
```
Specifying the optional parameter, #pragma unroll value, directs the unroller to unroll the loop value times. Be careful while using it.
@@ -25,100 +25,98 @@ THE SOFTWARE.
// hip header file
#include "hip/hip_runtime.h"
#define LENGTH 4
#define WIDTH 4
#define SIZE (LENGTH * LENGTH)
#define NUM (WIDTH * WIDTH)
#define THREADS_PER_BLOCK 1
#define BLOCKS_PER_GRID LENGTH
#define THREADS_PER_BLOCK_X 4
#define THREADS_PER_BLOCK_Y 4
#define THREADS_PER_BLOCK_Z 1
// Device (Kernel) function, it must be void
__global__ void matrixTranspose(float* out, float* in, const int width) {
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
float val = in[x];
#pragma unroll
// CPU function - basically scan each row and save the output in array
void matrixRowSum(int* input, int* output, int width) {
for (int i = 0; i < width; i++) {
for (int j = 0; j < width; j++) out[i * width + j] = __shfl(val, j * width + i);
}
}
// CPU implementation of matrix transpose
void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {
for (unsigned int j = 0; j < width; j++) {
for (unsigned int i = 0; i < width; i++) {
output[i * width + j] = input[j * width + i];
for (int j = 0; j < width; j++) {
output[i] += input[i * width + j];
}
}
}
int main() {
float* Matrix;
float* TransposeMatrix;
float* cpuTransposeMatrix;
// Device (kernel) function
__global__ void gpuMatrixRowSum(int* input, int* output, int width) {
int index = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
#pragma unroll
for (int i = 0; i < width; i++) {
output[index] += input[index * width + i];
}
}
float* gpuMatrix;
float* gpuTransposeMatrix;
int main() {
int* Matrix;
int* sumMatrix;
int* cpuSumMatrix;
int* gpuMatrix;
int* gpuSumMatrix;
hipDeviceProp_t devProp;
hipGetDeviceProperties(&devProp, 0);
std::cout << "Device name " << devProp.name << std::endl;
int i;
int errors;
Matrix = (int*)malloc(sizeof(int) * SIZE);
sumMatrix = (int*)malloc(sizeof(int) * LENGTH);
cpuSumMatrix = (int*)malloc(sizeof(int) * LENGTH);
Matrix = (float*)malloc(NUM * sizeof(float));
TransposeMatrix = (float*)malloc(NUM * sizeof(float));
cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));
// initialize the input data
for (i = 0; i < NUM; i++) {
Matrix[i] = (float)i * 10.0f;
for (int i = 0; i < SIZE; i++) {
Matrix[i] = i * 2;
}
// allocate the memory on the device side
hipMalloc((void**)&gpuMatrix, NUM * sizeof(float));
hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float));
for (int i = 0; i < LENGTH; i++) {
cpuSumMatrix[i] = 0;
}
// Memory transfer from host to device
hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
// Allocated Device Memory
hipMalloc((void**)&gpuMatrix, SIZE * sizeof(int));
hipMalloc((void**)&gpuSumMatrix, LENGTH * sizeof(int));
// Lauching kernel from host
hipLaunchKernelGGL(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
gpuTransposeMatrix, gpuMatrix, WIDTH);
// Memory Copy to Device
hipMemcpy(gpuMatrix, Matrix, SIZE * sizeof(int), hipMemcpyHostToDevice);
hipMemcpy(gpuSumMatrix, cpuSumMatrix, LENGTH * sizeof(float), hipMemcpyHostToDevice);
// Memory transfer from device to host
hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);
// Launch device kernels
hipLaunchKernelGGL(gpuMatrixRowSum, dim3(BLOCKS_PER_GRID), dim3(THREADS_PER_BLOCK), 0, 0,
gpuMatrix, gpuSumMatrix, LENGTH);
// Memory copy back to device
hipMemcpy(sumMatrix, gpuSumMatrix, LENGTH * sizeof(int), hipMemcpyDeviceToHost);
// Cpu implementation
matrixRowSum(Matrix, cpuSumMatrix, LENGTH);
// CPU MatrixTranspose computation
matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
// verify the results
errors = 0;
double eps = 1.0E-6;
for (i = 0; i < NUM; i++) {
if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
printf("%d cpu: %f gpu %f\n", i, cpuTransposeMatrix[i], TransposeMatrix[i]);
int errors = 0;
for (int i = 0; i < LENGTH; i++) {
if (sumMatrix[i] != cpuSumMatrix[i]) {
printf("%d - cpu: %d gpu: %d\n", i, sumMatrix[i], cpuSumMatrix[i]);
errors++;
}
}
if (errors != 0) {
printf("FAILED: %d errors\n", errors);
if (errors == 0) {
printf("PASSED\n");
} else {
printf("PASSED!\n");
printf("FAILED with %d errors\n", errors);
}
// free the resources on device side
// GPU Free
hipFree(gpuMatrix);
hipFree(gpuTransposeMatrix);
hipFree(gpuSumMatrix);
// free the resources on host side
// CPU Free
free(Matrix);
free(TransposeMatrix);
free(cpuTransposeMatrix);
free(sumMatrix);
free(cpuSumMatrix);
return errors;
}