9b183d909e
Change-Id: I081ada13f6364ea4401a97a485dedfa9bf8a45fe
[ROCm/rocprofiler commit: 60043d198b]
176 라인
6.1 KiB
Common Lisp
176 라인
6.1 KiB
Common Lisp
/**********************************************************************
|
|
Copyright ©2015 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
|
|
|
• Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
|
|
• Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
|
|
other materials provided with the distribution.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
|
|
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
|
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
********************************************************************/
|
|
|
|
/**
|
|
* The kernel has two implementation of convolution.
|
|
* 1. Non-Separable Convolution
|
|
* 2. Separable Convolution
|
|
*/
|
|
|
|
|
|
/**
|
|
* NonSeparableConvolution
|
|
* is where each pixel of the output image
|
|
* is the weighted sum of the neighbourhood pixels of the input image
|
|
* The neighbourhood is defined by the dimensions of the mask and
|
|
* weight of each neighbour is defined by the mask itself.
|
|
* @param input Padded Input matrix on which convolution is to be performed
|
|
* @param mask mask matrix using which convolution was to be performed
|
|
* @param output Output matrix after performing convolution
|
|
* @param inputDimensions dimensions of the input matrix
|
|
* @param maskDimensions dimensions of the mask matrix
|
|
* @param nExWidth Size of padded input width
|
|
*/
|
|
|
|
__kernel void simpleNonSeparableConvolution(__global uint * input,
|
|
__global float * mask,
|
|
__global int * output,
|
|
const uint2 inputDimensions,
|
|
const uint2 maskDimensions,
|
|
const uint nExWidth)
|
|
{
|
|
uint tid = get_global_id(0);
|
|
|
|
uint width = inputDimensions.x;
|
|
uint height = inputDimensions.y;
|
|
|
|
uint x = tid%width;
|
|
uint y = tid/width;
|
|
|
|
uint maskWidth = maskDimensions.x;
|
|
uint maskHeight = maskDimensions.y;
|
|
|
|
if(x >= width || y >= height)
|
|
return;
|
|
|
|
/*
|
|
* initializing weighted sum value
|
|
*/
|
|
float sumFX = 0.0f;
|
|
int m = 0, n = 0;
|
|
|
|
//performing weighted sum within the mask boundaries
|
|
for(uint j = y ; j < (y + maskHeight); ++j, m++)
|
|
{
|
|
n = 0;
|
|
for(uint i = x; i < (x + maskWidth); ++i, n++)
|
|
{
|
|
uint maskIndex = m * maskWidth + n;
|
|
uint index = j * nExWidth + i;
|
|
|
|
sumFX += ((float)input[index] * mask[maskIndex]);
|
|
}
|
|
}
|
|
|
|
sumFX += 0.5f;
|
|
output[tid] = (int)sumFX;
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
* SeparableConvolution
|
|
* is product of 2 one-dimensional convolution.
|
|
* A 2-dimensional convolution operation is separated into 2 one one-dimensional convolution.
|
|
* SeparableConvolution is implemented in two passes.
|
|
* The first pass is called Row-wise convolution.
|
|
* And second pass is called Column-wise convolution.
|
|
*/
|
|
|
|
/**
|
|
* First Pass - Row-wise convolution
|
|
* @param input Input matrix on which convolution is to be performed
|
|
* @param rowFilter rowFilter vector using which row-wise convolution was to be performed
|
|
* @param tmpOutput Output matrix after performing first pass convolution
|
|
* @param inputDimensions dimensions of the input matrix
|
|
* @param filterSize length of row filter vector
|
|
* @param exInputDimensions dimensions of padded input
|
|
*/
|
|
__kernel void simpleSeparableConvolutionPass1(__global uint * input,
|
|
__global float * rowFilter,
|
|
__global float * tmpOutput,
|
|
const uint2 inputDimensions,
|
|
const uint filterSize,
|
|
const uint2 exInputDimensions)
|
|
{
|
|
int i = 0, cnt = 0;
|
|
|
|
uint width = inputDimensions.x;
|
|
uint height = inputDimensions.y;
|
|
|
|
uint tid = get_global_id(0);
|
|
uint x = tid%width;
|
|
uint y = tid/width;
|
|
|
|
if(x >= width || y >= (height+filterSize-1))
|
|
return;
|
|
|
|
/*
|
|
* initializing weighted sum value
|
|
*/
|
|
float sum = 0.0f;
|
|
|
|
for(uint i = x; i < (x + filterSize); ++i) {
|
|
sum = mad((float)input[y * exInputDimensions.x + i], rowFilter[cnt++], sum);
|
|
}
|
|
|
|
/* Transposed save */
|
|
tmpOutput[x * exInputDimensions.y + y] = sum;
|
|
}
|
|
|
|
/**
|
|
* Second Pass - Column-wise convolution
|
|
* @param input Input matrix on which convolution is to be performed
|
|
* @param colFilter colFilter vector using which column-wise convolution was to be performed
|
|
* @param Output Output matrix after performing second pass convolution
|
|
* @param inputDimensions dimensions of the input matrix
|
|
* @param filterSize length of col filter vector
|
|
* @param exInputDimensions dimensions of padded input
|
|
*/
|
|
__kernel void simpleSeparableConvolutionPass2(__global float * input,
|
|
__global float * colFilter,
|
|
__global int * output,
|
|
const uint2 inputDimensions,
|
|
const uint filterSize,
|
|
const uint2 exInputDimensions)
|
|
{
|
|
int i = 0, cnt = 0;
|
|
|
|
uint width = inputDimensions.x;
|
|
uint height = inputDimensions.y;
|
|
|
|
uint tid = get_global_id(0);
|
|
uint x = tid%height;
|
|
uint y = tid/height;
|
|
|
|
if(y >= width || x >= height)
|
|
return;
|
|
|
|
/*
|
|
* initializing wighted sum value
|
|
*/
|
|
float sum = 0.0f;
|
|
|
|
for(uint i = x; i < (x + filterSize); ++i) {
|
|
sum = mad(input[y * exInputDimensions.y + i], colFilter[cnt++], sum);
|
|
}
|
|
|
|
/* Tranposed save */
|
|
sum += 0.5f;
|
|
output[x * width + y] = (int)sum;
|
|
}
|