rocm-systems/projects/aqlprofile/test/simple_convolution/simple_convolution.cl



/**
 * simple_convolution computes each output pixel as the weighted sum of the
 * neighborhood of the corresponding input pixel. The neighborhood extent is
 * given by the mask dimensions and the weight of each neighbor by the mask
 * itself.
 *
 * One work-item handles one output pixel: the work-item's global id in
 * dimension 0 is used as a row-major linear pixel index
 * (x = id % width, y = id / width). Work-items whose index falls outside
 * the image return without touching memory.
 *
 * Near the image borders the neighborhood is clamped to the image, so only
 * the mask taps that overlap the image contribute (no padding is applied).
 * The accumulated float sum is rounded by adding 0.5 and casting to uint.
 * An empty mask (either dimension 0) yields an output of 0.
 *
 * @param output Output matrix after performing convolution
 * @param input  Input matrix on which convolution is to be performed
 * @param mask   Mask matrix of weights, indexed row-major with a row
 *               stride of maskDimensions.x
 * @param inputDimensions dimensions of the input matrix (x = width,
 *                        y = height)
 * @param maskDimensions  dimensions of the mask matrix (x = width,
 *                        y = height)
 */
__kernel void simple_convolution(__global  uint  * output,
                                 __global  uint  * input,
                                 __global  float  * mask,
                                 const     uint2  inputDimensions,
                                 const     uint2  maskDimensions) {

  const uint tid    = get_global_id(0);

  const uint width  = inputDimensions.x;
  const uint height = inputDimensions.y;

  // Reject work-items that do not map to a pixel. This also covers
  // width == 0 (checked first, so the division below is never by zero) and
  // launches whose global size is larger than width * height. Comparing the
  // row index against height avoids computing width * height, which could
  // overflow uint.
  if (width == 0 || tid / width >= height) {
    return;
  }

  const uint x = tid % width;
  const uint y = tid / width;

  const uint maskWidth  = maskDimensions.x;
  const uint maskHeight = maskDimensions.y;

  // An empty mask has no taps: the weighted sum is 0. Handling it here also
  // keeps (maskDim - 1) below from wrapping around to a huge half-extent.
  if (maskWidth == 0 || maskHeight == 0) {
    output[tid] = 0;
    return;
  }

  // Half-extent of the mask around the center pixel, per axis.
  const uint halfMaskW = (maskWidth  - 1) / 2;
  const uint halfMaskH = (maskHeight - 1) / 2;

  // Find the left, right, top and bottom indices such that the indices
  // do not go beyond the image boundaries.
  const uint left   = (x             <  halfMaskW) ? 0          : (x - halfMaskW);
  const uint right  = ((x + halfMaskW) >= width)   ? width - 1  : (x + halfMaskW);
  const uint top    = (y             <  halfMaskH) ? 0          : (y - halfMaskH);
  const uint bottom = ((y + halfMaskH) >= height)  ? height - 1 : (y + halfMaskH);

  // Weighted sum accumulator.
  float sumFX = 0.0f;

  // Columns outer, rows inner: the accumulation order is kept fixed so the
  // floating-point result is reproducible.
  for (uint i = left; i <= right; ++i) {
    // Mask column of image column i. Since i >= x - halfMaskW inside the
    // clamped window, (i + halfMaskW) - x is the offset from the mask's
    // left edge.
    const uint maskCol = (i + halfMaskW) - x;

    for (uint j = top; j <= bottom; ++j) {
      // Mask row of image row j, by the same reasoning as maskCol.
      const uint maskRow   = (j + halfMaskH) - y;
      const uint maskIndex = maskRow * maskWidth + maskCol;
      const uint index     = j       * width     + i;

      sumFX += ((float)input[index] * mask[maskIndex]);
    }
  }

  // Round to the nearest integer: add 0.5, then let the cast truncate.
  // NOTE(review): this is only a correct rounding for non-negative sums;
  // confirm masks with negative weights are not expected to drive the sum
  // below zero.
  sumFX += 0.5f;
  output[tid] = (uint)sumFX;
}