Adding kernels to read / write buffers

Change-Id: Icad95c084e0fcd0bd9f86154e23ac8f54c24afbe
2017-10-03 17:20:01 -05:00
parent 9ba83d83f7
commit 34602f7e95
3 changed files with 85 additions and 0 deletions
@@ -152,6 +152,8 @@ function(buildCodeObjects kname)

 endfunction(buildCodeObjects)

+buildCodeObjects("read")    # kname "read": presumably the read_kernel source added alongside this call -- confirm
+buildCodeObjects("write")   # kname "write": presumably the write_kernel source added alongside this call -- confirm
 buildCodeObjects("binary_search")

 #
@@ -0,0 +1,43 @@
+
+/**
+ * @brief OpenCL kernel that reads a buffer and accumulates the sum of
+ * its 32-bit values into a destination integer
+ *
+ * @param src Pointer to an array of uint16 vectors; each element holds
+ * 16 unsigned 32-bit integers, i.e. 16 * 32-bit = 64 bytes
+ *
+ * @param size Number of uint16 elements in the array
+ *
+ * @param threads Stride, in elements, between consecutive reads of one
+ * work-item; the host is expected to pass the number of work-items
+ *
+ * @param dst Output location; each work-item atomically adds its partial
+ * sum to it, so the result wraps modulo 2^32
+ *
+ * @note Work-item i reads elements i, i + threads, i + 2 * threads, ...
+ * while they lie below 'size'. 'size' should be an integral multiple of
+ * 'threads'; otherwise the work-items do unequal amounts of work, which
+ * skews the benchmark computation.
+ */
+
+__kernel void
+  read_kernel(__global uint16 *src,
+              ulong size, uint threads, __global uint* dst) {
+
+  const int first = get_global_id(0);
+  __global uint16 *const limit = src + size;
+  uint partial = 0;
+
+  /* Each work-item walks the buffer in steps of 'threads' elements. */
+  for (__global uint16 *cur = &src[first]; cur < limit; cur += threads) {
+    const uint16 v = *cur;
+    partial += v.s0 + v.s1 + v.s2 + v.s3 +
+               v.s4 + v.s5 + v.s6 + v.s7 +
+               v.s8 + v.s9 + v.sa + v.sb +
+               v.sc + v.sd + v.se + v.sf;
+  }
+
+  /* Fold this work-item's partial sum into the shared result. */
+  atomic_add(dst, partial);
+}
+
@@ -0,0 +1,40 @@
+
+/**
+ * @brief OpenCL kernel that fills a buffer with a constant 32-bit pattern
+ *
+ * @param dst Pointer to an array of uint16 vectors; each element holds
+ * 16 unsigned 32-bit integers, i.e. 16 * 32-bit = 64 bytes
+ *
+ * @param size Number of uint16 elements in the array
+ *
+ * @param threads Stride, in elements, between consecutive writes of one
+ * work-item; the host is expected to pass the number of work-items
+ *
+ * @note Work-item i writes elements i, i + threads, i + 2 * threads, ...
+ * while they lie below 'size'. The bound is tested before every store, so
+ * a work-item whose id is >= 'size' writes nothing instead of storing past
+ * the end of 'dst'. For a meaningful benchmark 'size' should still be an
+ * integral multiple of 'threads' so that all work-items do equal work.
+ */
+
+__kernel void
+  write_kernel(__global uint16 *dst,
+               ulong size, uint threads) {
+
+  /* Fill pattern: every one of the 16 lanes carries 0xabababab. */
+  uint16 pval = (uint16)(0xabababab, 0xabababab, 0xabababab, 0xabababab,
+                         0xabababab, 0xabababab, 0xabababab, 0xabababab,
+                         0xabababab, 0xabababab, 0xabababab, 0xabababab,
+                         0xabababab, 0xabababab, 0xabababab, 0xabababab);
+
+  const ulong first = get_global_id(0);
+
+  /* Test the bound BEFORE each store. A do/while would let a work-item
+   * whose id is already >= size perform one out-of-bounds write. The
+   * index is ulong so it can be compared with 'size' without narrowing. */
+  for (ulong i = first; i < size; i += threads) {
+    dst[i] = pval;
+  }
+}
+
+