diff --git a/rocrtst/Kernels/CMakeLists.txt b/rocrtst/Kernels/CMakeLists.txt index c2f7506ca6..408e94c19c 100644 --- a/rocrtst/Kernels/CMakeLists.txt +++ b/rocrtst/Kernels/CMakeLists.txt @@ -152,6 +152,8 @@ function(buildCodeObjects kname) endfunction(buildCodeObjects) +buildCodeObjects("read") +buildCodeObjects("write") buildCodeObjects("binary_search") # diff --git a/rocrtst/Kernels/read_kernel.cl b/rocrtst/Kernels/read_kernel.cl new file mode 100644 index 0000000000..c4e418e2df --- /dev/null +++ b/rocrtst/Kernels/read_kernel.cl @@ -0,0 +1,43 @@ + +/** + * @brief OpenCL kernel to read from a buffer and sum its values + * into a destination integer + * + * @param src Pointer to an array of 16 unsigned integers (32-bit) i.e. one instance + * has 16 * 32-bit = 64 bytes + * + * @param size Specifies number of uint16 elements in the array + * + * @param threads Number of threads running this kernel + * + * @param dst Output parameter updated with sum of the input buffer + * + * @note: It is critical that the size of 'src' be an integral multiple + * of (threads * sizeof(uint16)). If it is fractional and less than ONE + * it will lead to accessing memory that is out-of-bounds. 
If it is fractional
+ * but more than ONE then it will lead to some threads not doing work
+ * at all, leading to incorrect benchmark computation
+ *
+ */
+
+__kernel void
+  read_kernel(__global uint16 *src,
+              ulong size, uint threads, __global uint* dst) {
+  // One past the last element; computed before src is repositioned below.
+  __global uint16 *limit = src + size;
+  int gid = get_global_id(0);
+  uint partial = 0;
+
+  // Start at this work-item's element and advance by 'threads' per pass.
+  for (src = &src[gid]; src < limit; src += threads) {
+    uint16 v = *src;
+    partial += v.s0 + v.s1 + v.s2 + v.s3 + \
+               v.s4 + v.s5 + v.s6 + v.s7 + \
+               v.s8 + v.s9 + v.sa + v.sb + \
+               v.sc + v.sd + v.se + v.sf;
+  }
+
+  // Fold the private partial sum into the shared result.
+  atomic_add(dst, partial);
+}
+
diff --git a/rocrtst/Kernels/write_kernel.cl b/rocrtst/Kernels/write_kernel.cl
new file mode 100644
index 0000000000..e223e43ac9
--- /dev/null
+++ b/rocrtst/Kernels/write_kernel.cl
@@ -0,0 +1,40 @@
+
+/**
+ * @brief Opencl kernel to write into a buffer the values of const integer list
+ *
+ * @param dst Pointer to an array of 16 unsigned integers (32-bit) i.e. one instance
+ * has 16 * 32-bit = 64 bytes
+ *
+ * @param size Specifies number of uint16 elements in the array
+ *
+ * @param threads Number of threads running this kernel
+ *
+ * @note: It is critical that the size of 'dst' be an integral multiple
+ * of (threads * sizeof(uint16)). If it is fractional and less than ONE
+ * it will lead to accessing memory that is out-of-bounds. 
If it is fractional
+ * and more than ONE then some threads will not do any work at all,
+ * leading to incorrect benchmark computation.
+ * The kernel tests the bound before every store, so it never writes at or past dst + size.
+ */
+
+__kernel void
+  write_kernel(__global uint16 *dst,
+               ulong size, uint threads) {
+
+  uint16 pval = (uint16)(0xabababab, 0xabababab, 0xabababab, 0xabababab,
+                         0xabababab, 0xabababab, 0xabababab, 0xabababab,
+                         0xabababab, 0xabababab, 0xabababab, 0xabababab,
+                         0xabababab, 0xabababab, 0xabababab, 0xabababab);
+
+  int idx = get_global_id(0);
+  __global uint16 *dstEnd = dst + size;  // one past the last element
+
+  dst = &dst[idx];  // first element owned by this work-item
+  while (dst < dstEnd) {  // pre-tested: a work-item with idx >= size stores nothing
+    *dst = pval;
+    dst += threads;  // stride by the total number of work-items
+  }
+
+}
+
+