EXSWHTEC-259 - Implement new and extend existing tests for thread_block #151

Change-Id: I4d85af8e5f805a1c8ff7b7efd3b3400f38343ea9 [ROCm/hip-tests commit: 4fa177dd2b]
2023-12-29 16:02:50 +00:00
@@ -30,6 +30,9 @@ int main(int argc, char** argv) {
    | Opt(cmd_options.progress)
        ["-P"]["--progress"]
        ("Show progress bar when running performance tests")
+    | Opt(cmd_options.cg_iterations, "cg_iterations")
+        ["-E"]["--cg-iterations"]
+        ("Number of iterations used for cooperative groups sync tests (default: 5)")
    | Opt(cmd_options.cg_extended_run, "cg_extened_run")
        ["-E"]["--cg-extended-run"]
        ("TODO: Description goes here")
@@ -144,21 +144,6 @@ THE SOFTWARE.
 * @}
 */

- /**
- * @defgroup StreamOTest Ordered Memory Allocator
- * @{
- * This section describes the tests for Stream Ordered Memory Allocator functions of HIP runtime
- * API.
- * @}
- */
-
-/**
- * @defgroup StreamOTest Ordered Memory Allocator
- * @{
- * This section describes the tests for Stream Ordered Memory Allocator functions of HIP runtime
- * API.
- */
-
 /**
 * @defgroup StreamOTest Ordered Memory Allocator
 * @{
@@ -173,13 +158,6 @@ THE SOFTWARE.
 * @}
 */

-/**
- * @defgroup PerformanceTest Performance tests
- * @{
- * This section describes performance tests for the target API groups and use-cases.
- * @}
- */
-
 /**
 * @defgroup TextureTest Texture Management
 * @{
@@ -209,7 +187,6 @@ THE SOFTWARE.
 */

 /**
- * @defgroup ComplexTest Complex type
 * @{
 * This section describes tests for the Complex type functions.
 * @}
@@ -19,7 +19,6 @@
 # THE SOFTWARE.

 add_subdirectory(memset)
-add_subdirectory(memcpy)
 add_subdirectory(kernelLaunch)
 add_subdirectory(stream)
 add_subdirectory(event)
@@ -47,7 +47,6 @@ add_subdirectory(atomics)
 add_subdirectory(complex)
 add_subdirectory(p2p)
 add_subdirectory(gcc)
-add_subdirectory(vector_types)

 if(HIP_PLATFORM STREQUAL "amd")
 add_subdirectory(callback)
@@ -31,6 +31,14 @@ constexpr size_t kWarpSize = 64;
 constexpr int kMaxGPUs = 8;
 }  // namespace

+constexpr int MaxGPUs = 8;
+
+inline bool operator==(const dim3& l, const dim3& r) {
+  return l.x == r.x && l.y == r.y && l.z == r.z;
+}
+
+inline bool operator!=(const dim3& l, const dim3& r) { return !(l == r); }
+
 __device__ inline unsigned int thread_rank_in_grid() {
  const auto block_size = blockDim.x * blockDim.y * blockDim.z;
  const auto block_rank_in_grid = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
@@ -67,4 +75,4 @@ template <class T> bool CheckDimensions(unsigned int device, T kernel, dim3 bloc
  }

  return true;
-}
+}
@@ -0,0 +1,350 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "cooperative_groups_common.hh"
+
+#include <cpu_grid.h>
+#include <optional>
+#include <resource_guards.hh>
+#include <utils.hh>
+
+#include <cmd_options.hh>
+
+/**
+ * @addtogroup thread_block thread_block
+ * @{
+ * @ingroup DeviceLanguageTest
+ * Contains unit tests for all thread_block APIs
+ */
+
+namespace cg = cooperative_groups;
+
+template <typename BaseType = cg::thread_block>
+static __global__ void thread_block_size_getter(unsigned int* sizes) {
+  const BaseType group = cg::this_thread_block();
+  sizes[thread_rank_in_grid()] = group.size();
+}
+
+template <typename BaseType = cg::thread_block>
+static __global__ void thread_block_thread_rank_getter(unsigned int* thread_ranks) {
+  const BaseType group = cg::this_thread_block();
+  thread_ranks[thread_rank_in_grid()] = group.thread_rank();
+}
+
+static __global__ void thread_block_group_indices_getter(dim3* group_indices) {
+  group_indices[thread_rank_in_grid()] = cg::this_thread_block().group_index();
+}
+
+static __global__ void thread_block_thread_indices_getter(dim3* thread_indices) {
+  thread_indices[thread_rank_in_grid()] = cg::this_thread_block().thread_index();
+}
+
+static __global__ void thread_block_non_member_size_getter(unsigned int* sizes) {
+  sizes[thread_rank_in_grid()] = cg::group_size(cg::this_thread_block());
+}
+
+static __global__ void thread_block_non_member_thread_rank_getter(unsigned int* thread_ranks) {
+  thread_ranks[thread_rank_in_grid()] = cg::thread_rank(cg::this_thread_block());
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Launches kernels that write the return values of size, thread_rank, group_index, and
+ * thread_index member functions to an output array that is validated on the host side. The kernels
+ * are run sequentially, reusing the output array, to avoid running out of device memory for large
+ * kernel launches.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/thread_block.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_Thread_Block_Getters_Positive_Basic") {
+  const auto blocks = GenerateBlockDimensions();
+  const auto threads = GenerateThreadDimensions();
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+  const CPUGrid grid(blocks, threads);
+
+  {
+    LinearAllocGuard<unsigned int> uint_arr_dev(LinearAllocs::hipMalloc,
+                                                grid.thread_count_ * sizeof(unsigned int));
+    LinearAllocGuard<unsigned int> uint_arr(LinearAllocs::hipHostMalloc,
+                                            grid.thread_count_ * sizeof(unsigned int));
+
+    thread_block_size_getter<<<blocks, threads>>>(uint_arr_dev.ptr());
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
+                        grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+    thread_block_thread_rank_getter<<<blocks, threads>>>(uint_arr_dev.ptr());
+    HIP_CHECK(hipGetLastError());
+
+    // Validate thread_block.size() values
+    ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
+               [size = grid.threads_in_block_count_](uint32_t) { return size; });
+
+    HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
+                        grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // Validate thread_block.thread_rank() values
+    ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
+               [&grid](uint32_t i) { return grid.thread_rank_in_block(i).value(); });
+  }
+
+  {
+    LinearAllocGuard<dim3> dim3_arr_dev(LinearAllocs::hipMalloc, grid.thread_count_ * sizeof(dim3));
+    LinearAllocGuard<dim3> dim3_arr(LinearAllocs::hipHostMalloc, grid.thread_count_ * sizeof(dim3));
+
+    thread_block_group_indices_getter<<<blocks, threads>>>(dim3_arr_dev.ptr());
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipMemcpy(dim3_arr.ptr(), dim3_arr_dev.ptr(),
+                        grid.thread_count_ * sizeof(*dim3_arr.ptr()), hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+    thread_block_thread_indices_getter<<<blocks, threads>>>(dim3_arr_dev.ptr());
+    HIP_CHECK(hipGetLastError());
+
+    // Validate thread_block.group_index() values
+    ArrayAllOf(dim3_arr.ptr(), grid.thread_count_,
+               [&grid](uint32_t i) { return grid.block_idx(i).value(); });
+
+    HIP_CHECK(hipMemcpy(dim3_arr.ptr(), dim3_arr_dev.ptr(),
+                        grid.thread_count_ * sizeof(*dim3_arr.ptr()), hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // Validate thread_block.thread_index() values
+    ArrayAllOf(dim3_arr.ptr(), grid.thread_count_,
+               [&grid](uint32_t i) { return grid.thread_idx(i).value(); });
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Launches kernels that write the return values of size and thread_rank member functions to an
+ * output array that is validated on the host side, while treating the thread block as a thread
+ * group. The kernels are run sequentially, reusing the output array, to avoid running out of device
+ * memory for large kernel launches.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/thread_block.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_Thread_Block_Getters_Via_Base_Type_Positive_Basic") {
+  const auto blocks = GenerateBlockDimensions();
+  const auto threads = GenerateThreadDimensions();
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+
+  const CPUGrid grid(blocks, threads);
+
+  LinearAllocGuard<unsigned int> uint_arr_dev(LinearAllocs::hipMalloc,
+                                              grid.thread_count_ * sizeof(unsigned int));
+  LinearAllocGuard<unsigned int> uint_arr(LinearAllocs::hipHostMalloc,
+                                          grid.thread_count_ * sizeof(unsigned int));
+
+  thread_block_size_getter<cg::thread_group><<<blocks, threads>>>(uint_arr_dev.ptr());
+  HIP_CHECK(hipGetLastError());
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
+                      grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+  thread_block_thread_rank_getter<cg::thread_group><<<blocks, threads>>>(uint_arr_dev.ptr());
+  HIP_CHECK(hipGetLastError());
+
+  // Validate thread_block.size() values
+  ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
+             [size = grid.threads_in_block_count_](uint32_t) { return size; });
+
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
+                      grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  // Validate thread_block.thread_rank() values
+  ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
+             [&grid](uint32_t i) { return grid.thread_rank_in_block(i).value(); });
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Launches kernels that write the return values of size and thread_rank non-member functions
+ * to an output array that is validated on the host side. The kernels are run sequentially, reusing
+ * the output array, to avoid running out of device memory for large kernel launches.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/thread_block.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_Thread_Block_Getters_Via_Non_Member_Functions_Positive_Basic") {
+  const auto blocks = GenerateBlockDimensions();
+  const auto threads = GenerateThreadDimensions();
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+
+  const CPUGrid grid(blocks, threads);
+
+  LinearAllocGuard<unsigned int> uint_arr_dev(LinearAllocs::hipMalloc,
+                                              grid.thread_count_ * sizeof(unsigned int));
+  LinearAllocGuard<unsigned int> uint_arr(LinearAllocs::hipHostMalloc,
+                                          grid.thread_count_ * sizeof(unsigned int));
+
+  thread_block_non_member_size_getter<<<blocks, threads>>>(uint_arr_dev.ptr());
+  HIP_CHECK(hipGetLastError());
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
+                      grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+  thread_block_non_member_thread_rank_getter<<<blocks, threads>>>(uint_arr_dev.ptr());
+  HIP_CHECK(hipGetLastError());
+
+  // Validate thread_block.size() values
+  ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
+             [size = grid.threads_in_block_count_](uint32_t) { return size; });
+
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
+                      grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  // Validate thread_block.thread_rank() values
+  ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
+             [&grid](uint32_t i) { return grid.thread_rank_in_block(i).value(); });
+}
+
+
+template <bool use_global, typename T>
+__global__ void thread_block_sync_check(T* global_data, unsigned int* wait_modifiers,
+                                        unsigned int* read_offsets) {
+  extern __shared__ uint8_t shared_data[];
+  T* const data = use_global ? global_data : reinterpret_cast<T*>(shared_data);
+  const auto block = cg::this_thread_block();
+  constexpr T divisor = 255;
+  const auto tid = block.thread_rank();
+  const auto wait_modifier = wait_modifiers[tid];
+  const auto read_offset = read_offsets[tid];
+  busy_wait(wait_modifier);
+  data[tid] = tid % divisor;
+  block.sync();
+  bool valid = true;
+  for (auto i = 0; i < block.size(); ++i) {
+    const auto offset = block.size() + read_offset;
+    const auto expected = (tid + offset + i) % block.size();
+    if (!(valid &= (data[expected] == expected % divisor))) {
+      break;
+    }
+  }
+  block.sync();
+  data[tid] = valid;
+  if constexpr (!use_global) {
+    global_data[tid] = data[tid];
+  }
+}
+
+static inline std::mt19937& GetRandomGenerator() {
+  // With a static seed the tests will remain consistent between runs, yet it relieves the problem
+  // of predetermining a set of modifiers by hand. The sets of modifiers could actually be
+  // determined at compile time if std::random objects could operate in a constexpr context.
+  static std::mt19937 mt(17);
+  return mt;
+}
+
+template <typename T> static inline T GenerateRandomInteger(const T min, const T max) {
+  std::uniform_int_distribution<T> dist(min, max);
+  return dist(GetRandomGenerator());
+}
+
+template <bool global_memory, typename T> void ThreadBlockSyncTest() {
+  const auto randomized_run_count = GENERATE(range(0, cmd_options.cg_iterations));
+  INFO("Run number: " << randomized_run_count + 1);
+  const auto blocks = dim3(1, 1, 1);
+  const auto threads = GenerateThreadDimensions();
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+  CPUGrid grid(blocks, threads);
+
+  const auto alloc_size = grid.thread_count_ * sizeof(T);
+  int max_shared_mem_per_block = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&max_shared_mem_per_block,
+                                  hipDeviceAttributeMaxSharedMemoryPerBlock, 0));
+  if (!global_memory && max_shared_mem_per_block < alloc_size) {
+    return;
+  }
+  LinearAllocGuard<T> arr_dev(LinearAllocs::hipMalloc, alloc_size);
+  LinearAllocGuard<T> arr(LinearAllocs::hipHostMalloc, alloc_size);
+
+  LinearAllocGuard<unsigned int> wait_modifiers_dev(LinearAllocs::hipMalloc,
+                                                    grid.thread_count_ * sizeof(unsigned int));
+  LinearAllocGuard<unsigned int> wait_modifiers(LinearAllocs::hipHostMalloc,
+                                                grid.thread_count_ * sizeof(unsigned int));
+  std::generate(wait_modifiers.ptr(), wait_modifiers.ptr() + grid.thread_count_,
+                [&] { return GenerateRandomInteger(0u, 1500u); });
+
+  LinearAllocGuard<unsigned int> read_offsets_dev(LinearAllocs::hipMalloc,
+                                                  grid.thread_count_ * sizeof(unsigned int));
+  std::vector<unsigned int> read_offsets(grid.thread_count_, 0u);
+  if (randomized_run_count != 0) {
+    std::generate(read_offsets.begin(), read_offsets.end(),
+                  [&] { return GenerateRandomInteger(0u, grid.thread_count_); });
+  }
+
+  const auto shared_memory_size = global_memory ? 0u : alloc_size;
+  HIP_CHECK(hipMemcpy(wait_modifiers_dev.ptr(), wait_modifiers.ptr(),
+                      grid.thread_count_ * sizeof(unsigned int), hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(read_offsets_dev.ptr(), read_offsets.data(),
+                      grid.thread_count_ * sizeof(unsigned int), hipMemcpyHostToDevice));
+
+  thread_block_sync_check<global_memory><<<blocks, threads, shared_memory_size>>>(
+      arr_dev.ptr(), wait_modifiers_dev.ptr(), read_offsets_dev.ptr());
+  HIP_CHECK(hipGetLastError());
+
+  HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  REQUIRE(std::all_of(arr.ptr(), arr.ptr() + grid.thread_count_, [](unsigned int e) { return e; }));
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Launches a kernel wherein every thread writes its grid-wide linear index into an array. The
+ * array is either in global or dynamic shared memory based on a compile time switch, and the test
+ * is run for arrays of 1, 2, and 4 byte elements. Before the write each thread executes a busy wait
+ * loop for a random amount of clock cycles, the amount being read from an input array. After the
+ * write a block-wide sync is performed and each thread validates that it can read the expected
+ * values that other threads have written to their respective array slots. Each thread begins the
+ * validation from a given offset from its own index. For the first run of the test, all the offsets
+ * are zero, so memory reads should be coalesced as adjacent threads read from adjacent memory
+ * locations. On subsequent runs the offsets are randomized for each thread, leading to
+ * non-coalesced reads and cache thrashing.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/thread_block.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_Thread_Block_Sync_Positive_Basic", "", uint8_t, uint16_t, uint32_t) {
+  SECTION("Global memory") { ThreadBlockSyncTest<true, TestType>(); }
+  SECTION("Shared memory") { ThreadBlockSyncTest<false, TestType>(); }
+}