rocrtst: Test for shader access after async_copy (#645)

New test that does a memory_copy, and right after has the shader access the data. This verifies that the memory is coherent and that all the probes and flushes were done correctly by the memory_copy. Signed-off-by: Sunday Clement <Sunday.Clement@amd.com>
2025-09-09 15:03:56 -04:00
@@ -505,4 +505,213 @@ void MemoryAccessTest::GPUAccessToCPUMemoryTest(void) {
  }
 }

+
+void MemoryAccessTest::MemoryAccessCoherentTest(void) {
+  hsa_status_t err;
+  
+  PrintMemorySubtestHeader("MemoryAccessCoherentTest in Memory Pools");
+
+  // find all cpu agents
+  std::vector<hsa_agent_t> cpus;
+  err = hsa_iterate_agents(rocrtst::IterateCPUAgents, &cpus);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  // find all gpu agents
+  std::vector<hsa_agent_t> gpus;
+  err = hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // - allocate queue
+  hsa_queue_t *queue = NULL;
+  uint32_t queue_size = 0;
+
+  ASSERT_SUCCESS(hsa_agent_get_info(gpus[0], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size));
+
+  err = hsa_queue_create(gpus[0], queue_size, HSA_QUEUE_TYPE_SINGLE, nullptr, nullptr, 0, 0, &queue);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  
+  int* host_buffer_src = nullptr;
+  int* host_buffer_dst = nullptr;
+
+  //getting the host memory pool
+  hsa_amd_memory_pool_t host_memory_pool;
+  err = hsa_amd_agent_iterate_memory_pools(cpus[0], rocrtst::GetGlobalMemoryPool, &host_memory_pool);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  //getting the device memory pool
+  hsa_amd_memory_pool_t device_pool;
+  err = hsa_amd_agent_iterate_memory_pools(gpus[0], rocrtst::GetGlobalMemoryPool, &device_pool);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+
+  // - allocate buffer in host memory
+  err = hsa_amd_memory_pool_allocate(host_memory_pool, kMemoryAllocSize*sizeof(int), 0, reinterpret_cast<void **>(&host_buffer_src));
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  err = hsa_amd_memory_pool_allocate(host_memory_pool, kMemoryAllocSize*sizeof(int), 0, reinterpret_cast<void **>(&host_buffer_dst));
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // - fill host-buffer-src with increasing integers pattern
+  int* src = static_cast<int*>(host_buffer_src);
+  int* dst = static_cast<int*>(host_buffer_dst);
+  for (int j = 0; j < kMemoryAllocSize; ++j) 
+    host_buffer_src[j] = j;
+  
+  // - memset host-buffer-dst with a different pattern 
+  memset(host_buffer_dst, 0xA5, kMemoryAllocSize * sizeof(int));
+
+  // - Allocate a buffer in Device Memory
+  int* device_buffer_src = nullptr;
+  int* device_buffer_dst = nullptr;
+
+  err = hsa_amd_memory_pool_allocate(device_pool, kMemoryAllocSize * sizeof(int), 0, reinterpret_cast<void**>(&device_buffer_src));
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  err = hsa_amd_memory_pool_allocate(device_pool, kMemoryAllocSize * sizeof(int), 0, reinterpret_cast<void**>(&device_buffer_dst));
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // - Allow GPU access to host memory
+  err = hsa_amd_agents_allow_access(1, &gpus[0], NULL, host_buffer_src);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  err = hsa_amd_agents_allow_access(1, &gpus[0], NULL, host_buffer_dst);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  err = hsa_amd_agents_allow_access(1, &gpus[0], NULL, device_buffer_src);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  err = hsa_amd_agents_allow_access(1, &gpus[0], NULL, device_buffer_dst);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  err = hsa_amd_agents_allow_access(1, &cpus[0], NULL, device_buffer_src);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  err = hsa_amd_agents_allow_access(1, &cpus[0], NULL, device_buffer_dst);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+
+  // - Create new signals
+  hsa_signal_t signal_shader_start = {0};  // start signal
+  err = hsa_signal_create(1, 0, NULL, &signal_shader_start);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  hsa_signal_t signal_shader_end = {0};  // completion signal
+  err = hsa_signal_create(1, 0, NULL, &signal_shader_end);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  hsa_signal_t signal_test_end = {0};  // test end signal
+  err = hsa_signal_create(1, 0, NULL, &signal_test_end);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // - handle kernel args
+  args* k_Args = NULL;
+
+  // Find a memory pool that supports kernel args
+  hsa_amd_memory_pool_t kernarg_pool;
+  err = hsa_amd_agent_iterate_memory_pools(cpus[0],
+                                            rocrtst::GetKernArgMemoryPool,
+                                            &kernarg_pool);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // Allocate the kernel argument buffer from the kernarg_pool.
+  err = hsa_amd_memory_pool_allocate(kernarg_pool, sizeof(args_t), 0,
+                                      reinterpret_cast<void **>(&k_Args));
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);  
+
+  err = hsa_amd_agents_allow_access(1, &gpus[0], NULL, k_Args);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // vector_copy kernel args
+  k_Args->a = device_buffer_src;
+  k_Args->b = device_buffer_dst;
+  k_Args->c = nullptr;
+
+
+  // - Place a Barrier-Value packet into the queue
+  // that waits on signal-shader-start to change from 1 to 0.
+  hsa_barrier_and_packet_t barrier_pkt;
+  memset(&barrier_pkt, 0, sizeof(barrier_pkt));
+  barrier_pkt.header = HSA_PACKET_TYPE_BARRIER_AND;
+  barrier_pkt.completion_signal = {0};
+
+  // Set signal dependency
+  barrier_pkt.dep_signal[0] = signal_shader_start;
+  for (int i = 1; i < 5; ++i) {
+      barrier_pkt.dep_signal[i] = {0};
+  }
+
+  uint64_t index = hsa_queue_load_write_index_relaxed(queue);
+  hsa_queue_store_write_index_relaxed(queue, index + 1);
+  reinterpret_cast<hsa_barrier_and_packet_t*>(queue->base_address)[index % queue->size] = barrier_pkt;
+
+  // - Place Dispatch packet into the queue
+  hsa_kernel_dispatch_packet_t dispatch_pkt;
+  memset(&dispatch_pkt, 0, sizeof(dispatch_pkt));
+
+  // - create kernel object for dispatch
+  set_kernel_file_name("vector_copy_kernels.hsaco");
+  set_kernel_name("vector_copy");
+  err = rocrtst::LoadKernelFromObjFile(this, &gpus[0]);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // initialize aql packet
+  dispatch_pkt.workgroup_size_x = 256;
+  dispatch_pkt.workgroup_size_y = 1;
+  dispatch_pkt.workgroup_size_z = 1;
+  dispatch_pkt.grid_size_x = kMemoryAllocSize;
+  dispatch_pkt.grid_size_y = 1;
+  dispatch_pkt.grid_size_z = 1;
+  dispatch_pkt.private_segment_size = 0;
+  dispatch_pkt.grid_size_x = kMemoryAllocSize;
+  dispatch_pkt.kernel_object = kernel_object(); // Assumes kernel loaded
+  dispatch_pkt.kernarg_address = k_Args;
+  dispatch_pkt.completion_signal = signal_shader_end;
+
+  const uint32_t queue_mask = queue->size - 1;
+
+  index = hsa_queue_load_write_index_relaxed(queue);
+  hsa_queue_store_write_index_relaxed(queue, index + 1);
+
+  rocrtst::WriteAQLToQueueLoc(queue, index, &dispatch_pkt);
+
+  hsa_kernel_dispatch_packet_t *q_base_addr =
+      reinterpret_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address);
+  rocrtst::AtomicSetPacketHeader(
+      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
+          (1 << HSA_PACKET_HEADER_BARRIER) |
+        (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
+          (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE),
+                (1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
+      reinterpret_cast<hsa_kernel_dispatch_packet_t *>
+                                        (&q_base_addr[index & queue_mask]));
+
+
+  err = hsa_amd_memory_async_copy(device_buffer_src, gpus[0], host_buffer_src, cpus[0],
+                                  kMemoryAllocSize * sizeof(int), 0, NULL, signal_shader_start);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  hsa_signal_t dep_signals[1] = { signal_shader_end };
+  err = hsa_amd_memory_async_copy(host_buffer_dst, cpus[0], device_buffer_dst, gpus[0],
+                                  kMemoryAllocSize * sizeof(int), 1, dep_signals, signal_test_end);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // - Ring the doorbell to start the queue
+  index = hsa_queue_load_write_index_relaxed(queue);
+  hsa_signal_store_relaxed(queue->doorbell_signal, index);
+
+
+  // - Wait for the signal-test-end to change from 1 to 0.
+  while (hsa_signal_wait_scacquire(signal_test_end,
+      HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1, HSA_WAIT_STATE_ACTIVE)) {}
+
+  // - Verify that host-buffer-src and host-buffer-dst have equal values.
+  for (int i = 0; i < kMemoryAllocSize; ++i) {
+    ASSERT_EQ(host_buffer_src[i], host_buffer_dst[i]);
+  }
+
+  //clean up allocated handles
+  if(host_buffer_src) { hsa_amd_memory_pool_free(host_buffer_src); }
+  if(host_buffer_dst) { hsa_amd_memory_pool_free(host_buffer_dst); }
+  if(device_buffer_src) { hsa_amd_memory_pool_free(device_buffer_src); }
+  if(device_buffer_dst) { hsa_amd_memory_pool_free(device_buffer_dst); }
+  if (signal_shader_start.handle) { hsa_signal_destroy(signal_shader_start); }
+  if (signal_shader_end.handle) { hsa_signal_destroy(signal_shader_end); }
+  if (signal_test_end.handle) { hsa_signal_destroy(signal_test_end); }
+  if (k_Args) { hsa_amd_memory_pool_free(k_Args); }
+  if (queue) { hsa_queue_destroy(queue); }
+
+}
+
 #undef RET_IF_HSA_ERR
@@ -79,6 +79,9 @@ class MemoryAccessTest : public TestBase {
  // @Brief: This test verify that GPU is able to Read & write CPU memory
  void GPUAccessToCPUMemoryTest(void);

+  // @Brief: This test verifies that memory accessed on the GPU after a copy is coherent
+  void MemoryAccessCoherentTest(void);
+

 private:
  void CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent,
@@ -376,6 +376,10 @@ set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
 set(CL_FILE_LIST "${KERNELS_DIR}/gpuReadWrite_kernels.cl")
 build_sample_for_devices("gpuReadWrite")

+# Vector Copy
+set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
+set(CL_FILE_LIST "${KERNELS_DIR}/vector_copy_kernel.cl")
+build_sample_for_devices("vector_copy")

 # Vector Add Debug Trap
 set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
@@ -0,0 +1,51 @@
+/*
+ * =============================================================================
+ *   ROC Runtime Conformance Release License
+ * =============================================================================
+ * The University of Illinois/NCSA
+ * Open Source License (NCSA)
+ *
+ * Copyright (c) 2025, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Developed by:
+ *
+ *                 AMD Research and AMD ROC Software Development
+ *
+ *                 Advanced Micro Devices, Inc.
+ *
+ *                 www.amd.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal with the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimers.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimers in
+ *    the documentation and/or other materials provided with the distribution.
+ *  - Neither the names of <Name of Development Group, Name of Institution>,
+ *    nor the names of its contributors may be used to endorse or promote
+ *    products derived from this Software without specific prior written
+ *    permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS WITH THE SOFTWARE.
+ *
+ */
+
+__kernel void vector_copy(__global const int * a,
+                           __global int * b, __global int * c) {
+  int i = get_global_id(0);
+  // Reading the device buffer and writing to separate device buffer
+  b[i] = a[i];  // both a[i] and b[i] point to device memory.
+}
@@ -139,6 +139,13 @@ TEST(rocrtstFunc, MemoryAccessTests) {
  RunCustomTestEpilog(&mt);
 }

+TEST(rocrtstFunc, MemoryAccessCoherent) {
+  MemoryAccessTest mt;
+  RunCustomTestProlog(&mt);
+  mt.MemoryAccessCoherentTest();
+  RunCustomTestEpilog(&mt);
+}
+
 TEST(rocrtstFunc, GroupMemoryAllocationTest) {
  MemoryAllocationTest ma(true, false);
  RunCustomTestProlog(&ma);