rocrtst: Test for shader access after async_copy (#645)

New test that does a memory_copy, and right after has the shader access
the data. This verifies that the memory is coherent and that all the
probes and flushes were done correctly by the memory_copy.

Signed-off-by: Sunday Clement <Sunday.Clement@amd.com>
Этот коммит содержится в:
Sunday Clement
2025-09-09 15:03:56 -04:00
коммит произвёл GitHub
родитель 1003c899ee
Коммит e9bb77614e
5 изменённых файлов: 274 добавлений и 0 удалений
+209
Просмотреть файл
@@ -505,4 +505,213 @@ void MemoryAccessTest::GPUAccessToCPUMemoryTest(void) {
}
}
void MemoryAccessTest::MemoryAccessCoherentTest(void) {
hsa_status_t err;
PrintMemorySubtestHeader("MemoryAccessCoherentTest in Memory Pools");
// find all cpu agents
std::vector<hsa_agent_t> cpus;
err = hsa_iterate_agents(rocrtst::IterateCPUAgents, &cpus);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// find all gpu agents
std::vector<hsa_agent_t> gpus;
err = hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// - allocate queue
hsa_queue_t *queue = NULL;
uint32_t queue_size = 0;
ASSERT_SUCCESS(hsa_agent_get_info(gpus[0], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size));
err = hsa_queue_create(gpus[0], queue_size, HSA_QUEUE_TYPE_SINGLE, nullptr, nullptr, 0, 0, &queue);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
int* host_buffer_src = nullptr;
int* host_buffer_dst = nullptr;
//getting the host memory pool
hsa_amd_memory_pool_t host_memory_pool;
err = hsa_amd_agent_iterate_memory_pools(cpus[0], rocrtst::GetGlobalMemoryPool, &host_memory_pool);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
//getting the device memory pool
hsa_amd_memory_pool_t device_pool;
err = hsa_amd_agent_iterate_memory_pools(gpus[0], rocrtst::GetGlobalMemoryPool, &device_pool);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// - allocate buffer in host memory
err = hsa_amd_memory_pool_allocate(host_memory_pool, kMemoryAllocSize*sizeof(int), 0, reinterpret_cast<void **>(&host_buffer_src));
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_memory_pool_allocate(host_memory_pool, kMemoryAllocSize*sizeof(int), 0, reinterpret_cast<void **>(&host_buffer_dst));
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// - fill host-buffer-src with increasing integers pattern
int* src = static_cast<int*>(host_buffer_src);
int* dst = static_cast<int*>(host_buffer_dst);
for (int j = 0; j < kMemoryAllocSize; ++j)
host_buffer_src[j] = j;
// - memset host-buffer-dst with a different pattern
memset(host_buffer_dst, 0xA5, kMemoryAllocSize * sizeof(int));
// - Allocate a buffer in Device Memory
int* device_buffer_src = nullptr;
int* device_buffer_dst = nullptr;
err = hsa_amd_memory_pool_allocate(device_pool, kMemoryAllocSize * sizeof(int), 0, reinterpret_cast<void**>(&device_buffer_src));
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_memory_pool_allocate(device_pool, kMemoryAllocSize * sizeof(int), 0, reinterpret_cast<void**>(&device_buffer_dst));
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// - Allow GPU access to host memory
err = hsa_amd_agents_allow_access(1, &gpus[0], NULL, host_buffer_src);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_agents_allow_access(1, &gpus[0], NULL, host_buffer_dst);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_agents_allow_access(1, &gpus[0], NULL, device_buffer_src);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_agents_allow_access(1, &gpus[0], NULL, device_buffer_dst);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_agents_allow_access(1, &cpus[0], NULL, device_buffer_src);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_agents_allow_access(1, &cpus[0], NULL, device_buffer_dst);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// - Create new signals
hsa_signal_t signal_shader_start = {0}; // start signal
err = hsa_signal_create(1, 0, NULL, &signal_shader_start);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
hsa_signal_t signal_shader_end = {0}; // completion signal
err = hsa_signal_create(1, 0, NULL, &signal_shader_end);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
hsa_signal_t signal_test_end = {0}; // test end signal
err = hsa_signal_create(1, 0, NULL, &signal_test_end);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// - handle kernel args
args* k_Args = NULL;
// Find a memory pool that supports kernel args
hsa_amd_memory_pool_t kernarg_pool;
err = hsa_amd_agent_iterate_memory_pools(cpus[0],
rocrtst::GetKernArgMemoryPool,
&kernarg_pool);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Allocate the kernel argument buffer from the kernarg_pool.
err = hsa_amd_memory_pool_allocate(kernarg_pool, sizeof(args_t), 0,
reinterpret_cast<void **>(&k_Args));
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_agents_allow_access(1, &gpus[0], NULL, k_Args);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// vector_copy kernel args
k_Args->a = device_buffer_src;
k_Args->b = device_buffer_dst;
k_Args->c = nullptr;
// - Place a Barrier-Value packet into the queue
// that waits on signal-shader-start to change from 1 to 0.
hsa_barrier_and_packet_t barrier_pkt;
memset(&barrier_pkt, 0, sizeof(barrier_pkt));
barrier_pkt.header = HSA_PACKET_TYPE_BARRIER_AND;
barrier_pkt.completion_signal = {0};
// Set signal dependency
barrier_pkt.dep_signal[0] = signal_shader_start;
for (int i = 1; i < 5; ++i) {
barrier_pkt.dep_signal[i] = {0};
}
uint64_t index = hsa_queue_load_write_index_relaxed(queue);
hsa_queue_store_write_index_relaxed(queue, index + 1);
reinterpret_cast<hsa_barrier_and_packet_t*>(queue->base_address)[index % queue->size] = barrier_pkt;
// - Place Dispatch packet into the queue
hsa_kernel_dispatch_packet_t dispatch_pkt;
memset(&dispatch_pkt, 0, sizeof(dispatch_pkt));
// - create kernel object for dispatch
set_kernel_file_name("vector_copy_kernels.hsaco");
set_kernel_name("vector_copy");
err = rocrtst::LoadKernelFromObjFile(this, &gpus[0]);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// initialize aql packet
dispatch_pkt.workgroup_size_x = 256;
dispatch_pkt.workgroup_size_y = 1;
dispatch_pkt.workgroup_size_z = 1;
dispatch_pkt.grid_size_x = kMemoryAllocSize;
dispatch_pkt.grid_size_y = 1;
dispatch_pkt.grid_size_z = 1;
dispatch_pkt.private_segment_size = 0;
dispatch_pkt.grid_size_x = kMemoryAllocSize;
dispatch_pkt.kernel_object = kernel_object(); // Assumes kernel loaded
dispatch_pkt.kernarg_address = k_Args;
dispatch_pkt.completion_signal = signal_shader_end;
const uint32_t queue_mask = queue->size - 1;
index = hsa_queue_load_write_index_relaxed(queue);
hsa_queue_store_write_index_relaxed(queue, index + 1);
rocrtst::WriteAQLToQueueLoc(queue, index, &dispatch_pkt);
hsa_kernel_dispatch_packet_t *q_base_addr =
reinterpret_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address);
rocrtst::AtomicSetPacketHeader(
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE),
(1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
reinterpret_cast<hsa_kernel_dispatch_packet_t *>
(&q_base_addr[index & queue_mask]));
err = hsa_amd_memory_async_copy(device_buffer_src, gpus[0], host_buffer_src, cpus[0],
kMemoryAllocSize * sizeof(int), 0, NULL, signal_shader_start);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
hsa_signal_t dep_signals[1] = { signal_shader_end };
err = hsa_amd_memory_async_copy(host_buffer_dst, cpus[0], device_buffer_dst, gpus[0],
kMemoryAllocSize * sizeof(int), 1, dep_signals, signal_test_end);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// - Ring the doorbell to start the queue
index = hsa_queue_load_write_index_relaxed(queue);
hsa_signal_store_relaxed(queue->doorbell_signal, index);
// - Wait for the signal-test-end to change from 1 to 0.
while (hsa_signal_wait_scacquire(signal_test_end,
HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1, HSA_WAIT_STATE_ACTIVE)) {}
// - Verify that host-buffer-src and host-buffer-dst have equal values.
for (int i = 0; i < kMemoryAllocSize; ++i) {
ASSERT_EQ(host_buffer_src[i], host_buffer_dst[i]);
}
//clean up allocated handles
if(host_buffer_src) { hsa_amd_memory_pool_free(host_buffer_src); }
if(host_buffer_dst) { hsa_amd_memory_pool_free(host_buffer_dst); }
if(device_buffer_src) { hsa_amd_memory_pool_free(device_buffer_src); }
if(device_buffer_dst) { hsa_amd_memory_pool_free(device_buffer_dst); }
if (signal_shader_start.handle) { hsa_signal_destroy(signal_shader_start); }
if (signal_shader_end.handle) { hsa_signal_destroy(signal_shader_end); }
if (signal_test_end.handle) { hsa_signal_destroy(signal_test_end); }
if (k_Args) { hsa_amd_memory_pool_free(k_Args); }
if (queue) { hsa_queue_destroy(queue); }
}
#undef RET_IF_HSA_ERR
+3
Просмотреть файл
@@ -79,6 +79,9 @@ class MemoryAccessTest : public TestBase {
// @Brief: This test verify that GPU is able to Read & write CPU memory
void GPUAccessToCPUMemoryTest(void);
// @Brief: This test verifies that memory accessed on the GPU after a copy is coherent
void MemoryAccessCoherentTest(void);
private:
void CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent,
+4
Просмотреть файл
@@ -376,6 +376,10 @@ set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
set(CL_FILE_LIST "${KERNELS_DIR}/gpuReadWrite_kernels.cl")
build_sample_for_devices("gpuReadWrite")
# Vector Copy
set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
set(CL_FILE_LIST "${KERNELS_DIR}/vector_copy_kernel.cl")
build_sample_for_devices("vector_copy")
# Vector Add Debug Trap
set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
+51
Просмотреть файл
@@ -0,0 +1,51 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2025, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
__kernel void vector_copy(__global const int * a,
__global int * b, __global int * c) {
int i = get_global_id(0);
// Reading the device buffer and writing to separate device buffer
b[i] = a[i]; // both a[i] and b[i] point to device memory.
}
+7
Просмотреть файл
@@ -139,6 +139,13 @@ TEST(rocrtstFunc, MemoryAccessTests) {
RunCustomTestEpilog(&mt);
}
TEST(rocrtstFunc, MemoryAccessCoherent) {
MemoryAccessTest mt;
RunCustomTestProlog(&mt);
mt.MemoryAccessCoherentTest();
RunCustomTestEpilog(&mt);
}
TEST(rocrtstFunc, GroupMemoryAllocationTest) {
MemoryAllocationTest ma(true, false);
RunCustomTestProlog(&ma);