From b5e1645a1476ea48ceeb601076ccb0714cc2948b Mon Sep 17 00:00:00 2001 From: "Welton, Benjamin" Date: Tue, 5 Aug 2025 17:29:07 -0700 Subject: [PATCH] Fix hsa_code_object_app test deadlock with profiler serialization (#577) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problem with original test: - Created circular dependencies between queues: * Queue1: Kernel A → Barrier(waits for signal_2) → Kernel C * Queue2: Barrier(waits for signal_1) → Kernel B → sets signal_2 - With strict "one kernel at a time" serialization, this created deadlock: * Queue1 executed Kernel A, then blocked on barrier waiting for signal_2 * Serializer switched to Queue2, but Queue2 was blocked waiting for signal_1 * Neither queue could proceed: Queue1 needed Queue2's Kernel B to complete, but Queue2 couldn't start until Queue1 finished completely - Test would hang indefinitely at hsa_signal_wait_relaxed() for signal_2 Solution implemented: - Reordered packet submission to eliminate circular dependencies - Ensured signal producers execute before consumers need them: * Kernel A produces signal_1 before Queue2's barrier needs it * Kernel B produces signal_2 before Queue1's continuation needs it - Dependencies now flow forward without cycles, allowing serializer progress Refactoring changes: - Extract common functionality into helper functions: * create_completion_signal() for signal creation * create_queue() for queue creation * submit_kernel_packet() for kernel dispatch packets * submit_barrier_packet() for barrier packets - Add comprehensive documentation explaining expected execution pattern - Simplify main() function making the dependency flow more readable Co-authored-by: Benjamin Welton --- .../hsa-code-object/hsa_code_object_app.cpp | 379 ++++++------------ 1 file changed, 125 insertions(+), 254 deletions(-) diff --git a/tests/bin/hsa-code-object/hsa_code_object_app.cpp b/tests/bin/hsa-code-object/hsa_code_object_app.cpp index 19ce108a50..a7645132fd 100644 --- a/tests/bin/hsa-code-object/hsa_code_object_app.cpp +++ b/tests/bin/hsa-code-object/hsa_code_object_app.cpp @@ -81,6 +81,93 @@ get_kernel(MQDependencyTest::CodeObject& code_object, return copy; } +hsa_signal_t +create_completion_signal() +{ + hsa_signal_t signal = {}; + hsa_status_t status = hsa_signal_create(1, 0, nullptr, &signal); + RET_IF_HSA_ERR(status) + return signal; +} + +hsa_queue_t* +create_queue(hsa_agent_t agent) +{ + hsa_queue_t* queue = nullptr; + hsa_status_t status = hsa_queue_create( + agent, 1024, HSA_QUEUE_TYPE_SINGLE, nullptr, nullptr, UINT32_MAX, UINT32_MAX, &queue); + RET_IF_HSA_ERR(status) + return queue; +} + +void +submit_kernel_packet(MQDependencyTest& obj, + hsa_queue_t* queue, + const MQDependencyTest::Kernel& kernel, + void* args, + hsa_signal_t completion_signal) +{ + MQDependencyTest::Aql packet{}; + packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH; + packet.header.barrier = 1; + packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; + packet.header.release = HSA_FENCE_SCOPE_SYSTEM; + + packet.dispatch.setup = 1; + packet.dispatch.workgroup_size_x = 64; + packet.dispatch.workgroup_size_y = 1; + packet.dispatch.workgroup_size_z = 1; + packet.dispatch.grid_size_x = 64; + packet.dispatch.grid_size_y = 1; + packet.dispatch.grid_size_z = 1; + + packet.dispatch.group_segment_size = kernel.group; + packet.dispatch.private_segment_size = kernel.scratch; + packet.dispatch.kernel_object = kernel.handle; + packet.dispatch.kernarg_address = args; + packet.dispatch.completion_signal = completion_signal; + + obj.submit_packet(queue, packet); +} + +void +submit_barrier_packet(MQDependencyTest& obj, hsa_queue_t* queue, hsa_signal_t dependency_signal) +{ + MQDependencyTest::Aql packet{}; + packet.header.type = HSA_PACKET_TYPE_BARRIER_AND; + packet.header.barrier = 1; + packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; + packet.header.release = HSA_FENCE_SCOPE_SYSTEM; + + packet.barrier_and.dep_signal[0] = dependency_signal; + obj.submit_packet(queue, packet); +} + +/** + * Expected Execution Pattern with Serialization: + * + * This test validates that the profiler's serialization mechanism can handle + * inter-queue dependencies without deadlock. The execution should follow this pattern: + * + * Phase 1: + * Queue1: Kernel A executes → sets signal_1 = 0 + * Queue1: Barrier blocks (waiting for signal_2) + * [Serializer switches to Queue2] + * Queue2: Barrier proceeds (signal_1 = 0) → Kernel B executes → sets signal_2 = 0 + * [Serializer switches back to Queue1] + * Queue1: Barrier proceeds (signal_2 = 0) → Kernel C executes + * + * Phase 2: + * Queue1: Kernel D executes → sets signal_4 = 0 + * Queue1: Barrier blocks (waiting for signal_5) + * [Serializer switches to Queue2] + * Queue2: Barrier proceeds (signal_4 = 0) → Kernel E executes → sets signal_5 = 0 + * [Serializer switches back to Queue1] + * Queue1: Barrier proceeds (signal_5 = 0) → Kernel F executes + * + * Key: Dependencies flow forward without cycles, allowing the serializer to make + * progress by switching between queues when one blocks on a barrier. + */ int main() { @@ -128,275 +215,59 @@ main() memset(c, 0, 64 * sizeof(uint32_t)); memset(d, 1, 64 * sizeof(uint32_t)); - // Create queue in gpu agent and prepare a kernel dispatch packet - hsa_queue_t* queue1 = nullptr; - status = hsa_queue_create(obj.gpu[0].agent, - 1024, - HSA_QUEUE_TYPE_SINGLE, - nullptr, - nullptr, - UINT32_MAX, - UINT32_MAX, - &queue1); - RET_IF_HSA_ERR(status) + // Create queues + hsa_queue_t* queue1 = create_queue(obj.gpu[0].agent); + hsa_queue_t* queue2 = create_queue(obj.gpu[0].agent); - // Create a signal with a value of 1 and attach it to the first kernel - // dispatch packet - hsa_signal_t completion_signal_1 = {}; - status = hsa_signal_create(1, 0, nullptr, &completion_signal_1); - RET_IF_HSA_ERR(status) + // Create completion signals + hsa_signal_t completion_signal_1 = create_completion_signal(); - // First dispath packet on queue 1, Kernel A - { - MQDependencyTest::Aql packet{}; - packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH; - packet.header.barrier = 1; - packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; - packet.header.release = HSA_FENCE_SCOPE_SYSTEM; + // Set up arguments for first batch + args->a = a; + args->b = b; - packet.dispatch.setup = 1; - packet.dispatch.workgroup_size_x = 64; - packet.dispatch.workgroup_size_y = 1; - packet.dispatch.workgroup_size_z = 1; - packet.dispatch.grid_size_x = 64; - packet.dispatch.grid_size_y = 1; - packet.dispatch.grid_size_z = 1; + // Create more completion signals + hsa_signal_t completion_signal_2 = create_completion_signal(); + hsa_signal_t completion_signal_3 = create_completion_signal(); - packet.dispatch.group_segment_size = copyA.group; - packet.dispatch.private_segment_size = copyA.scratch; - packet.dispatch.kernel_object = copyA.handle; + // First dispatch packet on queue 1, Kernel A + submit_kernel_packet(obj, queue1, copyA, args, completion_signal_1); - packet.dispatch.kernarg_address = args; - packet.dispatch.completion_signal = completion_signal_1; + // Barrier on queue 1 waiting for signal_2 (from queue2's Kernel B) + submit_barrier_packet(obj, queue1, completion_signal_2); - args->a = a; - args->b = b; - // Tell packet processor of A to launch the first kernel dispatch packet - obj.submit_packet(queue1, packet); - } + // Barrier on queue 2 waiting for signal_1 (from queue1's Kernel A) + submit_barrier_packet(obj, queue2, completion_signal_1); - // Create a signal with a value of 1 and attach it to the second kernel - // dispatch packet - hsa_signal_t completion_signal_2 = {}; - status = hsa_signal_create(1, 0, nullptr, &completion_signal_2); - RET_IF_HSA_ERR(status) + // Kernel B on queue 2 (waits for barrier above) + submit_kernel_packet(obj, queue2, copyB, args, completion_signal_2); - hsa_signal_t completion_signal_3 = {}; - status = hsa_signal_create(1, 0, nullptr, &completion_signal_3); - RET_IF_HSA_ERR(status) + // Second dispatch packet on queue 1, Kernel C (waits for barrier above) + submit_kernel_packet(obj, queue1, copyC, args, completion_signal_3); - // Create barrier-AND packet that is enqueued in queue 1 - { - MQDependencyTest::Aql packet{}; - packet.header.type = HSA_PACKET_TYPE_BARRIER_AND; - packet.header.barrier = 1; - packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; - packet.header.release = HSA_FENCE_SCOPE_SYSTEM; + // Set up arguments for second batch + args_memory->a = c; + args_memory->b = d; - packet.barrier_and.dep_signal[0] = completion_signal_2; - obj.submit_packet(queue1, packet); - } + // Create signals for second batch + hsa_signal_t completion_signal_4 = create_completion_signal(); + hsa_signal_t completion_signal_5 = create_completion_signal(); + hsa_signal_t completion_signal_6 = create_completion_signal(); + // Second batch: Kernel D on queue 1 + submit_kernel_packet(obj_memory, queue1, copyD, args_memory, completion_signal_4); - // Second dispath packet on queue 1, Kernel C - { - MQDependencyTest::Aql packet{}; - packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH; - packet.header.barrier = 1; - packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; - packet.header.release = HSA_FENCE_SCOPE_SYSTEM; + // Barrier on queue 1 waiting for signal_5 (from queue2's Kernel E) + submit_barrier_packet(obj_memory, queue1, completion_signal_5); - packet.dispatch.setup = 1; - packet.dispatch.workgroup_size_x = 64; - packet.dispatch.workgroup_size_y = 1; - packet.dispatch.workgroup_size_z = 1; - packet.dispatch.grid_size_x = 64; - packet.dispatch.grid_size_y = 1; - packet.dispatch.grid_size_z = 1; + // Barrier on queue 2 waiting for signal_4 (from queue1's Kernel D) + submit_barrier_packet(obj_memory, queue2, completion_signal_4); - packet.dispatch.group_segment_size = copyC.group; - packet.dispatch.private_segment_size = copyC.scratch; - packet.dispatch.kernel_object = copyC.handle; - packet.dispatch.completion_signal = completion_signal_3; - packet.dispatch.kernarg_address = args; + // Kernel E on queue 2 (waits for barrier above) + submit_kernel_packet(obj_memory, queue2, copyE, args_memory, completion_signal_5); - args->a = a; - args->b = b; - // Tell packet processor to launch the second kernel dispatch packet - obj.submit_packet(queue1, packet); - } + // Kernel F on queue 1 (waits for barrier above) + submit_kernel_packet(obj_memory, queue1, copyF, args_memory, completion_signal_6); - // Create queue 2 - hsa_queue_t* queue2 = nullptr; - status = hsa_queue_create(obj.gpu[0].agent, - 1024, - HSA_QUEUE_TYPE_SINGLE, - nullptr, - nullptr, - UINT32_MAX, - UINT32_MAX, - &queue2); - RET_IF_HSA_ERR(status) - - // Create barrier-AND packet that is enqueued in queue 2 - { - MQDependencyTest::Aql packet{}; - packet.header.type = HSA_PACKET_TYPE_BARRIER_AND; - packet.header.barrier = 1; - packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; - packet.header.release = HSA_FENCE_SCOPE_SYSTEM; - - packet.barrier_and.dep_signal[0] = completion_signal_1; - obj.submit_packet(queue2, packet); - } - - // Third dispath packet on queue 2, Kernel B - { - MQDependencyTest::Aql packet{}; - packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH; - packet.header.barrier = 1; - packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; - packet.header.release = HSA_FENCE_SCOPE_SYSTEM; - - packet.dispatch.setup = 1; - packet.dispatch.workgroup_size_x = 64; - packet.dispatch.workgroup_size_y = 1; - packet.dispatch.workgroup_size_z = 1; - packet.dispatch.grid_size_x = 64; - packet.dispatch.grid_size_y = 1; - packet.dispatch.grid_size_z = 1; - - packet.dispatch.group_segment_size = copyB.group; - packet.dispatch.private_segment_size = copyB.scratch; - packet.dispatch.kernel_object = copyB.handle; - - packet.dispatch.kernarg_address = args; - packet.dispatch.completion_signal = completion_signal_2; - - args->a = a; - args->b = b; - // Tell packet processor to launch the third kernel dispatch packet - obj.submit_packet(queue2, packet); - } - // Create a signal with a value of 1 and attach it to the first kernel - // dispatch packet - hsa_signal_t completion_signal_4 = {}; - status = hsa_signal_create(1, 0, nullptr, &completion_signal_4); - RET_IF_HSA_ERR(status) - // First dispath packet on queue 1, Kernel D - { - [[maybe_unused]] MQDependencyTest::Aql packet{}; - packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH; - packet.header.barrier = 1; - packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; - packet.header.release = HSA_FENCE_SCOPE_SYSTEM; - - packet.dispatch.setup = 1; - packet.dispatch.workgroup_size_x = 64; - packet.dispatch.workgroup_size_y = 1; - packet.dispatch.workgroup_size_z = 1; - packet.dispatch.grid_size_x = 64; - packet.dispatch.grid_size_y = 1; - packet.dispatch.grid_size_z = 1; - - packet.dispatch.group_segment_size = copyD.group; - packet.dispatch.private_segment_size = copyD.scratch; - packet.dispatch.kernel_object = copyD.handle; - - packet.dispatch.kernarg_address = args_memory; - packet.dispatch.completion_signal = completion_signal_4; - - args_memory->a = c; - args_memory->b = d; - // Tell packet processor of A to launch the first kernel dispatch packet - obj_memory.submit_packet(queue1, packet); - } - - // Create a signal with a value of 1 and attach it to the second kernel - // dispatch packet - hsa_signal_t completion_signal_5 = {}; - status = hsa_signal_create(1, 0, nullptr, &completion_signal_5); - RET_IF_HSA_ERR(status) - hsa_signal_t completion_signal_6 = {}; - status = hsa_signal_create(1, 0, nullptr, &completion_signal_6); - RET_IF_HSA_ERR(status) - - // Create barrier-AND packet that is enqueued in queue 1 - { - MQDependencyTest::Aql packet{}; - packet.header.type = HSA_PACKET_TYPE_BARRIER_AND; - packet.header.barrier = 1; - packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; - packet.header.release = HSA_FENCE_SCOPE_SYSTEM; - - packet.barrier_and.dep_signal[0] = completion_signal_5; - obj_memory.submit_packet(queue1, packet); - } - // Second dispath packet on queue 1, Kernel F - { - MQDependencyTest::Aql packet{}; - packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH; - packet.header.barrier = 1; - packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; - packet.header.release = HSA_FENCE_SCOPE_SYSTEM; - - packet.dispatch.setup = 1; - packet.dispatch.workgroup_size_x = 64; - packet.dispatch.workgroup_size_y = 1; - packet.dispatch.workgroup_size_z = 1; - packet.dispatch.grid_size_x = 64; - packet.dispatch.grid_size_y = 1; - packet.dispatch.grid_size_z = 1; - - packet.dispatch.group_segment_size = copyF.group; - packet.dispatch.private_segment_size = copyF.scratch; - packet.dispatch.kernel_object = copyF.handle; - packet.dispatch.completion_signal = completion_signal_6; - packet.dispatch.kernarg_address = args_memory; - - args_memory->a = c; - args_memory->b = d; - // Tell packet processor to launch the second kernel dispatch packet - obj_memory.submit_packet(queue1, packet); - } - // Create barrier-AND packet that is enqueued in queue 2 - { - MQDependencyTest::Aql packet{}; - packet.header.type = HSA_PACKET_TYPE_BARRIER_AND; - packet.header.barrier = 1; - packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; - packet.header.release = HSA_FENCE_SCOPE_SYSTEM; - - packet.barrier_and.dep_signal[0] = completion_signal_4; - obj_memory.submit_packet(queue2, packet); - } - // Third dispath packet on queue 2, Kernel - { - MQDependencyTest::Aql packet{}; - packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH; - packet.header.barrier = 1; - packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM; - packet.header.release = HSA_FENCE_SCOPE_SYSTEM; - - packet.dispatch.setup = 1; - packet.dispatch.workgroup_size_x = 64; - packet.dispatch.workgroup_size_y = 1; - packet.dispatch.workgroup_size_z = 1; - packet.dispatch.grid_size_x = 64; - packet.dispatch.grid_size_y = 1; - packet.dispatch.grid_size_z = 1; - - packet.dispatch.group_segment_size = copyE.group; - packet.dispatch.private_segment_size = copyE.scratch; - packet.dispatch.kernel_object = copyE.handle; - - packet.dispatch.kernarg_address = args_memory; - packet.dispatch.completion_signal = completion_signal_5; - - args_memory->a = c; - args_memory->b = d; - // Tell packet processor to launch the third kernel dispatch packet - obj_memory.submit_packet(queue2, packet); - } // Wait on the completion signal hsa_signal_wait_relaxed( completion_signal_1, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);