d4a51e4102
* Adding att v3 support * misc fix * bug fix * Python linting workflow and rules * fix regex * Adding temporary args * fix temporary args * fix format * remove att_perfcounters from test input * Review comments (#163) Co-authored-by: Giovanni Baraldi <gbaraldi@amd.com> * Revert "Review comments (#163)" This reverts commit 9ef0f8e5a4489d5581255e1b70ced2aef5c1c1d0. * Address review comments 2 * review changes * review comments * review * cmake alias * review * review * review * review * Enabling percounter in v3 script * review * formatting * formatting --------- Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com> Co-authored-by: Baraldi, Giovanni <Giovanni.Baraldi@amd.com> Co-authored-by: Giovanni Baraldi <gbaraldi@amd.com>
493 строки
18 KiB
C++
493 строки
18 KiB
C++
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

/** ROC Profiler Multi Queue Dependency Test
 *
 * The goal of this test is to ensure ROC profiler does not deadlock
 * when multiple queues are created and they depend on each other
 *
 */

#include "hsa_code_object_app.h"
/// Selects which loader `code_object_load` uses for the kernel code object.
enum class storage_type
{
    /// Load through `MQDependencyTest::load_code_object` ("<agent>_copy.hsaco").
    CODE_OBJECT_STORAGE_FILE,
    /// Load through `MQDependencyTest::load_code_object_memory`
    /// ("<agent>_copy_memory.hsaco").
    CODE_OBJECT_STORAGE_MEMORY
};

void
|
|
code_object_load(MQDependencyTest& obj,
|
|
storage_type type,
|
|
MQDependencyTest::CodeObject& code_object)
|
|
{
|
|
hsa_status_t status;
|
|
obj.device_discovery();
|
|
char agent_name[64];
|
|
status = hsa_agent_get_info(obj.gpu[0].agent, HSA_AGENT_INFO_NAME, agent_name);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
if(type == storage_type::CODE_OBJECT_STORAGE_FILE)
|
|
{
|
|
std::string hasco_file_path = std::string(agent_name) + std::string("_copy.hsaco");
|
|
obj.search_hasco(fs::current_path(), hasco_file_path);
|
|
if(!obj.load_code_object(hasco_file_path, obj.gpu[0].agent, code_object))
|
|
{
|
|
printf("Kernel file not found or not usable with given agent.\n");
|
|
abort();
|
|
}
|
|
}
|
|
else
|
|
{
|
|
std::string hasco_file_path = std::string(agent_name) + std::string("_copy_memory.hsaco");
|
|
obj.search_hasco(fs::current_path(), hasco_file_path);
|
|
if(!obj.load_code_object_memory(hasco_file_path, obj.gpu[0].agent, code_object))
|
|
{
|
|
abort();
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
 * Looks up a kernel by name in a loaded code object, for the first GPU agent
 * of `obj`.
 *
 * @param code_object  Code object to search.
 * @param kernel_name  Name of the kernel symbol to look up.
 * @param obj          Test fixture that performs the lookup.
 * @return The kernel description filled in by `obj.get_kernel()`.
 *
 * Prints a diagnostic and aborts the process if the lookup reports failure.
 */
MQDependencyTest::Kernel
get_kernel(MQDependencyTest::CodeObject& code_object,
           std::string                   kernel_name,
           MQDependencyTest&             obj)
{
    MQDependencyTest::Kernel kernel;

    const bool found = obj.get_kernel(code_object, kernel_name, obj.gpu[0].agent, kernel);
    if(!found)
    {
        printf("Test %s not found.\n", kernel_name.c_str());
        abort();
    }

    return kernel;
}

int
|
|
main()
|
|
{
|
|
hsa_status_t status;
|
|
MQDependencyTest obj;
|
|
MQDependencyTest obj_memory = {};
|
|
MQDependencyTest::CodeObject code_object = {}, code_object_memory = {};
|
|
|
|
code_object_load(obj, storage_type::CODE_OBJECT_STORAGE_FILE, code_object);
|
|
code_object_load(obj_memory, storage_type::CODE_OBJECT_STORAGE_MEMORY, code_object_memory);
|
|
|
|
MQDependencyTest::Kernel copyA = get_kernel(code_object, "copyA", obj);
|
|
MQDependencyTest::Kernel copyB = get_kernel(code_object, "copyB", obj);
|
|
MQDependencyTest::Kernel copyC = get_kernel(code_object, "copyC", obj);
|
|
|
|
MQDependencyTest::Kernel copyD = get_kernel(code_object_memory, "copyD", obj_memory);
|
|
MQDependencyTest::Kernel copyE = get_kernel(code_object_memory, "copyE", obj_memory);
|
|
MQDependencyTest::Kernel copyF = get_kernel(code_object_memory, "copyF", obj_memory);
|
|
|
|
struct args_t
|
|
{
|
|
uint32_t* a = nullptr;
|
|
uint32_t* b = nullptr;
|
|
MQDependencyTest::OCLHiddenArgs hidden = {};
|
|
};
|
|
|
|
args_t* args = static_cast<args_t*>(obj.hsa_malloc(sizeof(args_t), obj.kernarg));
|
|
*args = {};
|
|
|
|
uint32_t* a = static_cast<uint32_t*>(obj.hsa_malloc(64 * sizeof(uint32_t), obj.kernarg));
|
|
uint32_t* b = static_cast<uint32_t*>(obj.hsa_malloc(64 * sizeof(uint32_t), obj.kernarg));
|
|
|
|
memset(a, 0, 64 * sizeof(uint32_t));
|
|
memset(b, 1, 64 * sizeof(uint32_t));
|
|
|
|
args_t* args_memory =
|
|
static_cast<args_t*>(obj_memory.hsa_malloc(sizeof(args_t), obj_memory.kernarg));
|
|
*args_memory = {};
|
|
|
|
uint32_t* c =
|
|
static_cast<uint32_t*>(obj_memory.hsa_malloc(64 * sizeof(uint32_t), obj_memory.kernarg));
|
|
uint32_t* d =
|
|
static_cast<uint32_t*>(obj_memory.hsa_malloc(64 * sizeof(uint32_t), obj_memory.kernarg));
|
|
|
|
memset(c, 0, 64 * sizeof(uint32_t));
|
|
memset(d, 1, 64 * sizeof(uint32_t));
|
|
|
|
// Create queue in gpu agent and prepare a kernel dispatch packet
|
|
hsa_queue_t* queue1 = nullptr;
|
|
status = hsa_queue_create(obj.gpu[0].agent,
|
|
1024,
|
|
HSA_QUEUE_TYPE_SINGLE,
|
|
nullptr,
|
|
nullptr,
|
|
UINT32_MAX,
|
|
UINT32_MAX,
|
|
&queue1);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
// Create a signal with a value of 1 and attach it to the first kernel
|
|
// dispatch packet
|
|
hsa_signal_t completion_signal_1 = {};
|
|
status = hsa_signal_create(1, 0, nullptr, &completion_signal_1);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
// First dispath packet on queue 1, Kernel A
|
|
{
|
|
MQDependencyTest::Aql packet{};
|
|
packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
|
packet.header.barrier = 1;
|
|
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
|
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
|
|
|
packet.dispatch.setup = 1;
|
|
packet.dispatch.workgroup_size_x = 64;
|
|
packet.dispatch.workgroup_size_y = 1;
|
|
packet.dispatch.workgroup_size_z = 1;
|
|
packet.dispatch.grid_size_x = 64;
|
|
packet.dispatch.grid_size_y = 1;
|
|
packet.dispatch.grid_size_z = 1;
|
|
|
|
packet.dispatch.group_segment_size = copyA.group;
|
|
packet.dispatch.private_segment_size = copyA.scratch;
|
|
packet.dispatch.kernel_object = copyA.handle;
|
|
|
|
packet.dispatch.kernarg_address = args;
|
|
packet.dispatch.completion_signal = completion_signal_1;
|
|
|
|
args->a = a;
|
|
args->b = b;
|
|
// Tell packet processor of A to launch the first kernel dispatch packet
|
|
obj.submit_packet(queue1, packet);
|
|
}
|
|
|
|
// Create a signal with a value of 1 and attach it to the second kernel
|
|
// dispatch packet
|
|
hsa_signal_t completion_signal_2 = {};
|
|
status = hsa_signal_create(1, 0, nullptr, &completion_signal_2);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
hsa_signal_t completion_signal_3 = {};
|
|
status = hsa_signal_create(1, 0, nullptr, &completion_signal_3);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
// Create barrier-AND packet that is enqueued in queue 1
|
|
{
|
|
MQDependencyTest::Aql packet{};
|
|
packet.header.type = HSA_PACKET_TYPE_BARRIER_AND;
|
|
packet.header.barrier = 1;
|
|
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
|
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
|
|
|
packet.barrier_and.dep_signal[0] = completion_signal_2;
|
|
obj.submit_packet(queue1, packet);
|
|
}
|
|
|
|
// Second dispath packet on queue 1, Kernel C
|
|
{
|
|
MQDependencyTest::Aql packet{};
|
|
packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
|
packet.header.barrier = 1;
|
|
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
|
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
|
|
|
packet.dispatch.setup = 1;
|
|
packet.dispatch.workgroup_size_x = 64;
|
|
packet.dispatch.workgroup_size_y = 1;
|
|
packet.dispatch.workgroup_size_z = 1;
|
|
packet.dispatch.grid_size_x = 64;
|
|
packet.dispatch.grid_size_y = 1;
|
|
packet.dispatch.grid_size_z = 1;
|
|
|
|
packet.dispatch.group_segment_size = copyC.group;
|
|
packet.dispatch.private_segment_size = copyC.scratch;
|
|
packet.dispatch.kernel_object = copyC.handle;
|
|
packet.dispatch.completion_signal = completion_signal_3;
|
|
packet.dispatch.kernarg_address = args;
|
|
|
|
args->a = a;
|
|
args->b = b;
|
|
// Tell packet processor to launch the second kernel dispatch packet
|
|
obj.submit_packet(queue1, packet);
|
|
}
|
|
|
|
// Create queue 2
|
|
hsa_queue_t* queue2 = nullptr;
|
|
status = hsa_queue_create(obj.gpu[0].agent,
|
|
1024,
|
|
HSA_QUEUE_TYPE_SINGLE,
|
|
nullptr,
|
|
nullptr,
|
|
UINT32_MAX,
|
|
UINT32_MAX,
|
|
&queue2);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
// Create barrier-AND packet that is enqueued in queue 2
|
|
{
|
|
MQDependencyTest::Aql packet{};
|
|
packet.header.type = HSA_PACKET_TYPE_BARRIER_AND;
|
|
packet.header.barrier = 1;
|
|
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
|
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
|
|
|
packet.barrier_and.dep_signal[0] = completion_signal_1;
|
|
obj.submit_packet(queue2, packet);
|
|
}
|
|
|
|
// Third dispath packet on queue 2, Kernel B
|
|
{
|
|
MQDependencyTest::Aql packet{};
|
|
packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
|
packet.header.barrier = 1;
|
|
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
|
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
|
|
|
packet.dispatch.setup = 1;
|
|
packet.dispatch.workgroup_size_x = 64;
|
|
packet.dispatch.workgroup_size_y = 1;
|
|
packet.dispatch.workgroup_size_z = 1;
|
|
packet.dispatch.grid_size_x = 64;
|
|
packet.dispatch.grid_size_y = 1;
|
|
packet.dispatch.grid_size_z = 1;
|
|
|
|
packet.dispatch.group_segment_size = copyB.group;
|
|
packet.dispatch.private_segment_size = copyB.scratch;
|
|
packet.dispatch.kernel_object = copyB.handle;
|
|
|
|
packet.dispatch.kernarg_address = args;
|
|
packet.dispatch.completion_signal = completion_signal_2;
|
|
|
|
args->a = a;
|
|
args->b = b;
|
|
// Tell packet processor to launch the third kernel dispatch packet
|
|
obj.submit_packet(queue2, packet);
|
|
}
|
|
// Create a signal with a value of 1 and attach it to the first kernel
|
|
// dispatch packet
|
|
hsa_signal_t completion_signal_4 = {};
|
|
status = hsa_signal_create(1, 0, nullptr, &completion_signal_4);
|
|
RET_IF_HSA_ERR(status)
|
|
// First dispath packet on queue 1, Kernel D
|
|
{
|
|
[[maybe_unused]] MQDependencyTest::Aql packet{};
|
|
packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
|
packet.header.barrier = 1;
|
|
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
|
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
|
|
|
packet.dispatch.setup = 1;
|
|
packet.dispatch.workgroup_size_x = 64;
|
|
packet.dispatch.workgroup_size_y = 1;
|
|
packet.dispatch.workgroup_size_z = 1;
|
|
packet.dispatch.grid_size_x = 64;
|
|
packet.dispatch.grid_size_y = 1;
|
|
packet.dispatch.grid_size_z = 1;
|
|
|
|
packet.dispatch.group_segment_size = copyD.group;
|
|
packet.dispatch.private_segment_size = copyD.scratch;
|
|
packet.dispatch.kernel_object = copyD.handle;
|
|
|
|
packet.dispatch.kernarg_address = args_memory;
|
|
packet.dispatch.completion_signal = completion_signal_4;
|
|
|
|
args_memory->a = c;
|
|
args_memory->b = d;
|
|
// Tell packet processor of A to launch the first kernel dispatch packet
|
|
obj_memory.submit_packet(queue1, packet);
|
|
}
|
|
|
|
// Create a signal with a value of 1 and attach it to the second kernel
|
|
// dispatch packet
|
|
hsa_signal_t completion_signal_5 = {};
|
|
status = hsa_signal_create(1, 0, nullptr, &completion_signal_5);
|
|
RET_IF_HSA_ERR(status)
|
|
hsa_signal_t completion_signal_6 = {};
|
|
status = hsa_signal_create(1, 0, nullptr, &completion_signal_6);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
// Create barrier-AND packet that is enqueued in queue 1
|
|
{
|
|
MQDependencyTest::Aql packet{};
|
|
packet.header.type = HSA_PACKET_TYPE_BARRIER_AND;
|
|
packet.header.barrier = 1;
|
|
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
|
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
|
|
|
packet.barrier_and.dep_signal[0] = completion_signal_5;
|
|
obj_memory.submit_packet(queue1, packet);
|
|
}
|
|
// Second dispath packet on queue 1, Kernel F
|
|
{
|
|
MQDependencyTest::Aql packet{};
|
|
packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
|
packet.header.barrier = 1;
|
|
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
|
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
|
|
|
packet.dispatch.setup = 1;
|
|
packet.dispatch.workgroup_size_x = 64;
|
|
packet.dispatch.workgroup_size_y = 1;
|
|
packet.dispatch.workgroup_size_z = 1;
|
|
packet.dispatch.grid_size_x = 64;
|
|
packet.dispatch.grid_size_y = 1;
|
|
packet.dispatch.grid_size_z = 1;
|
|
|
|
packet.dispatch.group_segment_size = copyF.group;
|
|
packet.dispatch.private_segment_size = copyF.scratch;
|
|
packet.dispatch.kernel_object = copyF.handle;
|
|
packet.dispatch.completion_signal = completion_signal_6;
|
|
packet.dispatch.kernarg_address = args_memory;
|
|
|
|
args_memory->a = c;
|
|
args_memory->b = d;
|
|
// Tell packet processor to launch the second kernel dispatch packet
|
|
obj_memory.submit_packet(queue1, packet);
|
|
}
|
|
// Create barrier-AND packet that is enqueued in queue 2
|
|
{
|
|
MQDependencyTest::Aql packet{};
|
|
packet.header.type = HSA_PACKET_TYPE_BARRIER_AND;
|
|
packet.header.barrier = 1;
|
|
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
|
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
|
|
|
packet.barrier_and.dep_signal[0] = completion_signal_4;
|
|
obj_memory.submit_packet(queue2, packet);
|
|
}
|
|
// Third dispath packet on queue 2, Kernel
|
|
{
|
|
MQDependencyTest::Aql packet{};
|
|
packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
|
packet.header.barrier = 1;
|
|
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
|
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
|
|
|
packet.dispatch.setup = 1;
|
|
packet.dispatch.workgroup_size_x = 64;
|
|
packet.dispatch.workgroup_size_y = 1;
|
|
packet.dispatch.workgroup_size_z = 1;
|
|
packet.dispatch.grid_size_x = 64;
|
|
packet.dispatch.grid_size_y = 1;
|
|
packet.dispatch.grid_size_z = 1;
|
|
|
|
packet.dispatch.group_segment_size = copyE.group;
|
|
packet.dispatch.private_segment_size = copyE.scratch;
|
|
packet.dispatch.kernel_object = copyE.handle;
|
|
|
|
packet.dispatch.kernarg_address = args_memory;
|
|
packet.dispatch.completion_signal = completion_signal_5;
|
|
|
|
args_memory->a = c;
|
|
args_memory->b = d;
|
|
// Tell packet processor to launch the third kernel dispatch packet
|
|
obj_memory.submit_packet(queue2, packet);
|
|
}
|
|
// Wait on the completion signal
|
|
hsa_signal_wait_relaxed(
|
|
completion_signal_1, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
|
|
|
// Wait on the completion signal
|
|
hsa_signal_wait_relaxed(
|
|
completion_signal_2, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
|
|
|
// Wait on the completion signal
|
|
hsa_signal_wait_relaxed(
|
|
completion_signal_3, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
|
|
|
// Wait on the completion signal
|
|
hsa_signal_wait_relaxed(
|
|
completion_signal_4, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
|
|
|
// Wait on the completion signal
|
|
hsa_signal_wait_relaxed(
|
|
completion_signal_5, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
|
|
|
// Wait on the completion signal
|
|
hsa_signal_wait_relaxed(
|
|
completion_signal_6, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
|
|
|
for(int i = 0; i < 64; i++)
|
|
{
|
|
if(a[i] != b[i])
|
|
{
|
|
printf("error at %d: expected %d, got %d\n", i, b[i], a[i]);
|
|
abort();
|
|
}
|
|
}
|
|
|
|
// Clearing data structures and memory
|
|
status = hsa_signal_destroy(completion_signal_1);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
status = hsa_signal_destroy(completion_signal_2);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
status = hsa_signal_destroy(completion_signal_3);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
// Clearing data structures and memory
|
|
status = hsa_signal_destroy(completion_signal_4);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
status = hsa_signal_destroy(completion_signal_5);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
status = hsa_signal_destroy(completion_signal_6);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
if(queue1 != nullptr)
|
|
{
|
|
status = hsa_queue_destroy(queue1);
|
|
RET_IF_HSA_ERR(status)
|
|
}
|
|
|
|
if(queue2 != nullptr)
|
|
{
|
|
status = hsa_queue_destroy(queue2);
|
|
RET_IF_HSA_ERR(status)
|
|
}
|
|
|
|
status = hsa_memory_free(a);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
status = hsa_memory_free(b);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
status = hsa_memory_free(c);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
status = hsa_memory_free(d);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
status = hsa_executable_destroy(code_object.executable);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
status = hsa_code_object_reader_destroy(code_object.code_obj_rdr);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
status = hsa_executable_destroy(code_object_memory.executable);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
status = hsa_code_object_reader_destroy(code_object_memory.code_obj_rdr);
|
|
RET_IF_HSA_ERR(status)
|
|
|
|
close(code_object.file);
|
|
|
|
close(code_object_memory.file);
|
|
}
|