Add default values for kernel struct (#798)

* Add default values for kernel struct

* Update hsa-queue-dependency app

- default initializers
- check HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED for memory pools
- clang-tidy fixes (member -> static, etc.)

* Update run-ci.py

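- add --progress --output-on-failure -V when ctest arguments are given but none of them control verbosity (with no ctest arguments only --output-on-failure -V is added; nothing is added with --quiet)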
- improve the ability to control the stages

---------

Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
This commit is contained in:
Benjamin Welton
2024-04-18 19:07:20 -07:00
committed by GitHub
parent c668028781
commit 29bc84ec0c
4 changed files with 130 additions and 79 deletions
+44 -14
View File
@@ -195,6 +195,8 @@ def generate_dashboard_script(args):
MEMCHECK = 0
_script = f"""
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
macro(dashboard_submit)
if("{SUBMIT}" GREATER 0)
ctest_submit({ARGN})
@@ -214,7 +216,10 @@ def generate_dashboard_script(args):
endmacro()
"""
STAGES = ";".join([itr.upper() for itr in args.stages])
_script += f"""
set(STAGES "{STAGES}")
ctest_start({DASHBOARD_MODE})
ctest_update(SOURCE "{SOURCE_DIR}" RETURN_VALUE _update_ret
CAPTURE_CMAKE_ERROR _update_err)
@@ -227,20 +232,24 @@ def generate_dashboard_script(args):
handle_error("Configure" _configure_ret)
ctest_build(BUILD "{BINARY_DIR}" RETURN_VALUE _build_ret)
dashboard_submit(PARTS Build RETURN_VALUE _submit_ret)
if("BUILD" IN_LIST STAGES)
ctest_build(BUILD "{BINARY_DIR}" RETURN_VALUE _build_ret)
dashboard_submit(PARTS Build RETURN_VALUE _submit_ret)
handle_error("Build" _build_ret)
if("{MEMCHECK}" GREATER 0)
ctest_memcheck(BUILD "{BINARY_DIR}" RETURN_VALUE _test_ret)
dashboard_submit(PARTS Test RETURN_VALUE _submit_ret)
else()
ctest_test(BUILD "{BINARY_DIR}" RETURN_VALUE _test_ret)
dashboard_submit(PARTS Test RETURN_VALUE _submit_ret)
handle_error("Build" _build_ret)
endif()
if("{CODECOV}" GREATER 0)
if("TEST" IN_LIST STAGES)
if("{MEMCHECK}" GREATER 0)
ctest_memcheck(BUILD "{BINARY_DIR}" RETURN_VALUE _test_ret)
dashboard_submit(PARTS Test RETURN_VALUE _submit_ret)
else()
ctest_test(BUILD "{BINARY_DIR}" RETURN_VALUE _test_ret)
dashboard_submit(PARTS Test RETURN_VALUE _submit_ret)
endif()
endif()
if("{CODECOV}" GREATER 0 AND "COVERAGE" IN_LIST STAGES)
ctest_coverage(
BUILD "{BINARY_DIR}"
RETURN_VALUE _coverage_ret
@@ -493,20 +502,39 @@ if __name__ == "__main__":
dashboard_args.append(f"{args.mode}{itr}")
try:
verbose_options = (
"--progress",
"-V",
"-VV",
"--debug",
"--output-on-failure",
"-Q",
"--quiet",
)
if not args.quiet and len(ctest_args) == 0:
ctest_args = ["--output-on-failure", "-V"]
elif not args.quiet:
opts_union = [x for x in ctest_args if x in verbose_options]
if len(opts_union) == 0:
ctest_args += ["--progress", "--output-on-failure", "-V"]
# always fail if no tests exist
ctest_args += ["--no-tests=error"]
run(
run_args = (
[CTEST_CMD]
+ dashboard_args
+ [
"-S",
os.path.join(args.binary_dir, "dashboard.cmake"),
]
+ ctest_args,
+ ctest_args
)
print("CTest command: {}".format(" ".join(run_args)))
run(
run_args,
check=True,
)
finally:
@@ -522,7 +550,9 @@ if __name__ == "__main__":
):
if not os.path.isfile(file):
continue
if "CoverageLog-" in os.path.basename(file):
elif "CoverageLog-" in os.path.basename(file):
continue
elif "Test.xml" in os.path.basename(file):
continue
print(f"\n\n###### Reading {file}... ######\n\n")
with open(file, "r") as inpf:
@@ -43,7 +43,7 @@ foreach(target_id ${GPU_TARGETS})
# generate kernel bitcodes
generate_hsaco(${target_id} ${CMAKE_CURRENT_SOURCE_DIR}/copy.cl
${target_id}_copy.hsaco)
endforeach(target_id)
endforeach()
add_custom_target(generate_hsaco_targets DEPENDS ${HSACO_TARGET_LIST})
@@ -71,8 +71,6 @@ find_package(rocprofiler-sdk REQUIRED)
target_link_libraries(multiqueue_testapp PRIVATE rocprofiler::rocprofiler
rocprofiler::tests-common-library)
target_compile_definitions(multiqueue_testapp PUBLIC AMD_INTERNAL_BUILD=1)
find_package(
hsa-runtime64
REQUIRED
@@ -64,12 +64,14 @@ main()
printf("Test kernel A not found.\n");
abort();
}
MQDependencyTest::Kernel copyB;
if(!obj.get_kernel(code_object, "copyB", obj.gpu[0].agent, copyB))
{
printf("Test kernel B not found.\n");
abort();
}
MQDependencyTest::Kernel copyC;
if(!obj.get_kernel(code_object, "copyC", obj.gpu[0].agent, copyC))
{
@@ -79,14 +81,13 @@ main()
struct args_t
{
uint32_t* a;
uint32_t* b;
MQDependencyTest::OCLHiddenArgs hidden;
uint32_t* a = nullptr;
uint32_t* b = nullptr;
MQDependencyTest::OCLHiddenArgs hidden = {};
};
args_t* args;
args = static_cast<args_t*>(obj.hsa_malloc(sizeof(args_t), obj.kernarg));
memset(args, 0, sizeof(args_t));
args_t* args = static_cast<args_t*>(obj.hsa_malloc(sizeof(args_t), obj.kernarg));
*args = {};
uint32_t* a = static_cast<uint32_t*>(obj.hsa_malloc(64 * sizeof(uint32_t), obj.kernarg));
uint32_t* b = static_cast<uint32_t*>(obj.hsa_malloc(64 * sizeof(uint32_t), obj.kernarg));
@@ -95,15 +96,21 @@ main()
memset(b, 1, 64 * sizeof(uint32_t));
// Create queue in gpu agent and prepare a kernel dispatch packet
hsa_queue_t* queue1;
status = hsa_queue_create(
obj.gpu[0].agent, 1024, HSA_QUEUE_TYPE_SINGLE, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue1);
hsa_queue_t* queue1 = nullptr;
status = hsa_queue_create(obj.gpu[0].agent,
1024,
HSA_QUEUE_TYPE_SINGLE,
nullptr,
nullptr,
UINT32_MAX,
UINT32_MAX,
&queue1);
RET_IF_HSA_ERR(status)
// Create a signal with a value of 1 and attach it to the first kernel
// dispatch packet
hsa_signal_t completion_signal_1;
status = hsa_signal_create(1, 0, NULL, &completion_signal_1);
hsa_signal_t completion_signal_1 = {};
status = hsa_signal_create(1, 0, nullptr, &completion_signal_1);
RET_IF_HSA_ERR(status)
// First dispatch packet on queue 1, Kernel A
@@ -137,12 +144,12 @@ main()
// Create a signal with a value of 1 and attach it to the second kernel
// dispatch packet
hsa_signal_t completion_signal_2;
status = hsa_signal_create(1, 0, NULL, &completion_signal_2);
hsa_signal_t completion_signal_2 = {};
status = hsa_signal_create(1, 0, nullptr, &completion_signal_2);
RET_IF_HSA_ERR(status)
hsa_signal_t completion_signal_3;
status = hsa_signal_create(1, 0, NULL, &completion_signal_3);
hsa_signal_t completion_signal_3 = {};
status = hsa_signal_create(1, 0, nullptr, &completion_signal_3);
RET_IF_HSA_ERR(status)
// Create barrier-AND packet that is enqueued in queue 1
@@ -186,9 +193,15 @@ main()
}
// Create queue 2
hsa_queue_t* queue2;
status = hsa_queue_create(
obj.gpu[0].agent, 1024, HSA_QUEUE_TYPE_SINGLE, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue2);
hsa_queue_t* queue2 = nullptr;
status = hsa_queue_create(obj.gpu[0].agent,
1024,
HSA_QUEUE_TYPE_SINGLE,
nullptr,
nullptr,
UINT32_MAX,
UINT32_MAX,
&queue2);
RET_IF_HSA_ERR(status)
// Create barrier-AND packet that is enqueued in queue 2
@@ -277,6 +290,7 @@ main()
status = hsa_memory_free(a);
RET_IF_HSA_ERR(status)
status = hsa_memory_free(b);
RET_IF_HSA_ERR(status)
@@ -285,5 +299,6 @@ main()
status = hsa_code_object_reader_destroy(code_object.code_obj_rdr);
RET_IF_HSA_ERR(status)
close(code_object.file);
}
+52 -44
View File
@@ -88,18 +88,18 @@ public:
struct CodeObject
{
hsa_file_t file;
hsa_code_object_reader_t code_obj_rdr;
hsa_executable_t executable;
hsa_file_t file = 0;
hsa_code_object_reader_t code_obj_rdr = {};
hsa_executable_t executable = {};
};
struct Kernel
{
uint64_t handle;
uint32_t scratch;
uint32_t group;
uint32_t kernarg_size;
uint32_t kernarg_align;
uint64_t handle = 0;
uint32_t scratch = 0;
uint32_t group = 0;
uint32_t kernarg_size = 0;
uint32_t kernarg_align = 0;
};
union AqlHeader
@@ -112,23 +112,23 @@ public:
uint16_t release : 2;
uint16_t reserved : 3;
};
uint16_t raw;
uint16_t raw = 0;
};
struct BarrierValue
{
AqlHeader header;
uint8_t AmdFormat;
uint8_t reserved;
uint32_t reserved1;
hsa_signal_t signal;
hsa_signal_value_t value;
hsa_signal_value_t mask;
uint32_t cond;
uint32_t reserved2;
uint64_t reserved3;
uint64_t reserved4;
hsa_signal_t completion_signal;
AqlHeader header = {};
uint8_t AmdFormat = 0;
uint8_t reserved = 0;
uint32_t reserved1 = 0;
hsa_signal_t signal = {};
hsa_signal_value_t value = 0;
hsa_signal_value_t mask = 0;
uint32_t cond = 0;
uint32_t reserved2 = 0;
uint64_t reserved3 = 0;
uint64_t reserved4 = 0;
hsa_signal_t completion_signal = {};
};
union Aql
@@ -137,21 +137,23 @@ public:
hsa_kernel_dispatch_packet_t dispatch;
hsa_barrier_and_packet_t barrier_and;
hsa_barrier_or_packet_t barrier_or;
BarrierValue barrier_value;
BarrierValue barrier_value = {};
};
struct OCLHiddenArgs
{
uint64_t offset_x;
uint64_t offset_y;
uint64_t offset_z;
void* printf_buffer;
void* enqueue;
void* enqueue2;
void* multi_grid;
uint64_t offset_x = 0;
uint64_t offset_y = 0;
uint64_t offset_z = 0;
void* printf_buffer = nullptr;
void* enqueue = nullptr;
void* enqueue2 = nullptr;
void* multi_grid = nullptr;
};
bool load_code_object(std::string filename, hsa_agent_t agent, CodeObject& code_object)
static bool load_code_object(const std::string& filename,
hsa_agent_t agent,
CodeObject& code_object)
{
hsa_status_t err;
code_object.file = open(filename.c_str(), O_RDONLY);
@@ -181,10 +183,10 @@ public:
return true;
}
bool get_kernel(const CodeObject& code_object,
std::string kernel,
hsa_agent_t agent,
Kernel& kern)
static bool get_kernel(const CodeObject& code_object,
const std::string& kernel,
hsa_agent_t agent,
Kernel& kern)
{
hsa_executable_symbol_t symbol;
hsa_status_t err = hsa_executable_get_symbol_by_name(
@@ -207,7 +209,7 @@ public:
}
// Not for parallel insertion.
bool submit_packet(hsa_queue_t* queue, Aql& pkt)
static bool submit_packet(hsa_queue_t* queue, Aql& pkt)
{
size_t mask = queue->size - 1;
Aql* ring = static_cast<Aql*>(queue->base_address);
@@ -230,26 +232,26 @@ public:
return true;
}
void* hsa_malloc(size_t size, const Device::Memory& mem)
static void* hsa_malloc(size_t size, const Device::Memory& mem)
{
void* ret;
hsa_status_t err = hsa_amd_memory_pool_allocate(mem.pool, size, 0, &ret);
RET_IF_HSA_ERR(err);
err = hsa_amd_agents_allow_access(
Device::all_devices.size(), &Device::all_devices[0], nullptr, ret);
Device::all_devices.size(), Device::all_devices.data(), nullptr, ret);
RET_IF_HSA_ERR(err);
return ret;
}
void* hsa_malloc(size_t size, const Device& dev, bool fine)
static void* hsa_malloc(size_t size, const Device& dev, bool fine)
{
uint32_t index = fine ? dev.fine : dev.coarse;
assert(index != -1u && "Memory type unavailable.");
return hsa_malloc(size, dev.pools[index]);
}
bool device_discovery()
static bool device_discovery()
{
hsa_status_t err;
@@ -273,10 +275,15 @@ public:
error = hsa_amd_agent_iterate_memory_pools(
agent,
[](hsa_amd_memory_pool_t pool, void* data) {
std::vector<Device::Memory>& pools =
*reinterpret_cast<std::vector<Device::Memory>*>(data);
auto& pools = *reinterpret_cast<std::vector<Device::Memory>*>(data);
hsa_status_t status;
bool allowed = false;
status = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &allowed);
if(!allowed) return HSA_STATUS_SUCCESS;
hsa_amd_segment_t segment;
status = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
@@ -290,9 +297,10 @@ public:
RET_IF_HSA_ERR(status)
Device::Memory mem;
mem.pool = pool;
mem.fine = (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED);
mem.kernarg = (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT);
mem.pool = pool;
mem.fine = ((flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) != 0u);
mem.kernarg =
((flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) != 0u);
status = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &mem.size);