rocr: Correct gpu dumped core contents (#2851)

Includes several tests (rocrtst) for this capability.
This commit is contained in:
cfreeamd
2026-01-30 11:38:09 -06:00
committato da GitHub
parent 7e6b7cb50b
commit 5172701708
6 ha cambiato i file con 1759 aggiunte e 13 eliminazioni
File diff soppresso perché troppo grande Carica Diff
@@ -0,0 +1,63 @@
/*
* Copyright © Advanced Micro Devices, Inc., or its affiliates.
*
* SPDX-License-Identifier: MIT
*/
#ifndef ROCRTST_SUITES_FUNCTIONAL_GPU_COREDUMP_H_
#define ROCRTST_SUITES_FUNCTIONAL_GPU_COREDUMP_H_
#include <string>
#include <sys/resource.h>
#include "common/base_rocr.h"
#include "hsa/hsa.h"
#include "suites/test_common/test_base.h"
class GpuCoreDumpTest : public TestBase {
public:
GpuCoreDumpTest();
virtual ~GpuCoreDumpTest();
// Override to avoid HSA init in parent
virtual void SetUp();
virtual void Run();
virtual void Close();
virtual void DisplayTestInfo(void);
virtual void DisplayResults(void) const;
// Test cases
void TestDefaultPattern(void);
void TestCustomPattern(void);
void TestDisableFlag(void);
void TestPatternSubstitution(void);
void TestInvalidPath(void);
void TestCoreDumpContentIntegrity(void);
void TestPipePattern(void);
private:
// Run faulting kernel in child process (returns child PID)
pid_t RunFaultingKernelInChild();
// Verify core dump file exists and is valid
bool VerifyCoreDumpFile(const std::string& filename);
// Check if file is a valid GPU core dump (ELF format)
bool IsValidGPUCoreDump(const std::string& filename);
// Clean up core dump files
void CleanupCoreDumps(const std::string& pattern);
// Validate PT_NOTE segment structure and contents
bool ValidateNoteSegment(int fd, uint64_t offset, uint64_t size);
// Validate PT_LOAD segment contents against live memory
bool ValidateLoadSegment(int fd, uint64_t file_offset, uint64_t vaddr, uint64_t size, pid_t child_pid);
// Check if prerequisites for core dump tests are met
bool CheckPrerequisites();
std::string test_dir_;
struct rlimit original_rlimit_;
bool prerequisites_met_;
};
#endif // ROCRTST_SUITES_FUNCTIONAL_GPU_COREDUMP_H_
@@ -0,0 +1,192 @@
/*
* Copyright © Advanced Micro Devices, Inc., or its affiliates.
*
* SPDX-License-Identifier: MIT
*/
// This test is based on TestExample but intentionally passes nullptr for
// the kernel array arguments to trigger a GPU fault. This is useful for
// testing GPU core dump functionality and fault handling.
//
// Key differences from TestExample:
// * No memory allocation for src_buffer or dst_buffer
// * Kernel arguments set with nullptr for array pointers
// * No result verification (we expect a fault)
// * Test is DISABLED by default to prevent running in CI
#include <algorithm>
#include <iostream>
#include <vector>
#include "suites/functional/test_fault_example.h"
#include "common/base_rocr_utils.h"
#include "common/common.h"
#include "common/helper_funcs.h"
#include "gtest/gtest.h"
#include "hsa/hsa.h"
static const uint32_t kNumBufferElements = rocrtst::isEmuModeEnabled() ? 4 : 256;
TestFaultExample::TestFaultExample(void) :
TestBase() {
set_num_iteration(1); // Only need one iteration to trigger the fault
set_title("Test Fault Example");
set_description("This test intentionally passes nullptr for kernel array "
"arguments to trigger a GPU fault. This is useful for testing GPU "
"core dump functionality and fault handling mechanisms. "
"NOTE: This test is DISABLED by default and should be run manually.");
set_kernel_file_name("test_case_template_kernels.hsaco");
set_kernel_name("square"); // kernel function name
}
TestFaultExample::~TestFaultExample(void) {
}
// Setup the test environment - similar to TestExample but without buffer allocation
void TestFaultExample::SetUp(void) {
hsa_status_t err;
TestBase::SetUp();
err = rocrtst::SetDefaultAgents(this);
ASSERT_EQ(HSA_STATUS_SUCCESS, err);
hsa_agent_t* gpu_dev = gpu_device1();
// Find and assign HSA_AMD_SEGMENT_GLOBAL pools for cpu, gpu and a kern_arg pool
err = rocrtst::SetPoolsTypical(this);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
// Create a queue
hsa_queue_t* q = nullptr;
rocrtst::CreateQueue(*gpu_dev, &q);
ASSERT_NE(q, nullptr);
set_main_queue(q);
err = rocrtst::LoadKernelFromObjFile(this, gpu_dev);
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
err = rocrtst::InitializeAQLPacket(this, &aql());
ASSERT_EQ(HSA_STATUS_SUCCESS, err);
// NOTE: We do NOT allocate src_buffer or dst_buffer
// We will pass nullptr to the kernel to trigger a fault
// Set up Kernel arguments with nullptr for array pointers
struct __attribute__((aligned(16))) local_args_t {
uint32_t* dstArray;
uint32_t* srcArray;
uint32_t size;
uint32_t pad;
uint64_t global_offset_x;
uint64_t global_offset_y;
uint64_t global_offset_z;
uint64_t printf_buffer;
uint64_t default_queue;
uint64_t completion_action;
} local_args;
// Intentionally set array pointers to nullptr to cause a fault
local_args.dstArray = nullptr;
local_args.srcArray = nullptr;
local_args.size = kNumBufferElements;
local_args.global_offset_x = 0;
local_args.global_offset_y = 0;
local_args.global_offset_z = 0;
local_args.printf_buffer = 0;
local_args.default_queue = 0;
local_args.completion_action = 0;
err = rocrtst::AllocAndSetKernArgs(this, &local_args, sizeof(local_args));
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
return;
}
// This wrapper atomically writes the provided header and setup to the
// provided AQL packet. The provided AQL packet address should be in the
// queue memory space.
static inline void AtomicSetPacketHeader(uint16_t header, uint16_t setup,
hsa_kernel_dispatch_packet_t* queue_packet) {
__atomic_store_n(reinterpret_cast<uint32_t*>(queue_packet),
header | (setup << 16), __ATOMIC_RELEASE);
}
void TestFaultExample::Run(void) {
// Compare required profile for this test case with what we're actually
// running on
if (!rocrtst::CheckProfile(this)) {
return;
}
TestBase::Run();
// Override whatever we need to...
aql().workgroup_size_x = kNumBufferElements;
aql().grid_size_x = kNumBufferElements;
hsa_kernel_dispatch_packet_t *queue_aql_packet;
uint64_t index;
if (verbosity() >= VERBOSE_STANDARD) {
std::cout << "Dispatching kernel with nullptr arrays - expecting GPU fault..." << std::endl;
}
// This function simply copies the data we've collected so far into our
// local AQL packet, except the the setup and header fields.
queue_aql_packet = WriteAQLToQueue(this, &index);
ASSERT_EQ(queue_aql_packet,
reinterpret_cast<hsa_kernel_dispatch_packet_t *>
(main_queue()->base_address) + index);
uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
::AtomicSetPacketHeader(aql_header, aql().setup, queue_aql_packet);
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
// Wait on the dispatch signal until the kernel is finished (or faults).
// Note: This may trigger a GPU fault/exception
while (hsa_signal_wait_scacquire(aql().completion_signal,
HSA_SIGNAL_CONDITION_LT, 1, (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) {
}
if (verbosity() >= VERBOSE_STANDARD) {
std::cout << "Kernel dispatch completed (fault may have occurred)" << std::endl;
}
hsa_signal_store_screlease(aql().completion_signal, 1);
// NOTE: We do NOT verify results since we expect a fault
}
void TestFaultExample::DisplayTestInfo(void) {
TestBase::DisplayTestInfo();
}
void TestFaultExample::DisplayResults(void) const {
// Compare required profile for this test case with what we're actually
// running on
if (!rocrtst::CheckProfile(this)) {
return;
}
TestBase::DisplayResults();
std::cout << "Test completed. Check for GPU core dump if fault handling is enabled." << std::endl;
return;
}
void TestFaultExample::Close() {
// NOTE: We do NOT free src_buffer or dst_buffer since we never allocated them
// This will close handles opened within rocrtst utility calls and call
// hsa_shut_down(), so it should be done after other hsa cleanup
TestBase::Close();
}
#undef RET_IF_HSA_ERR
@@ -0,0 +1,37 @@
/*
* Copyright © Advanced Micro Devices, Inc., or its affiliates.
*
* SPDX-License-Identifier: MIT
*/
#ifndef ROCRTST_SUITES_FUNCTIONAL_TEST_FAULT_EXAMPLE_H_
#define ROCRTST_SUITES_FUNCTIONAL_TEST_FAULT_EXAMPLE_H_
#include "common/base_rocr.h"
#include "hsa/hsa.h"
#include "suites/test_common/test_base.h"
class TestFaultExample : public TestBase {
public:
TestFaultExample();
// @Brief: Destructor for test case of TestFaultExample
virtual ~TestFaultExample();
// @Brief: Setup the environment for measurement
virtual void SetUp();
// @Brief: Core measurement execution
virtual void Run();
// @Brief: Clean up and retrieve the resource
virtual void Close();
// @Brief: Display results
virtual void DisplayResults() const;
// @Brief: Display information about what this test does
virtual void DisplayTestInfo(void);
};
#endif // ROCRTST_SUITES_FUNCTIONAL_TEST_FAULT_EXAMPLE_H_
@@ -69,6 +69,7 @@
#include "suites/stress/memory_concurrent_tests.h"
#include "suites/stress/queue_write_index_concurrent_tests.h"
#include "suites/test_common/test_case_template.h"
#include "suites/functional/test_fault_example.h"
#include "suites/test_common/main.h"
#include "suites/test_common/test_common.h"
#include "suites/functional/concurrent_init.h"
@@ -80,6 +81,7 @@
#include "suites/functional/signal_kernel.h"
#include "suites/functional/cu_masking.h"
#include "suites/functional/filter_devices.h"
#include "suites/functional/gpu_coredump.h"
#include "amd_smi/amdsmi.h"
#include "common/common.h"
#include "suites/functional/counted_queues.h"
@@ -320,6 +322,69 @@ TEST(rocrtstFunc, Memory_Available) {
);
}
TEST(rocrtstFunc, GpuCoreDump_DefaultPattern) {
RUN_IF_NOT_EMU_MODE(
GpuCoreDumpTest gcd;
RunCustomTestProlog(&gcd);
gcd.TestDefaultPattern();
RunCustomTestEpilog(&gcd);
);
}
TEST(rocrtstFunc, GpuCoreDump_CustomPattern) {
RUN_IF_NOT_EMU_MODE(
GpuCoreDumpTest gcd;
RunCustomTestProlog(&gcd);
gcd.TestCustomPattern();
RunCustomTestEpilog(&gcd);
);
}
TEST(rocrtstFunc, GpuCoreDump_DisableFlag) {
RUN_IF_NOT_EMU_MODE(
GpuCoreDumpTest gcd;
RunCustomTestProlog(&gcd);
gcd.TestDisableFlag();
RunCustomTestEpilog(&gcd);
);
}
TEST(rocrtstFunc, GpuCoreDump_PatternSubstitution) {
RUN_IF_NOT_EMU_MODE(
GpuCoreDumpTest gcd;
RunCustomTestProlog(&gcd);
gcd.TestPatternSubstitution();
RunCustomTestEpilog(&gcd);
);
}
TEST(rocrtstFunc, GpuCoreDump_InvalidPath) {
RUN_IF_NOT_EMU_MODE(
GpuCoreDumpTest gcd;
RunCustomTestProlog(&gcd);
gcd.TestInvalidPath();
RunCustomTestEpilog(&gcd);
);
}
TEST(rocrtstFunc, GpuCoreDump_ContentIntegrity) {
RUN_IF_NOT_EMU_MODE(
GpuCoreDumpTest gcd;
RunCustomTestProlog(&gcd);
gcd.TestCoreDumpContentIntegrity();
RunCustomTestEpilog(&gcd);
);
}
TEST(rocrtstFunc, GpuCoreDump_PipePattern) {
RUN_IF_NOT_EMU_MODE(
GpuCoreDumpTest gcd;
RunCustomTestProlog(&gcd);
gcd.TestPipePattern();
RunCustomTestEpilog(&gcd);
);
}
TEST(rocrtstFunc, Memory_Atomic_Add_Test) {
RUN_IF_NOT_EMU_MODE(
@@ -548,22 +548,28 @@ hsa_status_t write_core_dump_to_fd(int fd, const SegmentsInfo& segments,
return (uint32_t)0;
}
} (seg.stype);
if (size_limit != -1 && (offset + seg.size > size_limit)) {
if (show_progress) {
printf("Core limit file reached during pipe write\n");
}
return HSA_STATUS_SUCCESS;
}
phdr.p_offset = alignUp(offset, (uint64_t)1 << phdr.p_align);
phdrs.push_back(phdr);
offset += phdr.p_filesz;
}
// Write all program headers
for (const auto& phdr : phdrs) {
if (write(fd, &phdr, sizeof(phdr)) != sizeof(phdr)) {
perror("Failed to write program header to pipe");
return HSA_STATUS_ERROR;
if (is_reg_file) {
// For regular files, use pwrite to write at specific offset
for (size_t i = 0; i < phdrs.size(); i++) {
off_t phdr_offset = sizeof(Elf64_Ehdr) + i * sizeof(Elf64_Phdr);
if (pwrite(fd, &phdrs[i], sizeof(Elf64_Phdr), phdr_offset) != sizeof(Elf64_Phdr)) {
perror("Failed to write program header");
return HSA_STATUS_ERROR;
}
}
} else {
// For pipes, use sequential write
for (const auto& phdr : phdrs) {
if (write(fd, &phdr, sizeof(phdr)) != sizeof(phdr)) {
perror("Failed to write program header to pipe");
return HSA_STATUS_ERROR;
}
}
}
@@ -572,6 +578,15 @@ hsa_status_t write_core_dump_to_fd(int fd, const SegmentsInfo& segments,
const SegmentInfo& seg = segments[idx];
const Elf64_Phdr& phdr = phdrs[idx];
// Check if this segment would exceed size limit
if (size_limit != -1 && (phdr.p_offset + phdr.p_filesz > size_limit)) {
if (show_progress) {
fprintf(stderr, "Core file size limit reached, truncating at segment %zu\n", idx);
}
// Stop writing segments but return success - we wrote valid headers
return HSA_STATUS_SUCCESS;
}
if (is_reg_file) {
int error = posix_fallocate(fd, phdr.p_offset, phdr.p_filesz);
if (error != 0) {
@@ -588,9 +603,21 @@ hsa_status_t write_core_dump_to_fd(int fd, const SegmentsInfo& segments,
if (st != HSA_STATUS_SUCCESS) {
return st;
}
if (write(fd, copy_buffer.get(), curr_chunk) != (ssize_t)curr_chunk) {
perror("Failed to write segment data to pipe");
return HSA_STATUS_ERROR;
if (is_reg_file) {
// For regular files, use pwrite to write at specific offset
if (pwrite(fd, copy_buffer.get(), curr_chunk,
phdr.p_offset + phdr.p_filesz - remaining) !=
(ssize_t)curr_chunk) {
perror("Failed to write segment data");
return HSA_STATUS_ERROR;
}
} else {
// For pipes, use sequential write
if (write(fd, copy_buffer.get(), curr_chunk) != (ssize_t)curr_chunk) {
perror("Failed to write segment data to pipe");
return HSA_STATUS_ERROR;
}
}
remaining -= curr_chunk;
}