rocr: Correct gpu dumped core contents (#2851)
Includes several tests (rocrtst) for this capability.
This commit is contained in:
تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است
Diff را بارگزاری کن
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
* Copyright © Advanced Micro Devices, Inc., or its affiliates.
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
#ifndef ROCRTST_SUITES_FUNCTIONAL_GPU_COREDUMP_H_
|
||||
#define ROCRTST_SUITES_FUNCTIONAL_GPU_COREDUMP_H_
|
||||
|
||||
#include <string>
|
||||
#include <sys/resource.h>
|
||||
#include "common/base_rocr.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "suites/test_common/test_base.h"
|
||||
|
||||
class GpuCoreDumpTest : public TestBase {
|
||||
public:
|
||||
GpuCoreDumpTest();
|
||||
virtual ~GpuCoreDumpTest();
|
||||
|
||||
// Override to avoid HSA init in parent
|
||||
virtual void SetUp();
|
||||
virtual void Run();
|
||||
virtual void Close();
|
||||
virtual void DisplayTestInfo(void);
|
||||
virtual void DisplayResults(void) const;
|
||||
|
||||
// Test cases
|
||||
void TestDefaultPattern(void);
|
||||
void TestCustomPattern(void);
|
||||
void TestDisableFlag(void);
|
||||
void TestPatternSubstitution(void);
|
||||
void TestInvalidPath(void);
|
||||
void TestCoreDumpContentIntegrity(void);
|
||||
void TestPipePattern(void);
|
||||
|
||||
private:
|
||||
// Run faulting kernel in child process (returns child PID)
|
||||
pid_t RunFaultingKernelInChild();
|
||||
|
||||
// Verify core dump file exists and is valid
|
||||
bool VerifyCoreDumpFile(const std::string& filename);
|
||||
|
||||
// Check if file is a valid GPU core dump (ELF format)
|
||||
bool IsValidGPUCoreDump(const std::string& filename);
|
||||
|
||||
// Clean up core dump files
|
||||
void CleanupCoreDumps(const std::string& pattern);
|
||||
|
||||
// Validate PT_NOTE segment structure and contents
|
||||
bool ValidateNoteSegment(int fd, uint64_t offset, uint64_t size);
|
||||
|
||||
// Validate PT_LOAD segment contents against live memory
|
||||
bool ValidateLoadSegment(int fd, uint64_t file_offset, uint64_t vaddr, uint64_t size, pid_t child_pid);
|
||||
|
||||
// Check if prerequisites for core dump tests are met
|
||||
bool CheckPrerequisites();
|
||||
|
||||
std::string test_dir_;
|
||||
struct rlimit original_rlimit_;
|
||||
bool prerequisites_met_;
|
||||
};
|
||||
|
||||
#endif // ROCRTST_SUITES_FUNCTIONAL_GPU_COREDUMP_H_
|
||||
@@ -0,0 +1,192 @@
|
||||
/*
|
||||
* Copyright © Advanced Micro Devices, Inc., or its affiliates.
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
// This test is based on TestExample but intentionally passes nullptr for
|
||||
// the kernel array arguments to trigger a GPU fault. This is useful for
|
||||
// testing GPU core dump functionality and fault handling.
|
||||
//
|
||||
// Key differences from TestExample:
|
||||
// * No memory allocation for src_buffer or dst_buffer
|
||||
// * Kernel arguments set with nullptr for array pointers
|
||||
// * No result verification (we expect a fault)
|
||||
// * Test is DISABLED by default to prevent running in CI
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "suites/functional/test_fault_example.h"
|
||||
#include "common/base_rocr_utils.h"
|
||||
#include "common/common.h"
|
||||
#include "common/helper_funcs.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "hsa/hsa.h"
|
||||
|
||||
static const uint32_t kNumBufferElements = rocrtst::isEmuModeEnabled() ? 4 : 256;
|
||||
|
||||
TestFaultExample::TestFaultExample(void) :
|
||||
TestBase() {
|
||||
set_num_iteration(1); // Only need one iteration to trigger the fault
|
||||
set_title("Test Fault Example");
|
||||
set_description("This test intentionally passes nullptr for kernel array "
|
||||
"arguments to trigger a GPU fault. This is useful for testing GPU "
|
||||
"core dump functionality and fault handling mechanisms. "
|
||||
"NOTE: This test is DISABLED by default and should be run manually.");
|
||||
|
||||
set_kernel_file_name("test_case_template_kernels.hsaco");
|
||||
set_kernel_name("square"); // kernel function name
|
||||
}
|
||||
|
||||
TestFaultExample::~TestFaultExample(void) {
|
||||
}
|
||||
|
||||
// Setup the test environment - similar to TestExample but without buffer allocation
|
||||
void TestFaultExample::SetUp(void) {
|
||||
hsa_status_t err;
|
||||
|
||||
TestBase::SetUp();
|
||||
|
||||
err = rocrtst::SetDefaultAgents(this);
|
||||
ASSERT_EQ(HSA_STATUS_SUCCESS, err);
|
||||
|
||||
hsa_agent_t* gpu_dev = gpu_device1();
|
||||
|
||||
// Find and assign HSA_AMD_SEGMENT_GLOBAL pools for cpu, gpu and a kern_arg pool
|
||||
err = rocrtst::SetPoolsTypical(this);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
// Create a queue
|
||||
hsa_queue_t* q = nullptr;
|
||||
rocrtst::CreateQueue(*gpu_dev, &q);
|
||||
ASSERT_NE(q, nullptr);
|
||||
set_main_queue(q);
|
||||
|
||||
err = rocrtst::LoadKernelFromObjFile(this, gpu_dev);
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
err = rocrtst::InitializeAQLPacket(this, &aql());
|
||||
ASSERT_EQ(HSA_STATUS_SUCCESS, err);
|
||||
|
||||
// NOTE: We do NOT allocate src_buffer or dst_buffer
|
||||
// We will pass nullptr to the kernel to trigger a fault
|
||||
|
||||
// Set up Kernel arguments with nullptr for array pointers
|
||||
struct __attribute__((aligned(16))) local_args_t {
|
||||
uint32_t* dstArray;
|
||||
uint32_t* srcArray;
|
||||
uint32_t size;
|
||||
uint32_t pad;
|
||||
uint64_t global_offset_x;
|
||||
uint64_t global_offset_y;
|
||||
uint64_t global_offset_z;
|
||||
uint64_t printf_buffer;
|
||||
uint64_t default_queue;
|
||||
uint64_t completion_action;
|
||||
} local_args;
|
||||
|
||||
// Intentionally set array pointers to nullptr to cause a fault
|
||||
local_args.dstArray = nullptr;
|
||||
local_args.srcArray = nullptr;
|
||||
local_args.size = kNumBufferElements;
|
||||
local_args.global_offset_x = 0;
|
||||
local_args.global_offset_y = 0;
|
||||
local_args.global_offset_z = 0;
|
||||
local_args.printf_buffer = 0;
|
||||
local_args.default_queue = 0;
|
||||
local_args.completion_action = 0;
|
||||
|
||||
err = rocrtst::AllocAndSetKernArgs(this, &local_args, sizeof(local_args));
|
||||
ASSERT_EQ(err, HSA_STATUS_SUCCESS);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// This wrapper atomically writes the provided header and setup to the
|
||||
// provided AQL packet. The provided AQL packet address should be in the
|
||||
// queue memory space.
|
||||
static inline void AtomicSetPacketHeader(uint16_t header, uint16_t setup,
|
||||
hsa_kernel_dispatch_packet_t* queue_packet) {
|
||||
__atomic_store_n(reinterpret_cast<uint32_t*>(queue_packet),
|
||||
header | (setup << 16), __ATOMIC_RELEASE);
|
||||
}
|
||||
|
||||
void TestFaultExample::Run(void) {
|
||||
// Compare required profile for this test case with what we're actually
|
||||
// running on
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
TestBase::Run();
|
||||
|
||||
// Override whatever we need to...
|
||||
aql().workgroup_size_x = kNumBufferElements;
|
||||
aql().grid_size_x = kNumBufferElements;
|
||||
|
||||
hsa_kernel_dispatch_packet_t *queue_aql_packet;
|
||||
uint64_t index;
|
||||
|
||||
if (verbosity() >= VERBOSE_STANDARD) {
|
||||
std::cout << "Dispatching kernel with nullptr arrays - expecting GPU fault..." << std::endl;
|
||||
}
|
||||
|
||||
// This function simply copies the data we've collected so far into our
|
||||
// local AQL packet, except the the setup and header fields.
|
||||
queue_aql_packet = WriteAQLToQueue(this, &index);
|
||||
ASSERT_EQ(queue_aql_packet,
|
||||
reinterpret_cast<hsa_kernel_dispatch_packet_t *>
|
||||
(main_queue()->base_address) + index);
|
||||
uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
||||
|
||||
aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
|
||||
HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
|
||||
aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
|
||||
HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
|
||||
|
||||
::AtomicSetPacketHeader(aql_header, aql().setup, queue_aql_packet);
|
||||
|
||||
hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
|
||||
|
||||
// Wait on the dispatch signal until the kernel is finished (or faults).
|
||||
// Note: This may trigger a GPU fault/exception
|
||||
while (hsa_signal_wait_scacquire(aql().completion_signal,
|
||||
HSA_SIGNAL_CONDITION_LT, 1, (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) {
|
||||
}
|
||||
|
||||
if (verbosity() >= VERBOSE_STANDARD) {
|
||||
std::cout << "Kernel dispatch completed (fault may have occurred)" << std::endl;
|
||||
}
|
||||
|
||||
hsa_signal_store_screlease(aql().completion_signal, 1);
|
||||
|
||||
// NOTE: We do NOT verify results since we expect a fault
|
||||
}
|
||||
|
||||
void TestFaultExample::DisplayTestInfo(void) {
|
||||
TestBase::DisplayTestInfo();
|
||||
}
|
||||
|
||||
void TestFaultExample::DisplayResults(void) const {
|
||||
// Compare required profile for this test case with what we're actually
|
||||
// running on
|
||||
if (!rocrtst::CheckProfile(this)) {
|
||||
return;
|
||||
}
|
||||
|
||||
TestBase::DisplayResults();
|
||||
std::cout << "Test completed. Check for GPU core dump if fault handling is enabled." << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
void TestFaultExample::Close() {
|
||||
// NOTE: We do NOT free src_buffer or dst_buffer since we never allocated them
|
||||
|
||||
// This will close handles opened within rocrtst utility calls and call
|
||||
// hsa_shut_down(), so it should be done after other hsa cleanup
|
||||
TestBase::Close();
|
||||
}
|
||||
|
||||
#undef RET_IF_HSA_ERR
|
||||
@@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Copyright © Advanced Micro Devices, Inc., or its affiliates.
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#ifndef ROCRTST_SUITES_FUNCTIONAL_TEST_FAULT_EXAMPLE_H_
|
||||
#define ROCRTST_SUITES_FUNCTIONAL_TEST_FAULT_EXAMPLE_H_
|
||||
|
||||
#include "common/base_rocr.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "suites/test_common/test_base.h"
|
||||
|
||||
class TestFaultExample : public TestBase {
|
||||
public:
|
||||
TestFaultExample();
|
||||
|
||||
// @Brief: Destructor for test case of TestFaultExample
|
||||
virtual ~TestFaultExample();
|
||||
|
||||
// @Brief: Setup the environment for measurement
|
||||
virtual void SetUp();
|
||||
|
||||
// @Brief: Core measurement execution
|
||||
virtual void Run();
|
||||
|
||||
// @Brief: Clean up and retrieve the resource
|
||||
virtual void Close();
|
||||
|
||||
// @Brief: Display results
|
||||
virtual void DisplayResults() const;
|
||||
|
||||
// @Brief: Display information about what this test does
|
||||
virtual void DisplayTestInfo(void);
|
||||
};
|
||||
|
||||
#endif // ROCRTST_SUITES_FUNCTIONAL_TEST_FAULT_EXAMPLE_H_
|
||||
@@ -69,6 +69,7 @@
|
||||
#include "suites/stress/memory_concurrent_tests.h"
|
||||
#include "suites/stress/queue_write_index_concurrent_tests.h"
|
||||
#include "suites/test_common/test_case_template.h"
|
||||
#include "suites/functional/test_fault_example.h"
|
||||
#include "suites/test_common/main.h"
|
||||
#include "suites/test_common/test_common.h"
|
||||
#include "suites/functional/concurrent_init.h"
|
||||
@@ -80,6 +81,7 @@
|
||||
#include "suites/functional/signal_kernel.h"
|
||||
#include "suites/functional/cu_masking.h"
|
||||
#include "suites/functional/filter_devices.h"
|
||||
#include "suites/functional/gpu_coredump.h"
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "common/common.h"
|
||||
#include "suites/functional/counted_queues.h"
|
||||
@@ -320,6 +322,69 @@ TEST(rocrtstFunc, Memory_Available) {
|
||||
);
|
||||
}
|
||||
|
||||
TEST(rocrtstFunc, GpuCoreDump_DefaultPattern) {
|
||||
RUN_IF_NOT_EMU_MODE(
|
||||
GpuCoreDumpTest gcd;
|
||||
RunCustomTestProlog(&gcd);
|
||||
gcd.TestDefaultPattern();
|
||||
RunCustomTestEpilog(&gcd);
|
||||
);
|
||||
}
|
||||
|
||||
TEST(rocrtstFunc, GpuCoreDump_CustomPattern) {
|
||||
RUN_IF_NOT_EMU_MODE(
|
||||
GpuCoreDumpTest gcd;
|
||||
RunCustomTestProlog(&gcd);
|
||||
gcd.TestCustomPattern();
|
||||
RunCustomTestEpilog(&gcd);
|
||||
);
|
||||
}
|
||||
|
||||
TEST(rocrtstFunc, GpuCoreDump_DisableFlag) {
|
||||
RUN_IF_NOT_EMU_MODE(
|
||||
GpuCoreDumpTest gcd;
|
||||
RunCustomTestProlog(&gcd);
|
||||
gcd.TestDisableFlag();
|
||||
RunCustomTestEpilog(&gcd);
|
||||
);
|
||||
}
|
||||
|
||||
TEST(rocrtstFunc, GpuCoreDump_PatternSubstitution) {
|
||||
RUN_IF_NOT_EMU_MODE(
|
||||
GpuCoreDumpTest gcd;
|
||||
RunCustomTestProlog(&gcd);
|
||||
gcd.TestPatternSubstitution();
|
||||
RunCustomTestEpilog(&gcd);
|
||||
);
|
||||
}
|
||||
|
||||
TEST(rocrtstFunc, GpuCoreDump_InvalidPath) {
|
||||
RUN_IF_NOT_EMU_MODE(
|
||||
GpuCoreDumpTest gcd;
|
||||
RunCustomTestProlog(&gcd);
|
||||
gcd.TestInvalidPath();
|
||||
RunCustomTestEpilog(&gcd);
|
||||
);
|
||||
}
|
||||
|
||||
TEST(rocrtstFunc, GpuCoreDump_ContentIntegrity) {
|
||||
RUN_IF_NOT_EMU_MODE(
|
||||
GpuCoreDumpTest gcd;
|
||||
RunCustomTestProlog(&gcd);
|
||||
gcd.TestCoreDumpContentIntegrity();
|
||||
RunCustomTestEpilog(&gcd);
|
||||
);
|
||||
}
|
||||
|
||||
TEST(rocrtstFunc, GpuCoreDump_PipePattern) {
|
||||
RUN_IF_NOT_EMU_MODE(
|
||||
GpuCoreDumpTest gcd;
|
||||
RunCustomTestProlog(&gcd);
|
||||
gcd.TestPipePattern();
|
||||
RunCustomTestEpilog(&gcd);
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
TEST(rocrtstFunc, Memory_Atomic_Add_Test) {
|
||||
RUN_IF_NOT_EMU_MODE(
|
||||
|
||||
@@ -548,22 +548,28 @@ hsa_status_t write_core_dump_to_fd(int fd, const SegmentsInfo& segments,
|
||||
return (uint32_t)0;
|
||||
}
|
||||
} (seg.stype);
|
||||
if (size_limit != -1 && (offset + seg.size > size_limit)) {
|
||||
if (show_progress) {
|
||||
printf("Core limit file reached during pipe write\n");
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
phdr.p_offset = alignUp(offset, (uint64_t)1 << phdr.p_align);
|
||||
phdrs.push_back(phdr);
|
||||
offset += phdr.p_filesz;
|
||||
}
|
||||
|
||||
// Write all program headers
|
||||
for (const auto& phdr : phdrs) {
|
||||
if (write(fd, &phdr, sizeof(phdr)) != sizeof(phdr)) {
|
||||
perror("Failed to write program header to pipe");
|
||||
return HSA_STATUS_ERROR;
|
||||
if (is_reg_file) {
|
||||
// For regular files, use pwrite to write at specific offset
|
||||
for (size_t i = 0; i < phdrs.size(); i++) {
|
||||
off_t phdr_offset = sizeof(Elf64_Ehdr) + i * sizeof(Elf64_Phdr);
|
||||
if (pwrite(fd, &phdrs[i], sizeof(Elf64_Phdr), phdr_offset) != sizeof(Elf64_Phdr)) {
|
||||
perror("Failed to write program header");
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// For pipes, use sequential write
|
||||
for (const auto& phdr : phdrs) {
|
||||
if (write(fd, &phdr, sizeof(phdr)) != sizeof(phdr)) {
|
||||
perror("Failed to write program header to pipe");
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -572,6 +578,15 @@ hsa_status_t write_core_dump_to_fd(int fd, const SegmentsInfo& segments,
|
||||
const SegmentInfo& seg = segments[idx];
|
||||
const Elf64_Phdr& phdr = phdrs[idx];
|
||||
|
||||
// Check if this segment would exceed size limit
|
||||
if (size_limit != -1 && (phdr.p_offset + phdr.p_filesz > size_limit)) {
|
||||
if (show_progress) {
|
||||
fprintf(stderr, "Core file size limit reached, truncating at segment %zu\n", idx);
|
||||
}
|
||||
// Stop writing segments but return success - we wrote valid headers
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
if (is_reg_file) {
|
||||
int error = posix_fallocate(fd, phdr.p_offset, phdr.p_filesz);
|
||||
if (error != 0) {
|
||||
@@ -588,9 +603,21 @@ hsa_status_t write_core_dump_to_fd(int fd, const SegmentsInfo& segments,
|
||||
if (st != HSA_STATUS_SUCCESS) {
|
||||
return st;
|
||||
}
|
||||
if (write(fd, copy_buffer.get(), curr_chunk) != (ssize_t)curr_chunk) {
|
||||
perror("Failed to write segment data to pipe");
|
||||
return HSA_STATUS_ERROR;
|
||||
|
||||
if (is_reg_file) {
|
||||
// For regular files, use pwrite to write at specific offset
|
||||
if (pwrite(fd, copy_buffer.get(), curr_chunk,
|
||||
phdr.p_offset + phdr.p_filesz - remaining) !=
|
||||
(ssize_t)curr_chunk) {
|
||||
perror("Failed to write segment data");
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
} else {
|
||||
// For pipes, use sequential write
|
||||
if (write(fd, copy_buffer.get(), curr_chunk) != (ssize_t)curr_chunk) {
|
||||
perror("Failed to write segment data to pipe");
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
}
|
||||
remaining -= curr_chunk;
|
||||
}
|
||||
|
||||
مرجع در شماره جدید
Block a user