rocr: Correct gpu dumped core contents (#2851)

Includes several tests (rocrtst) for this capability.
2026-01-30 11:38:09 -06:00
parent 7e6b7cb50b
commit 5172701708
@@ -0,0 +1,63 @@
+/*
+ * Copyright © Advanced Micro Devices, Inc., or its affiliates.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+#ifndef ROCRTST_SUITES_FUNCTIONAL_GPU_COREDUMP_H_
+#define ROCRTST_SUITES_FUNCTIONAL_GPU_COREDUMP_H_
+
+#include <string>
+#include <sys/resource.h>
+#include "common/base_rocr.h"
+#include "hsa/hsa.h"
+#include "suites/test_common/test_base.h"
+
+class GpuCoreDumpTest : public TestBase {
+ public:
+  GpuCoreDumpTest();
+  virtual ~GpuCoreDumpTest();
+
+  // Override to avoid HSA init in parent
+  virtual void SetUp();
+  virtual void Run();
+  virtual void Close();
+  virtual void DisplayTestInfo(void);
+  virtual void DisplayResults(void) const;
+
+  // Test cases
+  void TestDefaultPattern(void);
+  void TestCustomPattern(void);
+  void TestDisableFlag(void);
+  void TestPatternSubstitution(void);
+  void TestInvalidPath(void);
+  void TestCoreDumpContentIntegrity(void);
+  void TestPipePattern(void);
+
+ private:
+  // Run faulting kernel in child process (returns child PID)
+  pid_t RunFaultingKernelInChild();
+
+  // Verify core dump file exists and is valid
+  bool VerifyCoreDumpFile(const std::string& filename);
+
+  // Check if file is a valid GPU core dump (ELF format)
+  bool IsValidGPUCoreDump(const std::string& filename);
+
+  // Clean up core dump files
+  void CleanupCoreDumps(const std::string& pattern);
+
+  // Validate PT_NOTE segment structure and contents
+  bool ValidateNoteSegment(int fd, uint64_t offset, uint64_t size);
+
+  // Validate PT_LOAD segment contents against live memory
+  bool ValidateLoadSegment(int fd, uint64_t file_offset, uint64_t vaddr, uint64_t size, pid_t child_pid);
+
+  // Check if prerequisites for core dump tests are met
+  bool CheckPrerequisites();
+
+  std::string test_dir_;
+  struct rlimit original_rlimit_;
+  bool prerequisites_met_;
+};
+
+#endif  // ROCRTST_SUITES_FUNCTIONAL_GPU_COREDUMP_H_
@@ -0,0 +1,192 @@
+/*
+ * Copyright © Advanced Micro Devices, Inc., or its affiliates.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+// This test is based on TestExample but intentionally passes nullptr for
+// the kernel array arguments to trigger a GPU fault. This is useful for
+// testing GPU core dump functionality and fault handling.
+//
+// Key differences from TestExample:
+// * No memory allocation for src_buffer or dst_buffer
+// * Kernel arguments set with nullptr for array pointers
+// * No result verification (we expect a fault)
+// * Test is DISABLED by default to prevent running in CI
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "suites/functional/test_fault_example.h"
+#include "common/base_rocr_utils.h"
+#include "common/common.h"
+#include "common/helper_funcs.h"
+#include "gtest/gtest.h"
+#include "hsa/hsa.h"
+
+static const uint32_t kNumBufferElements = rocrtst::isEmuModeEnabled() ? 4 : 256;
+
+TestFaultExample::TestFaultExample(void) :
+    TestBase() {
+  set_num_iteration(1);  // Only need one iteration to trigger the fault
+  set_title("Test Fault Example");
+  set_description("This test intentionally passes nullptr for kernel array "
+      "arguments to trigger a GPU fault. This is useful for testing GPU "
+      "core dump functionality and fault handling mechanisms. "
+      "NOTE: This test is DISABLED by default and should be run manually.");
+
+  set_kernel_file_name("test_case_template_kernels.hsaco");
+  set_kernel_name("square");  // kernel function name
+}
+
+TestFaultExample::~TestFaultExample(void) {
+}
+
+// Setup the test environment - similar to TestExample but without buffer allocation
+void TestFaultExample::SetUp(void) {
+  hsa_status_t err;
+
+  TestBase::SetUp();
+
+  err = rocrtst::SetDefaultAgents(this);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
+  hsa_agent_t* gpu_dev = gpu_device1();
+
+  // Find and assign HSA_AMD_SEGMENT_GLOBAL pools for cpu, gpu and a kern_arg pool
+  err = rocrtst::SetPoolsTypical(this);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // Create a queue
+  hsa_queue_t* q = nullptr;
+  rocrtst::CreateQueue(*gpu_dev, &q);
+  ASSERT_NE(q, nullptr);
+  set_main_queue(q);
+
+  err = rocrtst::LoadKernelFromObjFile(this, gpu_dev);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  err = rocrtst::InitializeAQLPacket(this, &aql());
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
+  // NOTE: We do NOT allocate src_buffer or dst_buffer
+  // We will pass nullptr to the kernel to trigger a fault
+
+  // Set up Kernel arguments with nullptr for array pointers
+  struct __attribute__((aligned(16))) local_args_t {
+    uint32_t* dstArray;
+    uint32_t* srcArray;
+    uint32_t size;
+    uint32_t pad;
+    uint64_t global_offset_x;
+    uint64_t global_offset_y;
+    uint64_t global_offset_z;
+    uint64_t printf_buffer;
+    uint64_t default_queue;
+    uint64_t completion_action;
+  } local_args;
+
+  // Intentionally set array pointers to nullptr to cause a fault
+  local_args.dstArray = nullptr;
+  local_args.srcArray = nullptr;
+  local_args.size = kNumBufferElements;
+  local_args.global_offset_x = 0;
+  local_args.global_offset_y = 0;
+  local_args.global_offset_z = 0;
+  local_args.printf_buffer = 0;
+  local_args.default_queue = 0;
+  local_args.completion_action = 0;
+
+  err = rocrtst::AllocAndSetKernArgs(this, &local_args, sizeof(local_args));
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  return;
+}
+
+// This wrapper atomically writes the provided header and setup to the
+// provided AQL packet. The provided AQL packet address should be in the
+// queue memory space.
+static inline void AtomicSetPacketHeader(uint16_t header, uint16_t setup,
+                                  hsa_kernel_dispatch_packet_t* queue_packet) {
+  __atomic_store_n(reinterpret_cast<uint32_t*>(queue_packet),
+                   header | (setup << 16), __ATOMIC_RELEASE);
+}
+
+void TestFaultExample::Run(void) {
+  // Compare required profile for this test case with what we're actually
+  // running on
+  if (!rocrtst::CheckProfile(this)) {
+    return;
+  }
+
+  TestBase::Run();
+
+  // Override whatever we need to...
+  aql().workgroup_size_x = kNumBufferElements;
+  aql().grid_size_x = kNumBufferElements;
+
+  hsa_kernel_dispatch_packet_t *queue_aql_packet;
+  uint64_t index;
+
+  if (verbosity() >= VERBOSE_STANDARD) {
+    std::cout << "Dispatching kernel with nullptr arrays - expecting GPU fault..." << std::endl;
+  }
+
+  // This function simply copies the data we've collected so far into our
+  // local AQL packet, except the the setup and header fields.
+  queue_aql_packet = WriteAQLToQueue(this, &index);
+  ASSERT_EQ(queue_aql_packet,
+            reinterpret_cast<hsa_kernel_dispatch_packet_t *>
+                                    (main_queue()->base_address) + index);
+  uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
+
+  aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
+                HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
+  aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
+                HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+
+  ::AtomicSetPacketHeader(aql_header, aql().setup, queue_aql_packet);
+
+  hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
+
+  // Wait on the dispatch signal until the kernel is finished (or faults).
+  // Note: This may trigger a GPU fault/exception
+  while (hsa_signal_wait_scacquire(aql().completion_signal,
+       HSA_SIGNAL_CONDITION_LT, 1, (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) {
+  }
+
+  if (verbosity() >= VERBOSE_STANDARD) {
+    std::cout << "Kernel dispatch completed (fault may have occurred)" << std::endl;
+  }
+
+  hsa_signal_store_screlease(aql().completion_signal, 1);
+
+  // NOTE: We do NOT verify results since we expect a fault
+}
+
+void TestFaultExample::DisplayTestInfo(void) {
+  TestBase::DisplayTestInfo();
+}
+
+void TestFaultExample::DisplayResults(void) const {
+  // Compare required profile for this test case with what we're actually
+  // running on
+  if (!rocrtst::CheckProfile(this)) {
+    return;
+  }
+
+  TestBase::DisplayResults();
+  std::cout << "Test completed. Check for GPU core dump if fault handling is enabled." << std::endl;
+  return;
+}
+
+void TestFaultExample::Close() {
+  // NOTE: We do NOT free src_buffer or dst_buffer since we never allocated them
+
+  // This will close handles opened within rocrtst utility calls and call
+  // hsa_shut_down(), so it should be done after other hsa cleanup
+  TestBase::Close();
+}
+
+#undef RET_IF_HSA_ERR
@@ -0,0 +1,37 @@
+/*
+ * Copyright © Advanced Micro Devices, Inc., or its affiliates.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef ROCRTST_SUITES_FUNCTIONAL_TEST_FAULT_EXAMPLE_H_
+#define ROCRTST_SUITES_FUNCTIONAL_TEST_FAULT_EXAMPLE_H_
+
+#include "common/base_rocr.h"
+#include "hsa/hsa.h"
+#include "suites/test_common/test_base.h"
+
+class TestFaultExample : public TestBase {
+ public:
+  TestFaultExample();
+
+  // @Brief: Destructor for test case of TestFaultExample
+  virtual ~TestFaultExample();
+
+  // @Brief: Setup the environment for measurement
+  virtual void SetUp();
+
+  // @Brief: Core measurement execution
+  virtual void Run();
+
+  // @Brief: Clean up and retrieve the resource
+  virtual void Close();
+
+  // @Brief: Display results
+  virtual void DisplayResults() const;
+
+  // @Brief: Display information about what this test does
+  virtual void DisplayTestInfo(void);
+};
+
+#endif  // ROCRTST_SUITES_FUNCTIONAL_TEST_FAULT_EXAMPLE_H_
@@ -69,6 +69,7 @@
 #include "suites/stress/memory_concurrent_tests.h"
 #include "suites/stress/queue_write_index_concurrent_tests.h"
 #include "suites/test_common/test_case_template.h"
+#include "suites/functional/test_fault_example.h"
 #include "suites/test_common/main.h"
 #include "suites/test_common/test_common.h"
 #include "suites/functional/concurrent_init.h"
@@ -80,6 +81,7 @@
 #include "suites/functional/signal_kernel.h"
 #include "suites/functional/cu_masking.h"
 #include "suites/functional/filter_devices.h"
+#include "suites/functional/gpu_coredump.h"
 #include "amd_smi/amdsmi.h"
 #include "common/common.h"
 #include "suites/functional/counted_queues.h"
@@ -320,6 +322,69 @@ TEST(rocrtstFunc, Memory_Available) {
  );
 }

+TEST(rocrtstFunc, GpuCoreDump_DefaultPattern) {
+  RUN_IF_NOT_EMU_MODE(
+    GpuCoreDumpTest gcd;
+    RunCustomTestProlog(&gcd);
+    gcd.TestDefaultPattern();
+    RunCustomTestEpilog(&gcd);
+  );
+}
+
+TEST(rocrtstFunc, GpuCoreDump_CustomPattern) {
+  RUN_IF_NOT_EMU_MODE(
+    GpuCoreDumpTest gcd;
+    RunCustomTestProlog(&gcd);
+    gcd.TestCustomPattern();
+    RunCustomTestEpilog(&gcd);
+  );
+}
+
+TEST(rocrtstFunc, GpuCoreDump_DisableFlag) {
+  RUN_IF_NOT_EMU_MODE(
+    GpuCoreDumpTest gcd;
+    RunCustomTestProlog(&gcd);
+    gcd.TestDisableFlag();
+    RunCustomTestEpilog(&gcd);
+  );
+}
+
+TEST(rocrtstFunc, GpuCoreDump_PatternSubstitution) {
+  RUN_IF_NOT_EMU_MODE(
+    GpuCoreDumpTest gcd;
+    RunCustomTestProlog(&gcd);
+    gcd.TestPatternSubstitution();
+    RunCustomTestEpilog(&gcd);
+  );
+}
+
+TEST(rocrtstFunc, GpuCoreDump_InvalidPath) {
+  RUN_IF_NOT_EMU_MODE(
+    GpuCoreDumpTest gcd;
+    RunCustomTestProlog(&gcd);
+    gcd.TestInvalidPath();
+    RunCustomTestEpilog(&gcd);
+  );
+}
+
+TEST(rocrtstFunc, GpuCoreDump_ContentIntegrity) {
+  RUN_IF_NOT_EMU_MODE(
+    GpuCoreDumpTest gcd;
+    RunCustomTestProlog(&gcd);
+    gcd.TestCoreDumpContentIntegrity();
+    RunCustomTestEpilog(&gcd);
+  );
+}
+
+TEST(rocrtstFunc, GpuCoreDump_PipePattern) {
+  RUN_IF_NOT_EMU_MODE(
+    GpuCoreDumpTest gcd;
+    RunCustomTestProlog(&gcd);
+    gcd.TestPipePattern();
+    RunCustomTestEpilog(&gcd);
+  );
+}
+

 TEST(rocrtstFunc, Memory_Atomic_Add_Test) {
  RUN_IF_NOT_EMU_MODE(
@@ -548,22 +548,28 @@ hsa_status_t write_core_dump_to_fd(int fd, const SegmentsInfo& segments,
          return (uint32_t)0;
      }
    } (seg.stype);
-    if (size_limit != -1 && (offset + seg.size > size_limit)) {
-      if (show_progress) {
-        printf("Core limit file reached during pipe write\n");
-      }
-      return HSA_STATUS_SUCCESS;
-    }
    phdr.p_offset = alignUp(offset, (uint64_t)1 << phdr.p_align);
    phdrs.push_back(phdr);
    offset += phdr.p_filesz;
  }

  // Write all program headers
-  for (const auto& phdr : phdrs) {
-    if (write(fd, &phdr, sizeof(phdr)) != sizeof(phdr)) {
-      perror("Failed to write program header to pipe");
-      return HSA_STATUS_ERROR;
+  if (is_reg_file) {
+    // For regular files, use pwrite to write at specific offset
+    for (size_t i = 0; i < phdrs.size(); i++) {
+      off_t phdr_offset = sizeof(Elf64_Ehdr) + i * sizeof(Elf64_Phdr);
+      if (pwrite(fd, &phdrs[i], sizeof(Elf64_Phdr), phdr_offset) != sizeof(Elf64_Phdr)) {
+        perror("Failed to write program header");
+        return HSA_STATUS_ERROR;
+      }
+    }
+  } else {
+    // For pipes, use sequential write
+    for (const auto& phdr : phdrs) {
+      if (write(fd, &phdr, sizeof(phdr)) != sizeof(phdr)) {
+        perror("Failed to write program header to pipe");
+        return HSA_STATUS_ERROR;
+      }
    }
  }

@@ -572,6 +578,15 @@ hsa_status_t write_core_dump_to_fd(int fd, const SegmentsInfo& segments,
    const SegmentInfo& seg = segments[idx];
    const Elf64_Phdr& phdr = phdrs[idx];

+    // Check if this segment would exceed size limit
+    if (size_limit != -1 && (phdr.p_offset + phdr.p_filesz > size_limit)) {
+      if (show_progress) {
+        fprintf(stderr, "Core file size limit reached, truncating at segment %zu\n", idx);
+      }
+      // Stop writing segments but return success - we wrote valid headers
+      return HSA_STATUS_SUCCESS;
+    }
+
    if (is_reg_file) {
      int error = posix_fallocate(fd, phdr.p_offset, phdr.p_filesz);
      if (error != 0) {
@@ -588,9 +603,21 @@ hsa_status_t write_core_dump_to_fd(int fd, const SegmentsInfo& segments,
      if (st != HSA_STATUS_SUCCESS) {
        return st;
      }
-      if (write(fd, copy_buffer.get(), curr_chunk) != (ssize_t)curr_chunk) {
-        perror("Failed to write segment data to pipe");
-        return HSA_STATUS_ERROR;
+
+      if (is_reg_file) {
+        // For regular files, use pwrite to write at specific offset
+        if (pwrite(fd, copy_buffer.get(), curr_chunk,
+            phdr.p_offset + phdr.p_filesz - remaining) !=
+                                                        (ssize_t)curr_chunk) {
+          perror("Failed to write segment data");
+          return HSA_STATUS_ERROR;
+        }
+      } else {
+        // For pipes, use sequential write
+        if (write(fd, copy_buffer.get(), curr_chunk) != (ssize_t)curr_chunk) {
+          perror("Failed to write segment data to pipe");
+          return HSA_STATUS_ERROR;
+        }
      }
      remaining -= curr_chunk;
    }