diff --git a/projects/rocr-runtime/rocrtst/suites/functional/gpu_coredump.cc b/projects/rocr-runtime/rocrtst/suites/functional/gpu_coredump.cc new file mode 100644 index 0000000000..1686f4d8f3 --- /dev/null +++ b/projects/rocr-runtime/rocrtst/suites/functional/gpu_coredump.cc @@ -0,0 +1,1362 @@ +/* + * Copyright © Advanced Micro Devices, Inc., or its affiliates. + * + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "suites/functional/gpu_coredump.h" +#include "common/base_rocr_utils.h" +#include "common/common.h" +#include "common/helper_funcs.h" +#include "gtest/gtest.h" +#include "hsa/hsa.h" + +// NT_AMDGPU_CORE_STATE from runtime/hsa-runtime/inc/amd_hsa_elf.h +// AMDGPU snapshots of runtime, agent and queues state for use in core dump +#define NT_AMDGPU_CORE_STATE 33 + +static const uint32_t kNumBufferElements = rocrtst::isEmuModeEnabled() ? 4 : 256; + +// Convert core pattern to glob pattern, substituting known values +// and using wildcards for unknowable values (timestamp, TID) +namespace { +std::string PatternToGlob(const std::string& pattern, pid_t child_pid) { + std::string result; + + // Get values we can know + char hostname[256]; + gethostname(hostname, sizeof(hostname)); + hostname[sizeof(hostname) - 1] = '\0'; + + char exe_path[PATH_MAX]; + std::string exe_name = "unknown"; + ssize_t len = readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1); + if (len > 0) { + exe_path[len] = '\0'; + char* base = basename(exe_path); + if (base) exe_name = base; + } + + // Parse and substitute + for (size_t i = 0; i < pattern.length(); i++) { + if (pattern[i] == '%' && i + 1 < pattern.length()) { + switch (pattern[i + 1]) { + case '%': result += '%'; break; + case 'p': result += std::to_string(child_pid); break; + case 'i': result += '*'; break; // TID - use wildcard + case 'h': result += hostname; break; + case 'e': result += exe_name; break; + case 't': result += '*'; break; // Timestamp - use wildcard + default: break; // Drop unsupported specifiers + } + i++; + } else { + result += pattern[i]; + } + } + + return result; +} + +// Find core dump file matching the glob pattern +std::string FindMatchingCoreDump(const std::string& glob_pattern) { + glob_t glob_result; + memset(&glob_result, 0, sizeof(glob_result)); + + int ret = glob(glob_pattern.c_str(), 0, nullptr, &glob_result); + std::string found_file; + + if (ret == 0 && glob_result.gl_pathc > 0) { + found_file = glob_result.gl_pathv[0]; + } + + globfree(&glob_result); + return found_file; +} +} // anonymous namespace +// RAII helper class for automatic HSA resource cleanup +class HSAResourceGuard { +public: + hsa_queue_t* queue = nullptr; + hsa_executable_t executable = {0}; + void* kernarg_buffer = nullptr; + hsa_signal_t signal = {0}; + int file_fd = -1; + hsa_code_object_reader_t code_obj_rdr = {0}; + + HSAResourceGuard() = default; + ~HSAResourceGuard() { + // Cleanup in reverse order of typical acquisition + if (signal.handle) hsa_signal_destroy(signal); + if (kernarg_buffer) hsa_memory_free(kernarg_buffer); + if (executable.handle) hsa_executable_destroy(executable); + if (code_obj_rdr.handle) hsa_code_object_reader_destroy(code_obj_rdr); + if (file_fd != -1) close(file_fd); + if (queue) hsa_queue_destroy(queue); + hsa_shut_down(); + } + + // Prevent copying + HSAResourceGuard(const HSAResourceGuard&) = delete; + HSAResourceGuard& operator=(const HSAResourceGuard&) = delete; +}; + +GpuCoreDumpTest::GpuCoreDumpTest(void) : TestBase() { + set_num_iteration(1); + set_title("GPU Core Dump Configuration Tests"); + set_description("Tests for configurable GPU core dump functionality including " + "custom patterns, format specifiers, and disable flag."); + + // Save original ulimit + getrlimit(RLIMIT_CORE, &original_rlimit_); + prerequisites_met_ = false; +} + +GpuCoreDumpTest::~GpuCoreDumpTest(void) { + // Restore original ulimit + setrlimit(RLIMIT_CORE, &original_rlimit_); +} + +bool GpuCoreDumpTest::CheckPrerequisites() { + // Check if /tmp is writable + if (access("/tmp", W_OK) != 0) { + std::cout << "SKIP: /tmp is not writable, cannot run core dump tests" << std::endl; + return false; + } + + // Try to set ulimit to 200MB + struct rlimit rlim; + rlim.rlim_cur = 200 * 1024 * 1024; + rlim.rlim_max = 200 * 1024 * 1024; + if (setrlimit(RLIMIT_CORE, &rlim) != 0) { + std::cout << "SKIP: Cannot set ulimit for core dumps: " << strerror(errno) << std::endl; + return false; + } + + // Verify ulimit was actually set + struct rlimit check_rlim; + if (getrlimit(RLIMIT_CORE, &check_rlim) == 0) { + if (check_rlim.rlim_cur < 200 * 1024 * 1024) { + std::cout << "SKIP: ulimit could not be set to required value (got " + << check_rlim.rlim_cur << " bytes)" << std::endl; + return false; + } + } + + return true; +} + +void GpuCoreDumpTest::SetUp(void) { + // Don't call TestBase::SetUp() - we don't want hsa_init() in parent + + // Check prerequisites first + prerequisites_met_ = CheckPrerequisites(); + if (!prerequisites_met_) { + return; // Skip setup if prerequisites not met + } + + // Create temporary directory for test dumps + test_dir_ = "/tmp/rocr_coredump_test_" + std::to_string(getpid()); + + int mkdir_result = mkdir(test_dir_.c_str(), 0755); + if (mkdir_result != 0 && errno != EEXIST) { + std::cerr << "Failed to create test directory " << test_dir_ + << ": " << strerror(errno) << std::endl; + prerequisites_met_ = false; + return; + } + + if (verbosity() > 0) { + std::cout << "Created test directory: " << test_dir_ << std::endl; + } +} + +void GpuCoreDumpTest::Run(void) { + // Nothing to do here - each test method handles its own execution +} + +void GpuCoreDumpTest::DisplayTestInfo(void) { + std::cout << "Test: " << title() << '\n'; + std::cout << description() << '\n'; +} + +void GpuCoreDumpTest::DisplayResults(void) const { + // Nothing to display +} + +void GpuCoreDumpTest::Close() { + // Don't call TestBase::Close() - we never called hsa_init() in parent + + // Clean up test directory (unless debugging) + const char* preserve = getenv("ROCR_TEST_PRESERVE_COREDUMP"); + if (!test_dir_.empty() && !preserve) { + CleanupCoreDumps(test_dir_ + "/*"); + rmdir(test_dir_.c_str()); + } else if (preserve) { + std::cout << " DEBUG: Test directory preserved: " << test_dir_ << '\n'; + } +} + +pid_t GpuCoreDumpTest::RunFaultingKernelInChild() { + pid_t pid = fork(); + + if (pid < 0) { + return -1; // Fork failed + } + + if (pid == 0) { + // Child process - verify environment is inherited + const char* coredump_file = getenv("HSA_COREDUMP_PATTERN"); + const char* disable_flag = getenv("HSA_DISABLE_COREDUMP_ON_EXCEPTION"); + const char* show_progress = getenv("HSA_COREDUMP_SHOW_PROGRESS"); + + fprintf(stderr, "CHILD: HSA_COREDUMP_PATTERN=%s\n", + coredump_file ? coredump_file : "(null)"); + fprintf(stderr, "CHILD: HSA_DISABLE_COREDUMP_ON_EXCEPTION=%s\n", + disable_flag ? disable_flag : "(null)"); + fprintf(stderr, "CHILD: HSA_COREDUMP_SHOW_PROGRESS=%s\n", + show_progress ? show_progress : "(null)"); + + // Child process - do ALL HSA work here + hsa_status_t err; + + // RAII guard will cleanup all resources on exit + HSAResourceGuard resources; + + // Initialize HSA + err = hsa_init(); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "CHILD: hsa_init failed with error %d\n", err); + _exit(1); + } + fprintf(stderr, "CHILD: HSA initialized successfully\n"); + + // Find agents + hsa_agent_t cpu_agent = {0}; + hsa_agent_t gpu_agent = {0}; + + err = hsa_iterate_agents(rocrtst::FindCPUDevice, &cpu_agent); + // ProcessIterateError: INFO_BREAK -> SUCCESS, SUCCESS -> ERROR + if (err == HSA_STATUS_INFO_BREAK) { + err = HSA_STATUS_SUCCESS; + } else if (err == HSA_STATUS_SUCCESS) { + err = HSA_STATUS_ERROR; + } + if (err != HSA_STATUS_SUCCESS || cpu_agent.handle == 0) { + fprintf(stderr, "CHILD: Failed to find CPU agent\n"); + _exit(1); + } + + err = hsa_iterate_agents(rocrtst::FindGPUDevice, &gpu_agent); + if (err == HSA_STATUS_INFO_BREAK) { + err = HSA_STATUS_SUCCESS; + } else if (err == HSA_STATUS_SUCCESS) { + err = HSA_STATUS_ERROR; + } + if (err != HSA_STATUS_SUCCESS || gpu_agent.handle == 0) { + fprintf(stderr, "CHILD: Failed to find GPU agent\n"); + _exit(1); + } + + // Get profile to determine which pool finder to use + hsa_profile_t profile; + err = hsa_agent_get_info(gpu_agent, HSA_AGENT_INFO_PROFILE, &profile); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "CHILD: Failed to get GPU agent profile\n"); + _exit(1); + } + + // Find kernarg pool only; we don't need cpu_pool since + // we're not allocating buffers + hsa_amd_memory_pool_t kernarg_pool; + + if (profile == HSA_PROFILE_FULL) { + // APU - use FindAPUStandardPool + err = hsa_amd_agent_iterate_memory_pools(cpu_agent, + rocrtst::FindAPUStandardPool, + &kernarg_pool); + } else { + // Discrete GPU - use FindKernArgPool + err = hsa_amd_agent_iterate_memory_pools(cpu_agent, + rocrtst::FindKernArgPool, + &kernarg_pool); + } + + if (err == HSA_STATUS_INFO_BREAK) { + err = HSA_STATUS_SUCCESS; + } else if (err == HSA_STATUS_SUCCESS) { + err = HSA_STATUS_ERROR; + } + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "CHILD: Failed to find kernarg memory pool\n"); + _exit(1); + } + + // Create queue + uint32_t queue_size = 0; + err = hsa_agent_get_info(gpu_agent, + HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "CHILD: Failed to get GPU agent max queue size\n"); + _exit(1); + } + + err = hsa_queue_create(gpu_agent, queue_size, HSA_QUEUE_TYPE_MULTI, + nullptr, nullptr, 0, 0, &resources.queue); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "CHILD: Failed to create GPU queue\n"); + _exit(1); + } + + // Load kernel from file (no need to allocate src/dst buffers + // - we pass nullptr) + std::string kernel_path = rocrtst::LocateKernelFile( + "test_case_template_kernels.hsaco", gpu_agent); + if (kernel_path.empty()) { + fprintf(stderr, "CHILD: Failed to locate kernel file\n"); + _exit(1); + } + resources.file_fd = open(kernel_path.c_str(), O_RDONLY); + if (resources.file_fd == -1) { + fprintf(stderr, "CHILD: Failed to open kernel file %s: %s\n", + kernel_path.c_str(), strerror(errno)); + _exit(1); + } + fprintf(stderr, "CHILD: Kernel file opened: %s\n", + kernel_path.c_str()); + + err = hsa_code_object_reader_create_from_file(resources.file_fd, + &resources.code_obj_rdr); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "CHILD: Failed to create code object reader\n"); + _exit(1); + } + + err = hsa_executable_create_alt(HSA_PROFILE_FULL, + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + nullptr, &resources.executable); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "CHILD: Failed to create executable\n"); + _exit(1); + } + + err = hsa_executable_load_agent_code_object(resources.executable, + gpu_agent, resources.code_obj_rdr, nullptr, nullptr); + // Can destroy reader now + hsa_code_object_reader_destroy(resources.code_obj_rdr); + resources.code_obj_rdr = {0}; + close(resources.file_fd); + resources.file_fd = -1; + + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "CHILD: Failed to load code object into executable\n"); + _exit(1); + } + + err = hsa_executable_freeze(resources.executable, nullptr); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "CHILD: Failed to freeze executable\n"); + _exit(1); + } + // Get kernel symbol + hsa_executable_symbol_t symbol; + err = hsa_executable_get_symbol_by_name(resources.executable, + "square.kd", &gpu_agent, &symbol); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "CHILD: Failed to get kernel symbol\n"); + _exit(1); + } + + uint64_t kernel_object = 0; + err = hsa_executable_symbol_get_info(symbol, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &kernel_object); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "CHILD: Failed to get kernel object\n"); + _exit(1); + } + + // Allocate kernel arguments with nullptr arrays to cause fault + struct __attribute__((aligned(16))) kernel_args_t { + uint32_t* dstArray; + uint32_t* srcArray; + uint32_t size; + uint32_t pad; + uint64_t global_offset_x; + uint64_t global_offset_y; + uint64_t global_offset_z; + uint64_t printf_buffer; + uint64_t default_queue; + uint64_t completion_action; + } kernel_args; + + // Intentionally set to nullptr to cause a fault + kernel_args.dstArray = nullptr; + kernel_args.srcArray = nullptr; + kernel_args.size = kNumBufferElements; + kernel_args.pad = 0; + kernel_args.global_offset_x = 0; + kernel_args.global_offset_y = 0; + kernel_args.global_offset_z = 0; + kernel_args.printf_buffer = 0; + kernel_args.default_queue = 0; + kernel_args.completion_action = 0; + + err = hsa_amd_memory_pool_allocate(kernarg_pool, sizeof(kernel_args), + 0, &resources.kernarg_buffer); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "CHILD: Failed to allocate kernarg buffer\n"); + _exit(1); + } + + memcpy(resources.kernarg_buffer, &kernel_args, sizeof(kernel_args)); + + // Create completion signal + err = hsa_signal_create(1, 0, nullptr, &resources.signal); + if (err != HSA_STATUS_SUCCESS) { + fprintf(stderr, "CHILD: Failed to create completion signal\n"); + _exit(1); + } + + // Create and dispatch AQL packet + hsa_kernel_dispatch_packet_t aql; + memset(&aql, 0, sizeof(aql)); + + aql.header = 0; + aql.setup = 1; + aql.workgroup_size_x = kNumBufferElements; + aql.workgroup_size_y = 1; + aql.workgroup_size_z = 1; + aql.grid_size_x = kNumBufferElements; + aql.grid_size_y = 1; + aql.grid_size_z = 1; + aql.private_segment_size = 0; + aql.group_segment_size = 0; + aql.kernel_object = kernel_object; + aql.kernarg_address = resources.kernarg_buffer; + aql.completion_signal = resources.signal; + + const uint32_t queue_mask = resources.queue->size - 1; + uint64_t index = hsa_queue_load_write_index_relaxed(resources.queue); + hsa_queue_store_write_index_relaxed(resources.queue, index + 1); + + // Write packet to queue + hsa_kernel_dispatch_packet_t* queue_packet = + &(reinterpret_cast( + resources.queue->base_address))[index & queue_mask]; + + queue_packet->workgroup_size_x = aql.workgroup_size_x; + queue_packet->workgroup_size_y = aql.workgroup_size_y; + queue_packet->workgroup_size_z = aql.workgroup_size_z; + queue_packet->grid_size_x = aql.grid_size_x; + queue_packet->grid_size_y = aql.grid_size_y; + queue_packet->grid_size_z = aql.grid_size_z; + queue_packet->private_segment_size = aql.private_segment_size; + queue_packet->group_segment_size = aql.group_segment_size; + queue_packet->kernel_object = aql.kernel_object; + queue_packet->kernarg_address = aql.kernarg_address; + queue_packet->completion_signal = aql.completion_signal; + + uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH; + aql_header |= + HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + aql_header |= + HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + + __atomic_store_n(reinterpret_cast(queue_packet), + aql_header | (aql.setup << 16), __ATOMIC_RELEASE); + + // Ring doorbell + hsa_signal_store_screlease(resources.queue->doorbell_signal, index); + fprintf(stderr, "CHILD: Kernel dispatched, waiting for completion...\n"); + + // Wait for completion (or fault) + hsa_signal_wait_scacquire(resources.signal, HSA_SIGNAL_CONDITION_LT, 1, + UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + + // Should not reach here if fault occurs - destructor will cleanup + fprintf(stderr, "CHILD: Kernel completed without fault! This should not happen with nullptr access.\n"); + _exit(0); + } + + // Parent process - wait for child with timeout + int status; + int timeout_ms = 10000; // 10 seconds + int elapsed = 0; + + while (elapsed < timeout_ms) { + pid_t result = waitpid(pid, &status, WNOHANG); + if (result == pid) { + // Child finished - check if it exited normally or faulted + if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { + // Child exited normally without fault + return -2; + } + // Child crashed/faulted (expected) + usleep(100000); // 100ms for core dump to finish writing + return pid; + } else if (result < 0) { + // Error + return -1; + } + // Still running, wait a bit + usleep(10000); // 10ms + elapsed += 10; + } + + // Timeout - kill the child + if (verbosity() > 0) { + std::cout << " Child process timeout, killing...\n"; + } + kill(pid, SIGKILL); + waitpid(pid, &status, 0); + usleep(100000); // Give time for core dump + return pid; +} + +bool GpuCoreDumpTest::VerifyCoreDumpFile(const std::string& filename) { + std::error_code ec; + if (!std::filesystem::exists(filename, ec)) { + if (verbosity() > 0) { + if (ec) { + std::cout << " Error checking file " << filename << ": " << ec.message() << '\n'; + } else { + std::cout << " Core dump file not found: " << filename << '\n'; + } + } + return false; + } + + if (access(filename.c_str(), R_OK) != 0) { + if (verbosity() > 0) { + std::cout << " Core dump file not readable: " << filename << '\n'; + } + return false; + } + + if (!IsValidGPUCoreDump(filename)) { + if (verbosity() > 0) { + std::cout << " File is not a valid GPU core dump: " << filename << '\n'; + } + return false; + } + + if (verbosity() > 0) { + struct stat st; + stat(filename.c_str(), &st); + std::cout << " Core dump verified: " << filename + << " (size: " << st.st_size << " bytes)\n"; + } + + return true; +} + +bool GpuCoreDumpTest::IsValidGPUCoreDump(const std::string& filename) { + int fd = open(filename.c_str(), O_RDONLY); + if (fd < 0) { + return false; + } + + Elf64_Ehdr ehdr; + if (read(fd, &ehdr, sizeof(ehdr)) != sizeof(ehdr)) { + close(fd); + return false; + } + + close(fd); + + // Verify ELF magic number + if (ehdr.e_ident[EI_MAG0] != ELFMAG0 || + ehdr.e_ident[EI_MAG1] != ELFMAG1 || + ehdr.e_ident[EI_MAG2] != ELFMAG2 || + ehdr.e_ident[EI_MAG3] != ELFMAG3) { + return false; + } + + if (ehdr.e_ident[EI_CLASS] != ELFCLASS64) { + return false; + } + + if (ehdr.e_type != ET_CORE) { + return false; + } + + // EM_AMDGPU = 224 + if (ehdr.e_machine != 224) { + return false; + } + + return true; +} + +void GpuCoreDumpTest::CleanupCoreDumps(const std::string& pattern) { + glob_t glob_result; + memset(&glob_result, 0, sizeof(glob_result)); + + int ret = glob(pattern.c_str(), GLOB_TILDE, nullptr, &glob_result); + if (ret == 0) { + for (size_t i = 0; i < glob_result.gl_pathc; ++i) { + unlink(glob_result.gl_pathv[i]); + } + } + globfree(&glob_result); +} + +void GpuCoreDumpTest::TestDefaultPattern(void) { + if (!prerequisites_met_) { + std::cout << "SKIPPED: Prerequisites not met for GPU core dump tests" << std::endl; + return; + } + + if (verbosity() > 0) { + std::cout << + " Testing default pattern (kernel core_pattern + .gpu suffix)...\n"; + } + + // Read kernel core pattern + std::ifstream pattern_file("/proc/sys/kernel/core_pattern"); + std::string kernel_pattern; + if (pattern_file.is_open()) { + std::getline(pattern_file, kernel_pattern); + } + + // Unset HSA_COREDUMP_PATTERN to use default + unsetenv("HSA_COREDUMP_PATTERN"); + unsetenv("HSA_DISABLE_COREDUMP_ON_EXCEPTION"); + setenv("HSA_COREDUMP_SHOW_PROGRESS", "1", 1); + + std::string expected; + + if (kernel_pattern.empty()) { + expected = "gpucore.%p.gpu"; + } else if (kernel_pattern[0] == '|') { + if (verbosity() > 0) { + std::cout << + " Kernel uses pipe pattern - testing graceful fault handling\n"; + } + + pid_t child_pid = RunFaultingKernelInChild(); + if (child_pid < 0) { + FAIL() << "Failed to run test in child process"; + return; + } + + if (verbosity() > 0) { + std::cout << + " GPU fault handled successfully (pipe pattern in use)\n"; + } + return; + } else { + expected = kernel_pattern; + // Replace /proc prefix with /tmp for testing (proc paths become invalid after child exits) + // Only replace if pattern STARTS with /proc (position 0) + if (expected.find("/proc") == 0) { + // Pattern starts with /proc + size_t cwd_pos = expected.find("/cwd"); + if (cwd_pos != std::string::npos) { + // Replace /proc/%P/cwd (or /proc/%p/cwd) with /tmp + expected.replace(0, cwd_pos + 4, "/tmp"); + } else { + // Just replace /proc with /tmp + expected.replace(0, 5, "/tmp"); + } + // Override kernel pattern for this test + setenv("HSA_COREDUMP_PATTERN", expected.c_str(), 1); + if (verbosity() > 0) { + std::cout << " Kernel pattern starts with /proc, overriding with /tmp for test\n"; + } + // Don't add .gpu suffix - runtime won't add it for custom patterns + } else { + // For non-/proc patterns, runtime adds .gpu suffix + expected += ".gpu"; + } + } + + // Run test in child and get PID + pid_t child_pid = RunFaultingKernelInChild(); + if (child_pid < 0) { + FAIL() << "Failed to run test in child process"; + return; + } + + // Convert pattern to glob pattern (handles %t and %i with wildcards) + std::string glob_pattern = PatternToGlob(expected, child_pid); + std::string actual_file = FindMatchingCoreDump(glob_pattern); + + if (!actual_file.empty()) { + bool success = VerifyCoreDumpFile(actual_file); + EXPECT_TRUE(success); + if (success) { + unlink(actual_file.c_str()); + } + } else { + FAIL() << "No core dump found matching pattern: " << glob_pattern; + } + + // Cleanup - unset if we overrode it for /proc patterns + unsetenv("HSA_COREDUMP_PATTERN"); +} + +void GpuCoreDumpTest::TestCustomPattern(void) { + if (!prerequisites_met_) { + std::cout << "SKIPPED: Prerequisites not met for GPU core dump tests" << std::endl; + return; + } + + if (verbosity() > 0) { + std::cout << " Testing custom pattern (HSA_COREDUMP_PATTERN)...\n"; + } + + std::string pattern = test_dir_ + "/custom_gpu_core.%p"; + setenv("HSA_COREDUMP_PATTERN", pattern.c_str(), 1); + setenv("HSA_COREDUMP_SHOW_PROGRESS", "1", 1); + unsetenv("HSA_DISABLE_COREDUMP_ON_EXCEPTION"); + + pid_t child_pid = RunFaultingKernelInChild(); + if (child_pid == -2) { + std::cout << "NOTE: Child completed without GPU fault - " + << "environment may not support fault triggering" + << std::endl; + unsetenv("HSA_COREDUMP_PATTERN"); + return; + } + if (child_pid < 0) { + FAIL() << "Failed to run test in child process"; + unsetenv("HSA_COREDUMP_PATTERN"); + return; + } + + // Use glob to find file + std::string glob_pattern = PatternToGlob(pattern, child_pid); + std::string actual_file = FindMatchingCoreDump(glob_pattern); + + if (!actual_file.empty()) { + bool success = VerifyCoreDumpFile(actual_file); + EXPECT_TRUE(success); + if (success) { + unlink(actual_file.c_str()); + } + } else { + FAIL() << "No core dump found matching pattern: " << glob_pattern; + } + + unsetenv("HSA_COREDUMP_PATTERN"); +} + +bool GpuCoreDumpTest::ValidateNoteSegment(int fd, uint64_t offset, + uint64_t size) { + if (verbosity() > 0) { + std::cout << " Validating PT_NOTE segment at offset " << offset << "\n"; + } + + // Read note header + if (lseek(fd, offset, SEEK_SET) == -1) { + if (verbosity() > 0) { + std::cout << " Failed to seek to note offset\n"; + } + return false; + } + + uint32_t namesz, descsz, type; + if (read(fd, &namesz, 4) != 4 || + read(fd, &descsz, 4) != 4 || + read(fd, &type, 4) != 4) { + if (verbosity() > 0) { + std::cout << " Failed to read note header\n"; + } + return false; + } + + // Validate note header + if (namesz != 7) { // "AMDGPU\0" + if (verbosity() > 0) { + std::cout << " Invalid namesz: " << namesz << " (expected 7)\n"; + } + return false; + } + + if (type != NT_AMDGPU_CORE_STATE) { + if (verbosity() > 0) { + std::cout << " Invalid note type: " << type + << " (expected " << NT_AMDGPU_CORE_STATE << ")\n"; + } + return false; + } + + // Read and validate name + char name[8]; + if (read(fd, name, 8) != 8) { + if (verbosity() > 0) { + std::cout << " Failed to read note name\n"; + } + return false; + } + + if (strncmp(name, "AMDGPU", 6) != 0) { + if (verbosity() > 0) { + std::cout << " Invalid note name: " << std::string(name, 6) << '\n'; + } + return false; + } + + // Read and validate descriptor structure + uint64_t note_version; + if (read(fd, ¬e_version, 8) != 8) { + if (verbosity() > 0) { + std::cout << " Failed to read note version\n"; + } + return false; + } + + // Note version - currently hardcoded to 1 in amd_core_dump.cpp + // This version tracks the PT_NOTE descriptor format and would only change + // if the snapshot data structure (runtime/agent/queue info) is modified + if (note_version != 1) { + if (verbosity() > 0) { + std::cout << " Invalid note version: " << + note_version << " (expected 1)\n"; + } + return false; + } + + uint32_t version_major, version_minor; + if (read(fd, &version_major, 4) != 4 || read(fd, &version_minor, 4) != 4) { + if (verbosity() > 0) { + std::cout << " Failed to read KMT version\n"; + } + return false; + } + + uint64_t runtime_info_size; + if (read(fd, &runtime_info_size, 8) != 8) { + if (verbosity() > 0) { + std::cout << " Failed to read runtime_info_size\n"; + } + return false; + } + + if (runtime_info_size == 0) { + if (verbosity() > 0) { + std::cout << " Invalid runtime_info_size: 0\n"; + } + return false; + } + + uint32_t n_agents, agent_entry_size; + if (read(fd, &n_agents, 4) != 4 || read(fd, &agent_entry_size, 4) != 4) { + if (verbosity() > 0) { + std::cout << " Failed to read agent info\n"; + } + return false; + } + + if (n_agents == 0) { + if (verbosity() > 0) { + std::cout << " Invalid n_agents: 0 (expected at least 1 GPU)\n"; + } + return false; + } + + if (agent_entry_size == 0) { + if (verbosity() > 0) { + std::cout << " Invalid agent_entry_size: 0\n"; + } + return false; + } + + if (verbosity() > 0) { + std::cout << " PT_NOTE validation passed:\n"; + std::cout << " Note version: " << note_version << '\n'; + std::cout << " KMT version: " << version_major << "." << + version_minor << '\n'; + std::cout << " Runtime info size: " << runtime_info_size << '\n'; + std::cout << " Number of agents: " << n_agents << '\n'; + std::cout << " Agent entry size: " << agent_entry_size << '\n'; + } + + return true; +} + +bool GpuCoreDumpTest::ValidateLoadSegment(int fd, uint64_t file_offset, + uint64_t vaddr, uint64_t size, + pid_t child_pid) { + if (verbosity() > 0) { + std::cout << " Validating PT_LOAD segment:\n"; + std::cout << " File offset: 0x" << std::hex << file_offset << + std::dec << '\n'; + std::cout << " Virtual addr: 0x" << std::hex << vaddr << + std::dec << '\n'; + std::cout << " Size: " << size << " bytes\n"; + } + + // Sample size - check first 4KB or segment size, whichever is smaller + size_t sample_size = std::min(size, (uint64_t)4096); + std::vector data(sample_size); + + // Read from core dump file at specified offset + if (lseek(fd, file_offset, SEEK_SET) == -1) { + if (verbosity() > 0) { + std::cout << " Failed to seek to file offset\n"; + } + return false; + } + + ssize_t bytes_read = read(fd, data.data(), sample_size); + if (bytes_read != (ssize_t)sample_size) { + if (verbosity() > 0) { + std::cout << " Failed to read from core file (read " << + bytes_read << " bytes)\n"; + } + return false; + } + + // Validate that data was actually written (not all zeros + // from posix_fallocate) + bool all_zeros = std::all_of(data.begin(), data.end(), + [](uint8_t b) { return b == 0; }); + if (all_zeros) { + if (verbosity() > 0) { + std::cout << " FAIL: Segment data is all zeros (not written)\n"; + } + return false; + } + + // Check for variation in the data + bool has_variation = false; + for (size_t i = 1; i < data.size(); i++) { + if (data[i] != data[0]) { + has_variation = true; + break; + } + } + + // All 0xFF is a valid pattern for uninitialized GPU memory + bool all_ff = std::all_of(data.begin(), data.end(), + [](uint8_t b) { return b == 0xFF; }); + + // Pass if: has variation OR all 0xFF (valid GPU memory patterns) + // Fail if: no variation AND not all 0xFF (possible error in writing + // core dump data) + if (!has_variation && !all_ff) { + if (verbosity() > 0) { + std::cout << + " FAIL: Segment data has suspicious repeating pattern: 0x" << + std::hex << (int)data[0] << std::dec << " repeated\n"; + } + return false; + } + + if (verbosity() > 0) { + if (all_ff) { + std::cout << " PT_LOAD validation passed: Segment contains 0xFF " + "pattern (valid uninitialized GPU memory)\n"; + } else { + std::cout << " PT_LOAD validation passed: Segment contains non-zero" + " data with variation\n"; + } + } + + return true; +} + +void GpuCoreDumpTest::TestCoreDumpContentIntegrity(void) { + if (!prerequisites_met_) { + std::cout << "SKIPPED: Prerequisites not met for GPU core dump tests" << std::endl; + return; + } + + if (verbosity() > 0) { + std::cout << + " Testing core dump content integrity (headers and VRAM data)...\n"; + } + + std::string pattern = test_dir_ + "/integrity_test.%p"; + setenv("HSA_COREDUMP_PATTERN", pattern.c_str(), 1); + setenv("HSA_COREDUMP_SHOW_PROGRESS", "1", 1); + unsetenv("HSA_DISABLE_COREDUMP_ON_EXCEPTION"); + + pid_t child_pid = RunFaultingKernelInChild(); + if (child_pid == -2) { + std::cout << "NOTE: Child completed without GPU fault - " + << "environment may not support fault triggering" + << std::endl; + unsetenv("HSA_COREDUMP_PATTERN"); + return; + } + if (child_pid < 0) { + FAIL() << "Failed to run test in child process"; + unsetenv("HSA_COREDUMP_PATTERN"); + return; + } + + // Find the core dump file + std::string glob_pattern = PatternToGlob(pattern, child_pid); + std::string core_file = FindMatchingCoreDump(glob_pattern); + + if (core_file.empty()) { + FAIL() << "No core dump found matching pattern: " << glob_pattern; + unsetenv("HSA_COREDUMP_PATTERN"); + return; + } + + if (verbosity() > 0) { + std::cout << " Found core dump: " << core_file << '\n'; + } + + // Open and parse the core dump + int fd = open(core_file.c_str(), O_RDONLY); + if (fd == -1) { + FAIL() << "Failed to open core dump file: " << core_file; + unsetenv("HSA_COREDUMP_PATTERN"); + return; + } + + // Read ELF header + Elf64_Ehdr ehdr; + if (read(fd, &ehdr, sizeof(ehdr)) != sizeof(ehdr)) { + close(fd); + FAIL() << "Failed to read ELF header"; + unsetenv("HSA_COREDUMP_PATTERN"); + return; + } + + // Verify it's a valid ELF core dump + EXPECT_EQ(ehdr.e_ident[EI_MAG0], ELFMAG0); + EXPECT_EQ(ehdr.e_ident[EI_MAG1], ELFMAG1); + EXPECT_EQ(ehdr.e_ident[EI_MAG2], ELFMAG2); + EXPECT_EQ(ehdr.e_ident[EI_MAG3], ELFMAG3); + EXPECT_EQ(ehdr.e_type, ET_CORE); + + // Read program headers + std::vector phdrs(ehdr.e_phnum); + if (pread(fd, phdrs.data(), ehdr.e_phnum * sizeof(Elf64_Phdr), ehdr.e_phoff) + != (ssize_t)(ehdr.e_phnum * sizeof(Elf64_Phdr))) { + close(fd); + FAIL() << "Failed to read program headers"; + unsetenv("HSA_COREDUMP_PATTERN"); + return; + } + + if (verbosity() > 0) { + std::cout << " Found " << ehdr.e_phnum << " program headers\n"; + } + + bool found_note = false; + bool found_load = false; + int note_failures = 0; + int segments_with_data = 0; // Count segments with actual non-zero data + + // Validate each segment + for (const auto& phdr : phdrs) { + if (phdr.p_type == PT_NOTE) { + found_note = true; + if (!ValidateNoteSegment(fd, phdr.p_offset, phdr.p_filesz)) { + note_failures++; + } + } else if (phdr.p_type == PT_LOAD) { + found_load = true; + // ValidateLoadSegment returns true if segment has non-zero data + // Some segments can legitimately be all zeros + // (unmapped/unused GPU memory) + if (ValidateLoadSegment(fd, phdr.p_offset, phdr.p_vaddr, + phdr.p_filesz, child_pid)) { + segments_with_data++; + } + } + } + + close(fd); + + // Report results + EXPECT_TRUE(found_note) << "No PT_NOTE segment found in core dump"; + EXPECT_TRUE(found_load) << "No PT_LOAD segments found in core dump"; + EXPECT_EQ(note_failures, 0) << "PT_NOTE segment validation failed"; + EXPECT_GT(segments_with_data, 0) << "No PT_LOAD segments contain non-zero" + " data - core dump may not be working correctly"; + + if (verbosity() > 0) { + std::cout << " Validation summary: " << segments_with_data << + " PT_LOAD segments with non-zero data\n"; + if (note_failures == 0 && segments_with_data > 0) { + std::cout << " Core dump validation passed!\n"; + } + } + + unsetenv("HSA_COREDUMP_PATTERN"); +} + +void GpuCoreDumpTest::TestDisableFlag(void) { + if (!prerequisites_met_) { + std::cout << "SKIPPED: Prerequisites not met for GPU core dump tests" << std::endl; + return; + } + + if (verbosity() > 0) { + std::cout << + " Testing disable flag (HSA_DISABLE_COREDUMP_ON_EXCEPTION=1)...\n"; + } + + std::string pattern = test_dir_ + "/should_not_exist.%p"; + setenv("HSA_DISABLE_COREDUMP_ON_EXCEPTION", "1", 1); + setenv("HSA_COREDUMP_PATTERN", pattern.c_str(), 1); + + pid_t child_pid = RunFaultingKernelInChild(); + if (child_pid < 0) { + FAIL() << "Failed to run test in child process"; + unsetenv("HSA_DISABLE_COREDUMP_ON_EXCEPTION"); + unsetenv("HSA_COREDUMP_PATTERN"); + return; + } + + // Use glob to check if any file was created + std::string glob_pattern = PatternToGlob(pattern, child_pid); + std::string actual_file = FindMatchingCoreDump(glob_pattern); + + EXPECT_TRUE(actual_file.empty()) << + "Core dump should not have been created when disabled"; + + if (verbosity() > 0) { + if (actual_file.empty()) { + std::cout << " Correctly prevented core dump creation\n"; + } + } + + if (!actual_file.empty()) { + unlink(actual_file.c_str()); + } + + unsetenv("HSA_DISABLE_COREDUMP_ON_EXCEPTION"); + unsetenv("HSA_COREDUMP_PATTERN"); +} + +void GpuCoreDumpTest::TestPatternSubstitution(void) { + if (!prerequisites_met_) { + std::cout << "SKIPPED: Prerequisites not met for GPU core dump tests" << std::endl; + return; + } + + if (verbosity() > 0) { + std::cout << " Testing pattern substitution (%p, %e, %t)...\n"; + } + + // Test pattern with multiple specifiers including timestamp + std::string pattern = test_dir_ + "/core.%p.%e.%t.dump"; + setenv("HSA_COREDUMP_PATTERN", pattern.c_str(), 1); + unsetenv("HSA_DISABLE_COREDUMP_ON_EXCEPTION"); + + pid_t child_pid = RunFaultingKernelInChild(); + if (child_pid == -2) { + std::cout << "NOTE: Child completed without GPU fault - " + << "environment may not support fault triggering" + << std::endl; + unsetenv("HSA_COREDUMP_PATTERN"); + return; + } + if (child_pid < 0) { + FAIL() << "Failed to run test in child process"; + unsetenv("HSA_COREDUMP_PATTERN"); + return; + } + + // Use glob to find file (handles %t wildcard) + std::string glob_pattern = PatternToGlob(pattern, child_pid); + std::string actual_file = FindMatchingCoreDump(glob_pattern); + + if (!actual_file.empty()) { + bool success = VerifyCoreDumpFile(actual_file); + EXPECT_TRUE(success); + if (success) { + unlink(actual_file.c_str()); + } + } else { + FAIL() << "No core dump found matching pattern: " << glob_pattern; + } + + unsetenv("HSA_COREDUMP_PATTERN"); +} + +void GpuCoreDumpTest::TestInvalidPath(void) { + if (!prerequisites_met_) { + std::cout << "SKIPPED: Prerequisites not met for GPU core dump tests" << std::endl; + return; + } + + if (verbosity() > 0) { + std::cout << " Testing invalid path handling...\n"; + } + + std::string pattern = "/nonexistent_dir_12345/core.%p"; + setenv("HSA_COREDUMP_PATTERN", pattern.c_str(), 1); + unsetenv("HSA_DISABLE_COREDUMP_ON_EXCEPTION"); + + pid_t child_pid = RunFaultingKernelInChild(); + if (child_pid < 0) { + FAIL() << "Failed to run test in child process"; + unsetenv("HSA_COREDUMP_PATTERN"); + return; + } + + // Use glob to check if any file was created + std::string glob_pattern = PatternToGlob(pattern, child_pid); + std::string actual_file = FindMatchingCoreDump(glob_pattern); + + EXPECT_TRUE(actual_file.empty()) << + "Core dump should not be created with invalid path"; + + if (verbosity() > 0) { + if (actual_file.empty()) { + std::cout << " Correctly handled invalid path\n"; + } + } + + unsetenv("HSA_COREDUMP_PATTERN"); +} + +void GpuCoreDumpTest::TestPipePattern(void) { + if (!prerequisites_met_) { + std::cout << "SKIPPED: Prerequisites not met for GPU core dump tests" << std::endl; + return; + } + + if (verbosity() > 0) { + std::cout << " Testing pipe pattern (using tee to capture output)...\n"; + } + + std::string output_file = test_dir_ + "/pipe_output.core"; + std::string pattern = "|sh -c \"cat > " + output_file + "\""; + + setenv("HSA_COREDUMP_PATTERN", pattern.c_str(), 1); + setenv("HSA_COREDUMP_SHOW_PROGRESS", "1", 1); + unsetenv("HSA_DISABLE_COREDUMP_ON_EXCEPTION"); + + pid_t child_pid = RunFaultingKernelInChild(); + if (child_pid == -2) { + std::cout << "NOTE: Child completed without GPU fault - " + << "environment may not support fault triggering" + << std::endl; + unsetenv("HSA_COREDUMP_PATTERN"); + return; + } + if (child_pid < 0) { + FAIL() << "Failed to run test in child process"; + unsetenv("HSA_COREDUMP_PATTERN"); + return; + } + + // Give cat time to finish writing + usleep(500000); // 500ms + + // Verify the output file exists + std::error_code ec; + if (!std::filesystem::exists(output_file, ec)) { + if (ec) { + FAIL() << "Error checking pipe output file: " << ec.message(); + } else { + FAIL() << "Pipe output file not created: " << output_file; + } + unsetenv("HSA_COREDUMP_PATTERN"); + return; + } + + if (verbosity() > 0) { + struct stat st; + stat(output_file.c_str(), &st); + std::cout << " Pipe output file created: " << output_file + << " (size: " << st.st_size << " bytes)\n"; + } + + // Validate the piped core dump has same structure as file-based dump + int fd = open(output_file.c_str(), O_RDONLY); + if (fd == -1) { + FAIL() << "Failed to open pipe output file"; + unsetenv("HSA_COREDUMP_PATTERN"); + return; + } + + // Read and validate ELF header + Elf64_Ehdr ehdr; + if (read(fd, &ehdr, sizeof(ehdr)) != sizeof(ehdr)) { + close(fd); + FAIL() << "Failed to read ELF header from pipe output"; + unsetenv("HSA_COREDUMP_PATTERN"); + return; + } + + EXPECT_EQ(ehdr.e_ident[EI_MAG0], ELFMAG0); + EXPECT_EQ(ehdr.e_ident[EI_MAG1], ELFMAG1); + EXPECT_EQ(ehdr.e_ident[EI_MAG2], ELFMAG2); + EXPECT_EQ(ehdr.e_ident[EI_MAG3], ELFMAG3); + EXPECT_EQ(ehdr.e_type, ET_CORE); + + // Read program headers + std::vector phdrs(ehdr.e_phnum); + if (pread(fd, phdrs.data(), ehdr.e_phnum * sizeof(Elf64_Phdr), ehdr.e_phoff) + != (ssize_t)(ehdr.e_phnum * sizeof(Elf64_Phdr))) { + close(fd); + FAIL() << "Failed to read program headers from pipe output"; + unsetenv("HSA_COREDUMP_PATTERN"); + return; + } + + // Validate we have PT_NOTE and PT_LOAD segments + bool found_note = false; + bool found_load = false; + int note_failures = 0; + int segments_with_data = 0; + + for (const auto& phdr : phdrs) { + if (phdr.p_type == PT_NOTE) { + found_note = true; + if (!ValidateNoteSegment(fd, phdr.p_offset, phdr.p_filesz)) { + note_failures++; + } + } else if (phdr.p_type == PT_LOAD) { + found_load = true; + if (ValidateLoadSegment(fd, phdr.p_offset, phdr.p_vaddr, + phdr.p_filesz, child_pid)) { + segments_with_data++; + } + } + } + + close(fd); + + // Validate results + EXPECT_TRUE(found_note) << "No PT_NOTE segment in pipe output"; + EXPECT_TRUE(found_load) << "No PT_LOAD segments in pipe output"; + EXPECT_EQ(note_failures, 0) << "PT_NOTE validation failed in pipe output"; + EXPECT_GT(segments_with_data, 0) << + "No PT_LOAD segments contain data in pipe output"; + + if (verbosity() > 0) { + std::cout << " Pipe pattern validation: " << segments_with_data + << " PT_LOAD segments with non-zero data\n"; + if (note_failures == 0 && segments_with_data > 0) { + std::cout << " Pipe pattern test passed!\n"; + } + } + + // Cleanup + unlink(output_file.c_str()); + unsetenv("HSA_COREDUMP_PATTERN"); +} diff --git a/projects/rocr-runtime/rocrtst/suites/functional/gpu_coredump.h b/projects/rocr-runtime/rocrtst/suites/functional/gpu_coredump.h new file mode 100644 index 0000000000..07a7712da9 --- /dev/null +++ b/projects/rocr-runtime/rocrtst/suites/functional/gpu_coredump.h @@ -0,0 +1,63 @@ +/* + * Copyright © Advanced Micro Devices, Inc., or its affiliates. + * + * SPDX-License-Identifier: MIT + */ +#ifndef ROCRTST_SUITES_FUNCTIONAL_GPU_COREDUMP_H_ +#define ROCRTST_SUITES_FUNCTIONAL_GPU_COREDUMP_H_ + +#include +#include +#include "common/base_rocr.h" +#include "hsa/hsa.h" +#include "suites/test_common/test_base.h" + +class GpuCoreDumpTest : public TestBase { + public: + GpuCoreDumpTest(); + virtual ~GpuCoreDumpTest(); + + // Override to avoid HSA init in parent + virtual void SetUp(); + virtual void Run(); + virtual void Close(); + virtual void DisplayTestInfo(void); + virtual void DisplayResults(void) const; + + // Test cases + void TestDefaultPattern(void); + void TestCustomPattern(void); + void TestDisableFlag(void); + void TestPatternSubstitution(void); + void TestInvalidPath(void); + void TestCoreDumpContentIntegrity(void); + void TestPipePattern(void); + + private: + // Run faulting kernel in child process (returns child PID) + pid_t RunFaultingKernelInChild(); + + // Verify core dump file exists and is valid + bool VerifyCoreDumpFile(const std::string& filename); + + // Check if file is a valid GPU core dump (ELF format) + bool IsValidGPUCoreDump(const std::string& filename); + + // Clean up core dump files + void CleanupCoreDumps(const std::string& pattern); + + // Validate PT_NOTE segment structure and contents + bool ValidateNoteSegment(int fd, uint64_t offset, uint64_t size); + + // Validate PT_LOAD segment contents against live memory + bool ValidateLoadSegment(int fd, uint64_t file_offset, uint64_t vaddr, uint64_t size, pid_t child_pid); + + // Check if prerequisites for core dump tests are met + bool CheckPrerequisites(); + + std::string test_dir_; + struct rlimit original_rlimit_; + bool prerequisites_met_; +}; + +#endif // ROCRTST_SUITES_FUNCTIONAL_GPU_COREDUMP_H_ diff --git a/projects/rocr-runtime/rocrtst/suites/functional/test_fault_example.cc b/projects/rocr-runtime/rocrtst/suites/functional/test_fault_example.cc new file mode 100644 index 0000000000..061ee0196f --- /dev/null +++ b/projects/rocr-runtime/rocrtst/suites/functional/test_fault_example.cc @@ -0,0 +1,192 @@ +/* + * Copyright © Advanced Micro Devices, Inc., or its affiliates. + * + * SPDX-License-Identifier: MIT + */ + +// This test is based on TestExample but intentionally passes nullptr for +// the kernel array arguments to trigger a GPU fault. This is useful for +// testing GPU core dump functionality and fault handling. +// +// Key differences from TestExample: +// * No memory allocation for src_buffer or dst_buffer +// * Kernel arguments set with nullptr for array pointers +// * No result verification (we expect a fault) +// * Test is DISABLED by default to prevent running in CI + +#include +#include +#include + +#include "suites/functional/test_fault_example.h" +#include "common/base_rocr_utils.h" +#include "common/common.h" +#include "common/helper_funcs.h" +#include "gtest/gtest.h" +#include "hsa/hsa.h" + +static const uint32_t kNumBufferElements = rocrtst::isEmuModeEnabled() ? 4 : 256; + +TestFaultExample::TestFaultExample(void) : + TestBase() { + set_num_iteration(1); // Only need one iteration to trigger the fault + set_title("Test Fault Example"); + set_description("This test intentionally passes nullptr for kernel array " + "arguments to trigger a GPU fault. This is useful for testing GPU " + "core dump functionality and fault handling mechanisms. " + "NOTE: This test is DISABLED by default and should be run manually."); + + set_kernel_file_name("test_case_template_kernels.hsaco"); + set_kernel_name("square"); // kernel function name +} + +TestFaultExample::~TestFaultExample(void) { +} + +// Setup the test environment - similar to TestExample but without buffer allocation +void TestFaultExample::SetUp(void) { + hsa_status_t err; + + TestBase::SetUp(); + + err = rocrtst::SetDefaultAgents(this); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); + + hsa_agent_t* gpu_dev = gpu_device1(); + + // Find and assign HSA_AMD_SEGMENT_GLOBAL pools for cpu, gpu and a kern_arg pool + err = rocrtst::SetPoolsTypical(this); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + // Create a queue + hsa_queue_t* q = nullptr; + rocrtst::CreateQueue(*gpu_dev, &q); + ASSERT_NE(q, nullptr); + set_main_queue(q); + + err = rocrtst::LoadKernelFromObjFile(this, gpu_dev); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + err = rocrtst::InitializeAQLPacket(this, &aql()); + ASSERT_EQ(HSA_STATUS_SUCCESS, err); + + // NOTE: We do NOT allocate src_buffer or dst_buffer + // We will pass nullptr to the kernel to trigger a fault + + // Set up Kernel arguments with nullptr for array pointers + struct __attribute__((aligned(16))) local_args_t { + uint32_t* dstArray; + uint32_t* srcArray; + uint32_t size; + uint32_t pad; + uint64_t global_offset_x; + uint64_t global_offset_y; + uint64_t global_offset_z; + uint64_t printf_buffer; + uint64_t default_queue; + uint64_t completion_action; + } local_args; + + // Intentionally set array pointers to nullptr to cause a fault + local_args.dstArray = nullptr; + local_args.srcArray = nullptr; + local_args.size = kNumBufferElements; + local_args.global_offset_x = 0; + local_args.global_offset_y = 0; + local_args.global_offset_z = 0; + local_args.printf_buffer = 0; + local_args.default_queue = 0; + local_args.completion_action = 0; + + err = rocrtst::AllocAndSetKernArgs(this, &local_args, sizeof(local_args)); + ASSERT_EQ(err, HSA_STATUS_SUCCESS); + + return; +} + +// This wrapper atomically writes the provided header and setup to the +// provided AQL packet. The provided AQL packet address should be in the +// queue memory space. +static inline void AtomicSetPacketHeader(uint16_t header, uint16_t setup, + hsa_kernel_dispatch_packet_t* queue_packet) { + __atomic_store_n(reinterpret_cast(queue_packet), + header | (setup << 16), __ATOMIC_RELEASE); +} + +void TestFaultExample::Run(void) { + // Compare required profile for this test case with what we're actually + // running on + if (!rocrtst::CheckProfile(this)) { + return; + } + + TestBase::Run(); + + // Override whatever we need to... + aql().workgroup_size_x = kNumBufferElements; + aql().grid_size_x = kNumBufferElements; + + hsa_kernel_dispatch_packet_t *queue_aql_packet; + uint64_t index; + + if (verbosity() >= VERBOSE_STANDARD) { + std::cout << "Dispatching kernel with nullptr arrays - expecting GPU fault..." << std::endl; + } + + // This function simply copies the data we've collected so far into our + // local AQL packet, except the the setup and header fields. + queue_aql_packet = WriteAQLToQueue(this, &index); + ASSERT_EQ(queue_aql_packet, + reinterpret_cast + (main_queue()->base_address) + index); + uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH; + + aql_header |= HSA_FENCE_SCOPE_SYSTEM << + HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + aql_header |= HSA_FENCE_SCOPE_SYSTEM << + HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + + ::AtomicSetPacketHeader(aql_header, aql().setup, queue_aql_packet); + + hsa_signal_store_screlease(main_queue()->doorbell_signal, index); + + // Wait on the dispatch signal until the kernel is finished (or faults). + // Note: This may trigger a GPU fault/exception + while (hsa_signal_wait_scacquire(aql().completion_signal, + HSA_SIGNAL_CONDITION_LT, 1, (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) { + } + + if (verbosity() >= VERBOSE_STANDARD) { + std::cout << "Kernel dispatch completed (fault may have occurred)" << std::endl; + } + + hsa_signal_store_screlease(aql().completion_signal, 1); + + // NOTE: We do NOT verify results since we expect a fault +} + +void TestFaultExample::DisplayTestInfo(void) { + TestBase::DisplayTestInfo(); +} + +void TestFaultExample::DisplayResults(void) const { + // Compare required profile for this test case with what we're actually + // running on + if (!rocrtst::CheckProfile(this)) { + return; + } + + TestBase::DisplayResults(); + std::cout << "Test completed. Check for GPU core dump if fault handling is enabled." << std::endl; + return; +} + +void TestFaultExample::Close() { + // NOTE: We do NOT free src_buffer or dst_buffer since we never allocated them + + // This will close handles opened within rocrtst utility calls and call + // hsa_shut_down(), so it should be done after other hsa cleanup + TestBase::Close(); +} + +#undef RET_IF_HSA_ERR diff --git a/projects/rocr-runtime/rocrtst/suites/functional/test_fault_example.h b/projects/rocr-runtime/rocrtst/suites/functional/test_fault_example.h new file mode 100644 index 0000000000..5135dab226 --- /dev/null +++ b/projects/rocr-runtime/rocrtst/suites/functional/test_fault_example.h @@ -0,0 +1,37 @@ +/* + * Copyright © Advanced Micro Devices, Inc., or its affiliates. + * + * SPDX-License-Identifier: MIT + */ + +#ifndef ROCRTST_SUITES_FUNCTIONAL_TEST_FAULT_EXAMPLE_H_ +#define ROCRTST_SUITES_FUNCTIONAL_TEST_FAULT_EXAMPLE_H_ + +#include "common/base_rocr.h" +#include "hsa/hsa.h" +#include "suites/test_common/test_base.h" + +class TestFaultExample : public TestBase { + public: + TestFaultExample(); + + // @Brief: Destructor for test case of TestFaultExample + virtual ~TestFaultExample(); + + // @Brief: Setup the environment for measurement + virtual void SetUp(); + + // @Brief: Core measurement execution + virtual void Run(); + + // @Brief: Clean up and retrieve the resource + virtual void Close(); + + // @Brief: Display results + virtual void DisplayResults() const; + + // @Brief: Display information about what this test does + virtual void DisplayTestInfo(void); +}; + +#endif // ROCRTST_SUITES_FUNCTIONAL_TEST_FAULT_EXAMPLE_H_ diff --git a/projects/rocr-runtime/rocrtst/suites/test_common/main.cc b/projects/rocr-runtime/rocrtst/suites/test_common/main.cc index 2ae0df15ef..d29d2c7977 100644 --- a/projects/rocr-runtime/rocrtst/suites/test_common/main.cc +++ b/projects/rocr-runtime/rocrtst/suites/test_common/main.cc @@ -69,6 +69,7 @@ #include "suites/stress/memory_concurrent_tests.h" #include "suites/stress/queue_write_index_concurrent_tests.h" #include "suites/test_common/test_case_template.h" +#include "suites/functional/test_fault_example.h" #include "suites/test_common/main.h" #include "suites/test_common/test_common.h" #include "suites/functional/concurrent_init.h" @@ -80,6 +81,7 @@ #include "suites/functional/signal_kernel.h" #include "suites/functional/cu_masking.h" #include "suites/functional/filter_devices.h" +#include "suites/functional/gpu_coredump.h" #include "amd_smi/amdsmi.h" #include "common/common.h" #include "suites/functional/counted_queues.h" @@ -320,6 +322,69 @@ TEST(rocrtstFunc, Memory_Available) { ); } +TEST(rocrtstFunc, GpuCoreDump_DefaultPattern) { + RUN_IF_NOT_EMU_MODE( + GpuCoreDumpTest gcd; + RunCustomTestProlog(&gcd); + gcd.TestDefaultPattern(); + RunCustomTestEpilog(&gcd); + ); +} + +TEST(rocrtstFunc, GpuCoreDump_CustomPattern) { + RUN_IF_NOT_EMU_MODE( + GpuCoreDumpTest gcd; + RunCustomTestProlog(&gcd); + gcd.TestCustomPattern(); + RunCustomTestEpilog(&gcd); + ); +} + +TEST(rocrtstFunc, GpuCoreDump_DisableFlag) { + RUN_IF_NOT_EMU_MODE( + GpuCoreDumpTest gcd; + RunCustomTestProlog(&gcd); + gcd.TestDisableFlag(); + RunCustomTestEpilog(&gcd); + ); +} + +TEST(rocrtstFunc, GpuCoreDump_PatternSubstitution) { + RUN_IF_NOT_EMU_MODE( + GpuCoreDumpTest gcd; + RunCustomTestProlog(&gcd); + gcd.TestPatternSubstitution(); + RunCustomTestEpilog(&gcd); + ); +} + +TEST(rocrtstFunc, GpuCoreDump_InvalidPath) { + RUN_IF_NOT_EMU_MODE( + GpuCoreDumpTest gcd; + RunCustomTestProlog(&gcd); + gcd.TestInvalidPath(); + RunCustomTestEpilog(&gcd); + ); +} + +TEST(rocrtstFunc, GpuCoreDump_ContentIntegrity) { + RUN_IF_NOT_EMU_MODE( + GpuCoreDumpTest gcd; + RunCustomTestProlog(&gcd); + gcd.TestCoreDumpContentIntegrity(); + RunCustomTestEpilog(&gcd); + ); +} + +TEST(rocrtstFunc, GpuCoreDump_PipePattern) { + RUN_IF_NOT_EMU_MODE( + GpuCoreDumpTest gcd; + RunCustomTestProlog(&gcd); + gcd.TestPipePattern(); + RunCustomTestEpilog(&gcd); + ); +} + TEST(rocrtstFunc, Memory_Atomic_Add_Test) { RUN_IF_NOT_EMU_MODE( diff --git a/projects/rocr-runtime/runtime/hsa-runtime/libamdhsacode/lnx/amd_core_dump.cpp b/projects/rocr-runtime/runtime/hsa-runtime/libamdhsacode/lnx/amd_core_dump.cpp index 3fc0c51d3a..c1eea12d96 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/libamdhsacode/lnx/amd_core_dump.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/libamdhsacode/lnx/amd_core_dump.cpp @@ -548,22 +548,28 @@ hsa_status_t write_core_dump_to_fd(int fd, const SegmentsInfo& segments, return (uint32_t)0; } } (seg.stype); - if (size_limit != -1 && (offset + seg.size > size_limit)) { - if (show_progress) { - printf("Core limit file reached during pipe write\n"); - } - return HSA_STATUS_SUCCESS; - } phdr.p_offset = alignUp(offset, (uint64_t)1 << phdr.p_align); phdrs.push_back(phdr); offset += phdr.p_filesz; } // Write all program headers - for (const auto& phdr : phdrs) { - if (write(fd, &phdr, sizeof(phdr)) != sizeof(phdr)) { - perror("Failed to write program header to pipe"); - return HSA_STATUS_ERROR; + if (is_reg_file) { + // For regular files, use pwrite to write at specific offset + for (size_t i = 0; i < phdrs.size(); i++) { + off_t phdr_offset = sizeof(Elf64_Ehdr) + i * sizeof(Elf64_Phdr); + if (pwrite(fd, &phdrs[i], sizeof(Elf64_Phdr), phdr_offset) != sizeof(Elf64_Phdr)) { + perror("Failed to write program header"); + return HSA_STATUS_ERROR; + } + } + } else { + // For pipes, use sequential write + for (const auto& phdr : phdrs) { + if (write(fd, &phdr, sizeof(phdr)) != sizeof(phdr)) { + perror("Failed to write program header to pipe"); + return HSA_STATUS_ERROR; + } } } @@ -572,6 +578,15 @@ hsa_status_t write_core_dump_to_fd(int fd, const SegmentsInfo& segments, const SegmentInfo& seg = segments[idx]; const Elf64_Phdr& phdr = phdrs[idx]; + // Check if this segment would exceed size limit + if (size_limit != -1 && (phdr.p_offset + phdr.p_filesz > size_limit)) { + if (show_progress) { + fprintf(stderr, "Core file size limit reached, truncating at segment %zu\n", idx); + } + // Stop writing segments but return success - we wrote valid headers + return HSA_STATUS_SUCCESS; + } + if (is_reg_file) { int error = posix_fallocate(fd, phdr.p_offset, phdr.p_filesz); if (error != 0) { @@ -588,9 +603,21 @@ hsa_status_t write_core_dump_to_fd(int fd, const SegmentsInfo& segments, if (st != HSA_STATUS_SUCCESS) { return st; } - if (write(fd, copy_buffer.get(), curr_chunk) != (ssize_t)curr_chunk) { - perror("Failed to write segment data to pipe"); - return HSA_STATUS_ERROR; + + if (is_reg_file) { + // For regular files, use pwrite to write at specific offset + if (pwrite(fd, copy_buffer.get(), curr_chunk, + phdr.p_offset + phdr.p_filesz - remaining) != + (ssize_t)curr_chunk) { + perror("Failed to write segment data"); + return HSA_STATUS_ERROR; + } + } else { + // For pipes, use sequential write + if (write(fd, copy_buffer.get(), curr_chunk) != (ssize_t)curr_chunk) { + perror("Failed to write segment data to pipe"); + return HSA_STATUS_ERROR; + } } remaining -= curr_chunk; }