From a044536b8d690a9ae5962a93e7596d9eec2030b7 Mon Sep 17 00:00:00 2001 From: "Gavini, Sumanth" Date: Mon, 17 Nov 2025 16:24:45 -0600 Subject: [PATCH] [SWDEV-563788] - Fix: amdsmitst crash from kernel in xcp_metrics read (#826) Use fork/waitpid to isolate API call and detect SIGKILL from kernel Signed-off-by: Sumanth Gavini --- .../functional/gpu_partition_metrics_read.cc | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/amd_smi_test/functional/gpu_partition_metrics_read.cc b/tests/amd_smi_test/functional/gpu_partition_metrics_read.cc index 8bd5e77227..14b6387bb5 100644 --- a/tests/amd_smi_test/functional/gpu_partition_metrics_read.cc +++ b/tests/amd_smi_test/functional/gpu_partition_metrics_read.cc @@ -83,6 +83,43 @@ void TestGpuPartitionMetricsRead::Run(void) { std::cout << "\n\n"; std::cout << "\t**GPU PARTITION METRICS: Using static struct (Backwards Compatibility):\n"; } + + // Test if xcp_metrics causes kernel crash + pid_t test_pid = fork(); + if (test_pid == 0) { + // Child: try reading xcp_metrics + amdsmi_gpu_metrics_t test_smu = {}; + amdsmi_get_gpu_partition_metrics_info(processor_handles_[i], &test_smu); + _exit(0); + } + if (test_pid < 0) { + FAIL() << "Fork failed"; + } + + // Parent: wait for child (3 second timeout: 30 iterations × 100ms) + constexpr int MAX_WAIT_RETRIES = 30; + constexpr int WAIT_INTERVAL_US = 100000; // 100ms in microseconds + int status; + bool child_exited = false; + for (int retry = 0; retry < MAX_WAIT_RETRIES; retry++) { + if (waitpid(test_pid, &status, WNOHANG) > 0) { + child_exited = true; + if (WIFSIGNALED(status)) { + // Child process terminated by signal - fail the test + FAIL() << "FAILED: Child process terminated by signal (signal " << WTERMSIG(status) << ")"; + } + break; + } + usleep(WAIT_INTERVAL_US); + } + + // Handle timeout - child still running after 3 seconds + if (!child_exited) { + kill(test_pid, SIGKILL); + waitpid(test_pid, &status, 0); // Clean up zombie process + FAIL() << "FAILED: Timeout waiting for child process (hung for 3+ seconds)"; + } + amdsmi_gpu_metrics_t smu = {}; err = amdsmi_get_gpu_partition_metrics_info(processor_handles_[i], &smu); const char *status_string;