From a93b9d473de684d24fe46267e9dfe840ec164e07 Mon Sep 17 00:00:00 2001 From: "Pryor, Adam" Date: Tue, 7 Oct 2025 21:31:23 -0500 Subject: [PATCH] [SWDEV-558895] Fix rsmi monitor fds (#748) Signed-off-by: adapryor [ROCm/amdsmi commit: 346e1516afed1038f37d98878372edace0acebd0] --- projects/amdsmi/rocm_smi/src/rocm_smi.cc | 33 ++++++++++++++----- .../amdsmi/rocm_smi/src/rocm_smi_io_link.cc | 10 ++++-- .../amdsmi/rocm_smi/src/rocm_smi_monitor.cc | 4 ++- 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi.cc b/projects/amdsmi/rocm_smi/src/rocm_smi.cc index ff48a7b5c4..1b193f9924 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi.cc @@ -7252,6 +7252,11 @@ rsmi_event_notification_init(uint32_t dv_ind) { DEVICE_MUTEX std::lock_guard guard(*smi.kfd_notif_evt_fh_mutex()); + + if (dev->evt_notif_anon_fd() > 0 && dev->evt_notif_anon_file_ptr() != nullptr) { + return RSMI_STATUS_SUCCESS; + } + if (smi.kfd_notif_evt_fh() == -1) { assert(smi.kfd_notif_evt_fh_refcnt() == 0); int kfd_fd = open(kPathKFDIoctl, O_RDWR | O_CLOEXEC); @@ -7666,15 +7671,27 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind) { std::lock_guard guard(*smi.kfd_notif_evt_fh_mutex()); - if (dev->evt_notif_anon_fd() == -1) { - return RSMI_STATUS_INVALID_ARGS; - } -// close(dev->evt_notif_anon_fd()); FILE *anon_fp = smi.devices()[dv_ind]->evt_notif_anon_file_ptr(); - fclose(anon_fp); - assert(errno == 0 || errno == EAGAIN); - dev->set_evt_notif_anon_file_ptr(nullptr); - dev->set_evt_notif_anon_fd(-1); + int anon_fd = smi.devices()[dv_ind]->evt_notif_anon_fd(); + + // If nothing to close, success + if (!anon_fp && anon_fd <= 0) { + return RSMI_STATUS_SUCCESS; + } + + // Clear state first so nobody else can race a second close + smi.devices()[dv_ind]->set_evt_notif_anon_file_ptr(nullptr); + smi.devices()[dv_ind]->set_evt_notif_anon_fd(-1); + + if (anon_fp) { + if (fclose(anon_fp) != 0) { + return amd::smi::ErrnoToRsmiStatus(errno); + } + } else { // no FILE*, but fd was valid + if (close(anon_fd) != 0) { + return amd::smi::ErrnoToRsmiStatus(errno); + } + } if (smi.kfd_notif_evt_fh_refcnt_dec() == 0) { int ret = close(smi.kfd_notif_evt_fh()); diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_io_link.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_io_link.cc index c18b6ae82b..5d8c4f6a0d 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_io_link.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_io_link.cc @@ -196,7 +196,10 @@ static int DiscoverLinks(std::map, std::string link_path_root = LinkPathRoot(node_indx, directory); auto io_link_dir = opendir(link_path_root.c_str()); - assert(io_link_dir != nullptr); + + if (!io_link_dir) { + return errno ? errno : 1; + } auto dentry_io_link = readdir(io_link_dir); while (dentry_io_link != nullptr) { @@ -264,7 +267,10 @@ static int DiscoverLinksPerNode(uint32_t node_indx, std::map *sensors) { auto hwmon_dir = opendir(dir_path.c_str()); - assert(hwmon_dir != nullptr); + if (!hwmon_dir) { + return errno ? errno : 1; + } assert(sensors != nullptr); sensors->clear();