[SWDEV-558895] Fix rsmi_event_notification_get segfaulting (#738)

Signed-off-by: adapryor <Adam.pryor@amd.com>

[ROCm/amdsmi commit: ce016f0dcb]
This commit is contained in:
Pryor, Adam
2025-10-06 15:10:56 -05:00
committed by GitHub
szülő 6975b29c15
commit d1679c7ade
2 fájl változott, egészen pontosan 21 új sor hozzáadva és 2 régi sor törölve
+3
Fájl megtekintése
@@ -206,6 +206,9 @@ GPU: 0
### Resolved Issues
- **Fixed event monitoring segfaults causing RDC to crash**.
- Added mutex locking around access to the device event notification file pointer
- **Fixed an issue where amdsmi_get_gpu_od_volt_info() returned a reference to a Python object**.
- The returned dictionary now contains values, rather than object references, in all fields
@@ -7357,8 +7357,23 @@ rsmi_event_notification_get(int timeout_ms,
return;
}
FILE *anon_fp =
smi.devices()[fd_indx_to_dev_id[i]]->evt_notif_anon_file_ptr();
const uint32_t dv_ind = fd_indx_to_dev_id[i];
auto& dev = *smi.devices()[dv_ind];
// Ensure protected access of anon_fp
amd::smi::pthread_wrap pw(*amd::smi::GetMutex(dv_ind));
amd::smi::ScopedPthread lock(pw);
FILE *anon_fp = dev.evt_notif_anon_file_ptr();
if (!anon_fp) {
std::ostringstream ss;
ss << "Null evt_notif_anon_file_ptr() for dv_ind=" << dv_ind;
LOG_ERROR(ss);
continue;
}
flockfile(anon_fp); // serialize stdio on this stream
data_item =
reinterpret_cast<rsmi_evt_notification_data_t *>(&data[*num_elem]);
@@ -7614,6 +7629,7 @@ rsmi_event_notification_get(int timeout_ms,
data_item =
reinterpret_cast<rsmi_evt_notification_data_t *>(&data[*num_elem]);
}
funlockfile(anon_fp); // paired with flockfile; RAII unlock of device mutex on scope exit
}
};