From dd7c30ec6b0eeb12424d50cfb608af8edd12254e Mon Sep 17 00:00:00 2001
From: Sean Keely
Date: Thu, 29 Jul 2021 17:39:13 -0500
Subject: [PATCH] Correct data race in GpuAgent::GetXgmiBlit.

Threads may race against xgmi_peer_list_ when dynamically assigning
peers to sdma engines.

Change-Id: I300c10f0cfa0ff7d6a5515364070a0895e2f4644

[ROCm/ROCR-Runtime commit: bb4dfbba1e6af2c01bb52a3eb1cbfe981a191640]
---
 .../rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h | 3 +++
 .../runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
index 71b1d3fa44..18ef156c39 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
@@ -400,6 +400,9 @@ class GpuAgent : public GpuAgentInt {
   // List of agents connected via xGMI
   std::vector<const core::Agent*> xgmi_peer_list_;
 
+  // Protects xgmi_peer_list_
+  KernelMutex xgmi_peer_list_lock_;
+
   // @brief AQL queues for cache management and blit compute usage.
   enum QueueEnum {
     QueueUtility, // Cache management and device to {host,device} blit compute
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index da7932d951..c19b91fdb4 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -1483,6 +1483,8 @@ lazy_ptr<core::Blit>& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) {
   uint32_t xgmi_engine_cnt = properties_.NumSdmaXgmiEngines;
   assert((xgmi_engine_cnt > 0) && ("Illegal condition, should not happen"));
 
+  ScopedAcquire<KernelMutex> lock(&xgmi_peer_list_lock_);
+
   for (uint32_t idx = 0; idx < xgmi_peer_list_.size(); idx++) {
     uint64_t dst_handle = dst_agent.public_handle().handle;
     uint64_t peer_handle = xgmi_peer_list_[idx]->public_handle().handle;