From 2958f7eacedc2e498ee232ba9c76e54239cf829d Mon Sep 17 00:00:00 2001 From: Stanley Tsang Date: Fri, 13 Nov 2020 10:32:42 -0700 Subject: [PATCH] Fixing IPC handle leak (#302) --- src/clique/CliqueManager.cc | 6 +++- src/clique/HandleCache.h | 72 ++++++++++++++++++++++++++++++++----- 2 files changed, 68 insertions(+), 10 deletions(-) diff --git a/src/clique/CliqueManager.cc b/src/clique/CliqueManager.cc index fef2a91599..dde0cf0085 100644 --- a/src/clique/CliqueManager.cc +++ b/src/clique/CliqueManager.cc @@ -82,7 +82,11 @@ void CliqueManager::CleanUp() { // Release caches if (m_ipcHandleSendCache) delete m_ipcHandleSendCache; - if (m_ipcHandleSendCache) delete m_ipcHandleRecvCache; + if (m_ipcHandleRecvCache) + { + m_ipcHandleRecvCache->close(); + delete m_ipcHandleRecvCache; + } // Close shared memory m_shmHandles.Close(); diff --git a/src/clique/HandleCache.h b/src/clique/HandleCache.h index dc479e00e8..245b78f80d 100644 --- a/src/clique/HandleCache.h +++ b/src/clique/HandleCache.h @@ -103,14 +103,16 @@ public: return std::pair(it, inserted); } - + ncclResult_t close(); private: - void pop() - { - typename LRUCache::iterator it = m_cache.find(m_lruHistory.front()); - m_cache.erase(it); - m_lruHistory.pop_front(); - } + // tag for dispatch + template + struct CloseTag{}; + + hipError_t CloseIfPointer(CloseTag tag, iterator it); + hipError_t CloseIfPointer(CloseTag tag, iterator it); + + void pop(); void updateHistory(const iterator& it) { @@ -133,8 +135,60 @@ auto hipIpcMemHandleEqual = [](const hipIpcMemHandle_t& l, const hipIpcMemHandle return memcmp(l.reserved, r.reserved, sizeof(l.reserved)) == 0; }; -//typedef llvm::DenseMap SendCache; -//typedef llvm::DenseMap RecvCache; +template < + class Key, + class Value, + class Hash, + class KeyEqual, + class Allocator +> +ncclResult_t NcclIpcHandleCache::close() +{ + for (auto it = m_cache.begin(); it != m_cache.end(); it ++) + { + CUDACHECK(CloseIfPointer(CloseTag{}, it)); + } + return ncclSuccess; +} + +template < + class Key, + class Value, + class Hash, + class KeyEqual, + class Allocator +> +void NcclIpcHandleCache::pop() +{ + typename LRUCache::iterator it = m_cache.find(m_lruHistory.front()); + CloseIfPointer(CloseTag{}, it); + m_cache.erase(it); + m_lruHistory.pop_front(); +} + +template < + class Key, + class Value, + class Hash, + class KeyEqual, + class Allocator +> +hipError_t NcclIpcHandleCache::CloseIfPointer(CloseTag tag, iterator it) +{ + return hipIpcCloseMemHandle(it->second.first); +} + +template < + class Key, + class Value, + class Hash, + class KeyEqual, + class Allocator +> +hipError_t NcclIpcHandleCache::CloseIfPointer(CloseTag tag, iterator it) +{ + return hipSuccess; +} typedef NcclIpcHandleCache, std::equal_to, std::allocator< std::pair::iterator>>>> NcclIpcHandleSendCache; typedef NcclIpcHandleCache::iterator>>>> NcclIpcHandleRecvCache;