diff --git a/CppCheckSuppressions.txt b/CppCheckSuppressions.txt new file mode 100644 index 0000000000..c81515ca8d --- /dev/null +++ b/CppCheckSuppressions.txt @@ -0,0 +1,84 @@ +arrayIndexThenCheck:src/bootstrap.cc:304 +arrayIndexThenCheck:src/debug.cc:88 +arrayIndexThenCheck:src/graph/search.cc:844 +arrayIndexThenCheck:src/graph/search.cc:916 +arrayIndexThenCheck:src/graph/search.cc:927 +clarifyCalculation:src/graph/topo.cc:702 +clarifyCalculation:src/graph/topo.cc:720 +clarifyCondition:src/enqueue.cc:416 +funcArgNamesDifferent:src/graph/topo.cc:135 +funcArgNamesDifferent:src/graph/topo.h:144 +nullPointerRedundantCheck:src/misc/utils.cc:102 +nullPointerRedundantCheck:src/misc/utils.cc:109 +nullPointerRedundantCheck:src/proxy.cc:143 +nullPointerRedundantCheck:src/proxy.cc:144 +nullPointerRedundantCheck:src/proxy.cc:147 +nullPointerRedundantCheck:src/proxy.cc:148 +nullPointerRedundantCheck:src/proxy.cc:149 +nullPointerRedundantCheck:src/proxy.cc:150 +nullPointerRedundantCheck:src/proxy.cc:151 +nullPointerRedundantCheck:src/proxy.cc:155 +nullPointerRedundantCheck:src/proxy.cc:159 +nullPointerRedundantCheck:src/proxy.cc:160 +nullPointerRedundantCheck:src/proxy.cc:161 +nullPointerRedundantCheck:src/proxy.cc:163 +nullPointerRedundantCheck:src/proxy.cc:165 +nullPointerRedundantCheck:src/proxy.cc:167 +nullPointerRedundantCheck:src/proxy.cc:168 +nullPointerRedundantCheck:src/proxy.cc:340 +nullPointerRedundantCheck:src/proxy.cc:342 +nullPointerRedundantCheck:src/proxy.cc:93 +nullPointerRedundantCheck:src/proxy.cc:94 +redundantAssignment:src/proxy.cc:161 +redundantAssignment:src/proxy.cc:163 +redundantCopy:src/graph/rings.cc:16 +redundantCopy:src/graph/rings.cc:17 +terminateStrncpy:src/misc/utils.cc:99 +terminateStrncpy:src/transport/net_socket.cc:245 +unreachableCode:src/transport/net.cc:555 +unreadVariable:src/graph/tuning.cc:109 +unreadVariable:src/graph/tuning.cc:110 +unreadVariable:src/graph/tuning.cc:113 +unusedFunction:src/graph/topo.cc:37 
+unusedFunction:src/graph/topo.cc:836 +unusedFunction:src/misc/gdrwrap.cc:109 +unusedFunction:src/misc/gdrwrap.cc:117 +unusedFunction:src/misc/gdrwrap.cc:130 +unusedFunction:src/misc/gdrwrap.cc:144 +unusedFunction:src/misc/gdrwrap.cc:158 +unusedFunction:src/misc/gdrwrap.cc:172 +unusedFunction:src/misc/gdrwrap.cc:186 +unusedFunction:src/misc/gdrwrap.cc:200 +unusedFunction:src/misc/gdrwrap.cc:209 +unusedFunction:src/misc/gdrwrap.cc:218 +unusedFunction:src/misc/gdrwrap.cc:232 +unusedFunction:src/misc/gdrwrap.cc:52 +unusedFunction:src/misc/ibvwrap.cc:203 +unusedFunction:src/misc/ibvwrap.cc:239 +unusedFunction:src/misc/ibvwrap.cc:255 +unusedFunction:src/misc/nvmlwrap.cc:112 +unusedFunction:src/misc/nvmlwrap_stub.cc:31 +unusedFunction:src/misc/nvmlwrap_stub.cc:35 +unusedFunction:src/transport.cc:71 +unusedLabel:src/bootstrap.cc:349 +unusedLabel:src/clique/ShmObject.h:112 +unusedLabel:src/clique/ShmObject.h:204 +unusedLabel:src/enqueue.cc:108 +unusedLabel:src/enqueue.cc:1093 +unusedLabel:src/enqueue.cc:989 +unusedLabel:src/init.cc:1189 +unusedLabel:src/init.cc:1240 +unusedLabel:src/init.cc:1267 +unusedLabel:src/transport.cc:238 +unusedStructMember:src/graph/xml.cc:410 +unusedStructMember:src/graph/xml.cc:411 +unusedStructMember:src/graph/xml.cc:412 +unusedStructMember:src/graph/xml.cc:428 +unusedStructMember:src/graph/xml.cc:431 +unusedStructMember:src/graph/xml.cc:432 +unusedStructMember:src/graph/xml.cc:435 +unusedStructMember:src/graph/xml.cc:437 +variableScope:src/graph/search.cc:494 +variableScope:src/init.cc:240 +variableScope:src/transport/net_ib.cc:117 +variableScope:src/transport/net_socket.cc:431 diff --git a/install.sh b/install.sh index 3e1a2c77aa..5caf182068 100755 --- a/install.sh +++ b/install.sh @@ -205,9 +205,9 @@ fi check_exit_code "$?" if ($build_tests) || (($run_tests) && [[ ! -f ./test/UnitTests ]]); then - CXX=$ROCM_BIN_PATH/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=$ROCM_PATH ../../. 
+ CXX=$ROCM_BIN_PATH/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=$ROCM_PATH -DROCM_PATH=$ROCM_PATH ../../. else - CXX=$ROCM_BIN_PATH/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=$ROCM_PATH ../../. + CXX=$ROCM_BIN_PATH/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=$ROCM_PATH -DROCM_PATH=$ROCM_PATH ../../. fi check_exit_code "$?" diff --git a/src/clique/CliqueManager.cc b/src/clique/CliqueManager.cc index 49a193d77a..1a6560c4d4 100644 --- a/src/clique/CliqueManager.cc +++ b/src/clique/CliqueManager.cc @@ -56,14 +56,26 @@ CliqueManager::CliqueManager(int const rank, cliqueMode_t const cliqueMode) : m_rank(rank), m_numRanks(numRanks), + m_hash(0), m_cliqueMode(cliqueMode), m_opIndexHead(0), m_opIndexTail(0), m_init(false), + m_gcnArch(0), + m_allReduceByteLimit(0), m_pinnedCliquePtrs(NULL), - m_fineGrainBarrierMem(NULL) -{ -} + m_gpuBarrierGlobalCount(NULL), + m_gpuBarrierGlobalSense(NULL), + m_gpuBarrierLocalSense(NULL), + m_cpuBarrierCount(NULL), + m_shmHandles(), + m_ipcHandleSendCache(), + m_ipcHandleRecvCache(), + m_sharedCpuMemory(), + m_sharedIpcHandle(), + m_fineGrainBarrierMem(NULL), + m_sharedBarrierCount(NULL) +{} CliqueManager::~CliqueManager() { @@ -128,11 +140,6 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix) WARN("Invalid rank specified. 
Expected 0 <= %d < %d for CliqueManager", m_rank, m_numRanks); return ncclInvalidUsage; } - if (commId == NULL) - { - WARN("CommId should not be empty"); - return ncclInvalidUsage; - } // For now, opt-into clique based kernels via RCCL_ENABLE_CLIQUE env var if (!rcclParamEnableClique()) @@ -350,7 +357,7 @@ ncclResult_t CliqueManager::GetNumChannelsToUse(ncclFunc_t const coll, ncclDataType_t const datatype, ncclRedOp_t const op, int const totalNumChannels, - uint8_t* numChannelstoUse) + uint8_t* numChannelstoUse) const { size_t const totalBytes = count * ncclTypeSize(datatype); *numChannelstoUse = 1; @@ -467,20 +474,6 @@ ncclResult_t CliqueManager::WaitForPointers(ncclWorkElem* args) return ncclSuccess; } -std::string HandleToString(hipIpcMemHandle_t handle) -{ - char mapping[17] = "0123456789ABCDEF"; - std::string result; - for (int i = 0; i < 4; i++) - { - unsigned char val = (unsigned char)handle.reserved[i]; - result += mapping[val / 16]; - result += mapping[val % 16]; - } - return result; -} - - ncclResult_t CliqueManager::CheckCacheForPtr(void* devPtr, NcclIpcHandleSendCache* cache, int rank, diff --git a/src/clique/CliqueManager.h b/src/clique/CliqueManager.h index 45f566c322..2fbef06319 100644 --- a/src/clique/CliqueManager.h +++ b/src/clique/CliqueManager.h @@ -69,7 +69,7 @@ public: ncclDataType_t const datatype, ncclRedOp_t const op, int const totalNumChannels, - uint8_t* numChannelstoUse); + uint8_t* numChannelstoUse) const; // Blocking call that only returns the in-progress clique pointers are ready // This needs to be called in same order as DeclarePointers @@ -79,14 +79,14 @@ public: static ncclResult_t BootstrapRootInit(int pid, unsigned long hash); protected: - ncclResult_t CheckCacheForPtr(void* devPtr, - NcclIpcHandleSendCache* cache, - int rank, - std::pair* handlePair); + static ncclResult_t CheckCacheForPtr(void* devPtr, + NcclIpcHandleSendCache* cache, + int rank, + std::pair* handlePair); - ncclResult_t CheckCacheForHandle(std::pair const& 
handlePair, - NcclIpcHandleRecvCache* cache, - void** ptr); + static ncclResult_t CheckCacheForHandle(std::pair const& handlePair, + NcclIpcHandleRecvCache* cache, + void** ptr); int m_rank; // Associated rank int m_numRanks; // Total number of ranks diff --git a/src/clique/HandleCache.h b/src/clique/HandleCache.h index dc479e00e8..978e19d41f 100644 --- a/src/clique/HandleCache.h +++ b/src/clique/HandleCache.h @@ -114,7 +114,7 @@ private: void updateHistory(const iterator& it) { - if (m_lruHistory.size() > 0) + if (!m_lruHistory.empty()) { m_lruHistory.splice(m_lruHistory.end(), m_lruHistory, (it->second).second); } diff --git a/src/clique/HandleShm.cc b/src/clique/HandleShm.cc index 937390cf20..4e73fb4e3d 100644 --- a/src/clique/HandleShm.cc +++ b/src/clique/HandleShm.cc @@ -28,7 +28,7 @@ THE SOFTWARE. #include "Hash.h" #include "shm.h" -NcclIpcHandleShm::NcclIpcHandleShm(int rank, int numRanks, int projid, int numHandlesPerRank, int capacity, std::string suffix) : +NcclIpcHandleShm::NcclIpcHandleShm(int rank, int numRanks, int projid, int numHandlesPerRank, int capacity, std::string const& suffix) : ShmObject>(numRanks * numHandlesPerRank * capacity * sizeof(std::pair), CliqueShmNames["IpcHandles"] + suffix, rank, @@ -39,7 +39,9 @@ NcclIpcHandleShm::NcclIpcHandleShm(int rank, int numRanks, int projid, int numHa { } -NcclIpcHandleShm::NcclIpcHandleShm() +NcclIpcHandleShm::NcclIpcHandleShm() : + m_numHandlesPerRank(0), + m_numHandlesPerOpCount(0) { } diff --git a/src/clique/HandleShm.h b/src/clique/HandleShm.h index c681de0eb4..c3f47cb458 100644 --- a/src/clique/HandleShm.h +++ b/src/clique/HandleShm.h @@ -33,7 +33,7 @@ THE SOFTWARE. 
class NcclIpcHandleShm : public ShmObject> { public: - NcclIpcHandleShm(int rank, int numRanks, int projid, int numHandlesPerRank, int capacity, std::string suffix); + NcclIpcHandleShm(int rank, int numRanks, int projid, int numHandlesPerRank, int capacity, std::string const& suffix); NcclIpcHandleShm(); diff --git a/src/clique/MsgQueue.cc b/src/clique/MsgQueue.cc index ba1da846bc..22cfbbfb20 100644 --- a/src/clique/MsgQueue.cc +++ b/src/clique/MsgQueue.cc @@ -28,7 +28,7 @@ THE SOFTWARE. #define MSG_SIZE 1 #define MSG_QUEUE_TIMEOUT 60 -ncclResult_t MsgQueueGetId(std::string name, bool exclusive, mqd_t& mq_desc) +ncclResult_t MsgQueueGetId(std::string const& name, bool exclusive, mqd_t& mq_desc) { int flag = (exclusive == true ? O_CREAT | O_EXCL : O_CREAT); struct mq_attr attr; @@ -83,7 +83,7 @@ ncclResult_t MsgQueueWaitUntilEmpty(mqd_t const& mq_desc) return ncclSuccess; } -ncclResult_t MsgQueueClose(std::string name, mqd_t& mq_desc, bool unlink) +ncclResult_t MsgQueueClose(std::string const& name, mqd_t& mq_desc, bool unlink) { if (unlink) { @@ -93,9 +93,9 @@ ncclResult_t MsgQueueClose(std::string name, mqd_t& mq_desc, bool unlink) return ncclSuccess; } -ncclResult_t MsgQueueUnlink(std::string name) +ncclResult_t MsgQueueUnlink(std::string const& name) { std::string mq_name = "/" + name; SYSCHECK(mq_unlink(mq_name.c_str()), "mq_unlink"); return ncclSuccess; -} \ No newline at end of file +} diff --git a/src/clique/MsgQueue.h b/src/clique/MsgQueue.h index af91add388..74513d9125 100644 --- a/src/clique/MsgQueue.h +++ b/src/clique/MsgQueue.h @@ -29,11 +29,11 @@ THE SOFTWARE. 
#include "nccl.h" #include "core.h" -ncclResult_t MsgQueueGetId(std::string name, bool exclusive, mqd_t& mq_desc); +ncclResult_t MsgQueueGetId(std::string const& name, bool exclusive, mqd_t& mq_desc); ncclResult_t MsgQueueSend(mqd_t const& mq_desc, const char* msgp, size_t msgsz); ncclResult_t MsgQueueRecv(mqd_t const& mq_desc, char* msgp, size_t msgsz); ncclResult_t MsgQueueWaitUntilEmpty(mqd_t const& mq_desc); -ncclResult_t MsgQueueClose(std::string name, mqd_t& mq_desc, bool unlink); -ncclResult_t MsgQueueUnlink(std::string name); +ncclResult_t MsgQueueClose(std::string const& name, mqd_t& mq_desc, bool unlink); +ncclResult_t MsgQueueUnlink(std::string const& name); #endif diff --git a/src/clique/ShmObject.h b/src/clique/ShmObject.h index e62adec65f..4663084f91 100644 --- a/src/clique/ShmObject.h +++ b/src/clique/ShmObject.h @@ -53,8 +53,8 @@ template class ShmObject { public: -ShmObject(size_t size, std::string fileName, int rank, int numRanks, int projid) : - m_shmSize(size), + ShmObject(size_t size, std::string const& fileName, int rank, int numRanks, int projid) : + m_shmSize(size), m_shmName(fileName), m_rank(rank), m_numRanks(numRanks), @@ -62,7 +62,14 @@ ShmObject(size_t size, std::string fileName, int rank, int numRanks, int projid) m_alloc(false), m_shmPtr(nullptr) {} - ShmObject() {} + ShmObject() : + m_shmSize(0), + m_shmName(""), + m_rank(0), + m_numRanks(0), + m_projid(0), + m_alloc(false), + m_shmPtr(nullptr) {} ~ShmObject() {} @@ -82,7 +89,7 @@ ShmObject(size_t size, std::string fileName, int rank, int numRanks, int projid) return m_shmPtr; } protected: - ncclResult_t BroadcastMessage(mqd_t& mq_desc, bool pass) + ncclResult_t BroadcastMessage(mqd_t& mq_desc, bool pass) const { char msg_text[1]; msg_text[0] = (pass == 0 ? 
'F': 'P'); @@ -112,11 +119,11 @@ dropback: template struct OpenTag{}; - ncclResult_t InitIfSemaphore(OpenTag tag); + static ncclResult_t InitIfSemaphore(OpenTag tag); ncclResult_t InitIfSemaphore(OpenTag tag); - ncclResult_t InitIfSemaphore(OpenTag tag); + static ncclResult_t InitIfSemaphore(OpenTag tag); ncclResult_t InitIfSemaphore(OpenTag tag); - ncclResult_t InitIfSemaphore(OpenTag> tag); + static ncclResult_t InitIfSemaphore(OpenTag> tag); size_t m_shmSize; std::string m_shmName; @@ -134,9 +141,6 @@ ncclResult_t ShmObject::Open() if (m_alloc == false) { int shmFd; - int protection = PROT_READ | PROT_WRITE; - int visibility = MAP_SHARED; - INFO(NCCL_INIT, "Rank %d Initializing message queue for %s\n", m_rank, m_shmName.c_str()); NCCLCHECK(MsgQueueGetId(m_shmName, false, mq_desc)); diff --git a/src/graph/rome_models.cc b/src/graph/rome_models.cc index 8662ff2bf2..c43c03e3d9 100644 --- a/src/graph/rome_models.cc +++ b/src/graph/rome_models.cc @@ -391,7 +391,7 @@ static struct rcclRomeModel rome_model_59 = { .connMatrix = { 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, }, .gdrLevel = { 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, }, .pattern = "42424242", - .ringBase = "N0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 N0|N1 3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 2 N1|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N3 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 N3|N4 8 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 N4|N5 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 N5|N6 12 13 5 4 0 1 3 2 6 7 15 14 10 11 9 8 N4|N7 14 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 N7|N3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 2 3 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N2 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 N2|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6|N5 10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N7 15 14 5 4 0 1 2 3 7 6 13 12 8 9 10 11 N5|N0 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 N0|N6 13 12 8 9 11 10 14 15 7 6 2 3 1 0 4 5 N2|", + .ringBase = "N0 1 0 4 5 14 15 11 10 9 8 12 13 6 7 3 2 N1|N1 3 2 0 1 5 4 12 13 9 8 10 11 15 14 6 7 N3|N2 5 4 0 1 3 2 6 7 15 14 10 11 9 8 12 13 N6|N3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 2 3 N1|N5 11 10 8 9 13 12 4 5 1 0 2 3 7 6 14 15 N7|N6 13 12 8 9 11 10 14 15 7 6 2 3 1 0 4 5 N2|N2 4 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0 N0|N7 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 11 N5|N0 0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 N2|N1 2 3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 N0|N7 14 15 11 10 8 9 13 12 4 5 1 0 2 3 7 6 N3|N4 8 9 11 10 14 15 7 6 2 3 1 0 4 5 13 12 N6|N4 9 8 12 13 6 7 3 2 1 0 4 5 14 15 11 10 N5|N3 6 7 3 2 0 1 5 4 12 13 9 8 10 11 15 14 N7|N6 12 13 5 4 0 1 3 2 6 7 15 14 10 11 9 8 N4|N5 10 11 15 14 5 4 0 1 2 3 7 6 13 12 8 9 N4|", .netGdrLevel = -2, }; @@ -430,7 +430,7 @@ static struct rcclRomeModel rome_model_65 = { .connMatrix = { 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 
0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, }, .gdrLevel = { 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, }, .pattern = "42424242", - .ringBase = "N0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 N0|N1 3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 2 N1|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N3 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 N3|N4 8 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 N4|N5 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 N5|N6 12 13 5 4 0 1 3 2 6 7 15 14 10 11 9 8 N4|N7 14 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 N7|N3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 2 3 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N2 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 N2|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6|N5 10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N7 15 14 5 4 0 1 2 3 7 6 13 12 8 9 10 11 N5|N0 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 N0|N6 13 12 8 9 11 10 14 15 7 6 2 3 1 0 4 5 N2|", + .ringBase = "N0 1 0 4 5 14 15 11 10 9 8 12 13 6 7 3 2 N1|N1 3 2 0 1 5 4 12 13 9 8 10 11 15 14 6 7 N3|N2 5 4 0 1 3 2 6 7 15 14 10 11 9 8 12 13 N6|N3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 2 3 N1|N5 11 10 8 9 13 12 4 5 1 0 2 3 7 6 14 15 N7|N6 13 12 8 9 11 10 14 15 7 6 2 3 1 0 4 5 N2|N2 4 5 14 15 11 10 
9 8 12 13 6 7 3 2 1 0 N0|N7 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 11 N5|N0 0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 N2|N1 2 3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 N0|N7 14 15 11 10 8 9 13 12 4 5 1 0 2 3 7 6 N3|N4 8 9 11 10 14 15 7 6 2 3 1 0 4 5 13 12 N6|N4 9 8 12 13 6 7 3 2 1 0 4 5 14 15 11 10 N5|N3 6 7 3 2 0 1 5 4 12 13 9 8 10 11 15 14 N7|N6 12 13 5 4 0 1 3 2 6 7 15 14 10 11 9 8 N4|N5 10 11 15 14 5 4 0 1 2 3 7 6 13 12 8 9 N4|", .netGdrLevel = 5, }; diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index db22608f3f..4328c91c75 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -137,7 +137,6 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (a == NCCL_ALGO_RING && (p == NCCL_PROTO_LL || p == NCCL_PROTO_LL128)) busBw *= 0.05; if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE) (nNodes == 2) ? busBw *= 0.33 : busBw *= 0.11; if (a == NCCL_ALGO_TREE && (p == NCCL_PROTO_LL || p == NCCL_PROTO_LL128)) busBw *= 0.04; - if (gcn == 910 && p == NCCL_PROTO_LL && nNodes == 1 && nRanks == 16) busBw *= 5.9; if (gcn == 910 && a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && nNodes == 2 && nRanks == 32) busBw *= 3.2; #else if (compCap80) busBw = std::min(busBw, 235.0f); diff --git a/src/include/alloc.h b/src/include/alloc.h index 9307e048f7..49ef05d1e0 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -90,16 +90,6 @@ static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { return ncclSuccess; } -static bool hasFineGrainVramPcie() { - int *ptr; - if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) == hipSuccess) { - CUDACHECK(hipFree(ptr)); - return true; - } - else - return false; -} - // Allocate memory to be potentially ibv_reg_mr'd. 
This needs to be // allocated on separate pages as those pages will be marked DONTFORK // and if they are shared, that could cause a crash in a child process diff --git a/src/include/net.h b/src/include/net.h index 2c9a5ced9c..10a2d85432 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -43,7 +43,6 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) { NCCLCHECK(ncclNet->getProperties(dev, &props)); if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue; #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - if (!hasFineGrainVramPcie()) continue; *gdrSupport = 1; break; #endif diff --git a/src/include/transport.h b/src/include/transport.h index e64dfbf748..bd18ac4b0d 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -29,6 +29,7 @@ struct ncclPeerInfo { int rank; int cudaDev; int gdrSupport; + bool hasFineGrain; uint64_t hostHash; uint64_t pidHash; dev_t shmDev; diff --git a/src/init.cc b/src/init.cc index 7dfa0dbc19..83a2cc6ce2 100644 --- a/src/init.cc +++ b/src/init.cc @@ -640,7 +640,18 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->busId = comm->busId; - NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport)); + // detect if fine grained memory is available on this GPU + int *ptr; + if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) == hipSuccess) { + CUDACHECK(hipFree(ptr)); + info->hasFineGrain = true; + NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport)); + } + else { + info->hasFineGrain = false; + info->gdrSupport = 0; + } + return ncclSuccess; } diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 297697dc04..d98e18c8bc 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -58,7 +58,7 @@ int busIdToCudaDev(int64_t busId) { /* Determine if two peers can communicate through p2p */ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* 
info2) { #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - if (!hasFineGrainVramPcie()) { + if (!info1->hasFineGrain || !info2->hasFineGrain) { *ret = 0; return ncclSuccess; } @@ -90,17 +90,6 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop #endif } -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - int dev; - CUDACHECK(hipGetDevice(&dev)); - CUDACHECK(hipSetDevice(cudaDev2)); - if (!hasFineGrainVramPcie()) { - *ret = 0; - CUDACHECK(hipSetDevice(dev)); - return ncclSuccess; - } - CUDACHECK(hipSetDevice(dev)); -#endif // Check that CUDA can do P2P int p2p; if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess) { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9b00ddbdcb..83048c465e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -67,6 +67,13 @@ if(BUILD_TESTS) target_link_libraries(UnitTestsMultiProcess PRIVATE ${GTEST_BOTH_LIBRARIES}) target_link_libraries(UnitTestsMultiProcess PRIVATE hip::host hip::device) + find_program( rocminfo_executable rocminfo ) + execute_process(COMMAND bash "-c" "${rocminfo_executable} | grep 'Device Type' | grep GPU | wc -l | tr -d '\n'" OUTPUT_VARIABLE gtest_num_gpus) + if(NOT gtest_num_gpus MATCHES "^[0-9]+$" OR gtest_num_gpus LESS 2) # numeric clamp to a minimum of 2; an unanchored "0"/"1" regex would also catch 10, 16, ... and empty output would break the if() + set(gtest_num_gpus "2") + endif() + target_compile_options(UnitTests PRIVATE -DGTESTS_NUM_GPUS=${gtest_num_gpus}) + # UnitTests using static library of rccl requires passing rccl # through -l and -L instead of command line input. if(BUILD_STATIC) @@ -81,13 +88,13 @@ if(BUILD_TESTS) # HIPCC adds /opt/rocm/lib as RPATH, even though the install process is supposed to # remove RPATH. It also occurs before any user-specified rpath, which effectively overrides the user rpath. 
# As a work-around, set the correct RPATH for the unit test executable as a post-install step - if (CMAKE_INSTALL_PREFIX MATCHES "/opt/rocm*") + if (CMAKE_INSTALL_PREFIX MATCHES "${ROCM_PATH}") # install_prefix/CMAKE_INSTALL_PREFIX was not explicitly specified, so look in build/release - add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:/opt/rocm/lib ${CMAKE_BINARY_DIR}/test/UnitTests) - add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:/opt/rocm/lib ${CMAKE_BINARY_DIR}/test/UnitTestsMultiProcess) + add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:${ROCM_PATH}/lib ${CMAKE_BINARY_DIR}/test/UnitTests) + add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:${ROCM_PATH}/lib ${CMAKE_BINARY_DIR}/test/UnitTestsMultiProcess) else() - add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:/opt/rocm/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTests) - add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:/opt/rocm/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTestsMultiProcess) + add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTests) + add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTestsMultiProcess) endif() else() message("Not building unit tests") diff --git a/test/CorrectnessTest.hpp b/test/CorrectnessTest.hpp index 9df068c162..7bd6dbe0e2 100644 --- a/test/CorrectnessTest.hpp +++ b/test/CorrectnessTest.hpp @@ -812,7 +812,7 @@ dropback: case ncclUint64: isMatch &= (outputU8[j] == expectedU8[j]); break; case ncclFloat32: isMatch &= (fabs(outputF4[j] - expectedF4[j]) < 1e-5); break; case 
ncclFloat64: isMatch &= (fabs(outputF8[j] - expectedF8[j]) < 1e-12); break; - case ncclBfloat16: isMatch &= (fabs((float)outputB2[j] - (float)expectedB2[j]) < 5e-2); break; + case ncclBfloat16: isMatch &= (fabs((float)outputB2[j] - (float)expectedB2[j]) < 9e-2); break; default: fprintf(stderr, "[ERROR] Unsupported datatype\n"); exit(0); diff --git a/test/test_AllGather.cpp b/test/test_AllGather.cpp index 295892bbd8..280c1cea48 100644 --- a/test/test_AllGather.cpp +++ b/test/test_AllGather.cpp @@ -9,6 +9,8 @@ namespace CorrectnessTests { TEST_P(AllGatherCorrectnessTest, Correctness) { + // Adjust numElements to be multiple of numDevices + numElements = (numElements/numDevices)*numDevices; if (numDevices > numDevicesAvailable) return; if (numElements % numDevices != 0) return; @@ -107,7 +109,7 @@ namespace CorrectnessTests // Number of elements testing::Values(2520, 3026520), // Number of devices - testing::Values(2,3,4,5,6,7,8), + testing::Range(2,(GTESTS_NUM_GPUS+1)), // In-place or not testing::Values(false, true), testing::Values("")), diff --git a/test/test_AllReduce.cpp b/test/test_AllReduce.cpp index 0dab003c33..3a8697bcc6 100644 --- a/test/test_AllReduce.cpp +++ b/test/test_AllReduce.cpp @@ -46,7 +46,7 @@ namespace CorrectnessTests // Number of elements testing::Values(1024, 1048576), // Number of devices - testing::Values(2,3,4,5,6,7,8), + testing::Range(2,(GTESTS_NUM_GPUS+1)), // In-place or not testing::Values(false, true), testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")), @@ -71,7 +71,7 @@ namespace CorrectnessTests // Number of elements testing::Values(1024, 1048576), // Number of devices - testing::Values(2,3,4,5,6,7,8), + testing::Range(2,(GTESTS_NUM_GPUS+1)), // In-place or not testing::Values(false, true), testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")), diff --git a/test/test_AllReduceGroup.cpp b/test/test_AllReduceGroup.cpp index fe265a9c91..fb24ab7d94 100644 --- a/test/test_AllReduceGroup.cpp +++ 
b/test/test_AllReduceGroup.cpp @@ -58,7 +58,7 @@ namespace CorrectnessTests // Number of elements testing::Values(1024, 1048576), // Number of devices - testing::Values(2,3,4,5,6,7,8), + testing::Range(2,(GTESTS_NUM_GPUS+1)), // In-place or not testing::Values(false, true), testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")), @@ -74,7 +74,7 @@ namespace CorrectnessTests // Number of elements testing::Values(1024, 1048576), // Number of devices - testing::Values(2,3,4,5,6,7,8), + testing::Range(2,(GTESTS_NUM_GPUS+1)), // In-place or not testing::Values(false, true), testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")), diff --git a/test/test_AllToAll.cpp b/test/test_AllToAll.cpp index 17ab932978..8997a319c5 100644 --- a/test/test_AllToAll.cpp +++ b/test/test_AllToAll.cpp @@ -59,7 +59,7 @@ namespace CorrectnessTests // Number of elements testing::Values(1024, 1048576), // Number of devices - testing::Values(2,3,4,5,6,7,8), + testing::Range(2,(GTESTS_NUM_GPUS+1)), // In-place or not testing::Values(false), testing::Values("")), diff --git a/test/test_AllToAllv.cpp b/test/test_AllToAllv.cpp index 1d204fb30e..cb303c3679 100644 --- a/test/test_AllToAllv.cpp +++ b/test/test_AllToAllv.cpp @@ -67,7 +67,7 @@ namespace CorrectnessTests // Number of elements testing::Values(2520, 3026520), // Number of devices - testing::Values(2,3,4,5,6,7,8), + testing::Range(2,(GTESTS_NUM_GPUS+1)), // In-place or not testing::Values(false), testing::Values("")), diff --git a/test/test_Broadcast.cpp b/test/test_Broadcast.cpp index d273a67780..173f5f2c52 100644 --- a/test/test_Broadcast.cpp +++ b/test/test_Broadcast.cpp @@ -63,7 +63,7 @@ namespace CorrectnessTests // Number of elements testing::Values(1024, 1048576), // Number of devices - testing::Values(2,3,4,5,6,7,8), + testing::Range(2,(GTESTS_NUM_GPUS+1)), // In-place or not testing::Values(false, true), testing::Values("")), diff --git a/test/test_CombinedCalls.cpp b/test/test_CombinedCalls.cpp index 
278c1bf067..b951470606 100644 --- a/test/test_CombinedCalls.cpp +++ b/test/test_CombinedCalls.cpp @@ -27,6 +27,8 @@ namespace CorrectnessTests ncclFuncs.push_back(ncclCollReduce); ncclFuncs.push_back(ncclCollReduceScatter); + // Adjust numElements to be multiple of numDevices + numElements = (numElements/numDevices)*numDevices; for (int i = 0; i < datasets.size(); i++) { datasets[i].Initialize(numDevices, numElements, dataType, inPlace, ncclFuncs[i]); @@ -119,7 +121,7 @@ namespace CorrectnessTests // Number of elements testing::Values(2520, 3026520), // Number of devices - testing::Values(2,3,4,5,6,7,8), + testing::Range(2,(GTESTS_NUM_GPUS+1)), // In-place or not testing::Values(false), testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1", "RCCL_P2P_NET_DISABLE=0", "RCCL_P2P_NET_DISABLE=1")), diff --git a/test/test_Gather.cpp b/test/test_Gather.cpp index 837ec30ea7..8bf4edd6d5 100644 --- a/test/test_Gather.cpp +++ b/test/test_Gather.cpp @@ -63,7 +63,7 @@ namespace CorrectnessTests // Number of elements testing::Values(1024, 1048576), // Number of devices - testing::Values(2,3,4,5,6,7,8), + testing::Range(2,(GTESTS_NUM_GPUS+1)), // In-place or not testing::Values(false), testing::Values("")), diff --git a/test/test_GroupCalls.cpp b/test/test_GroupCalls.cpp index 1199846477..cb05ab6e5e 100644 --- a/test/test_GroupCalls.cpp +++ b/test/test_GroupCalls.cpp @@ -26,6 +26,8 @@ namespace CorrectnessTests ncclFuncs.push_back(ncclCollReduce); ncclFuncs.push_back(ncclCollReduceScatter); + // Adjust numElements to be multiple of numDevices + numElements = (numElements/numDevices)*numDevices; for (int i = 0; i < datasets.size(); i++) { datasets[i].Initialize(numDevices, numElements, dataType, inPlace, ncclFuncs[i]); @@ -120,7 +122,7 @@ namespace CorrectnessTests // Number of elements testing::Values(2520, 3026520), // Number of devices - testing::Values(2,3,4,5,6,7,8), + testing::Range(2,(GTESTS_NUM_GPUS+1)), // In-place or not testing::Values(false, true), 
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")), diff --git a/test/test_Reduce.cpp b/test/test_Reduce.cpp index 8927ba59c2..35b4576e9e 100644 --- a/test/test_Reduce.cpp +++ b/test/test_Reduce.cpp @@ -63,7 +63,7 @@ namespace CorrectnessTests // Number of elements testing::Values(1024, 1048576), // Number of devices - testing::Values(2,3,4,5,6,7,8), + testing::Range(2,(GTESTS_NUM_GPUS+1)), // In-place or not testing::Values(false, true), testing::Values("")), diff --git a/test/test_ReduceScatter.cpp b/test/test_ReduceScatter.cpp index bed3a5f5f3..7462a3ab6b 100644 --- a/test/test_ReduceScatter.cpp +++ b/test/test_ReduceScatter.cpp @@ -10,6 +10,8 @@ namespace CorrectnessTests { TEST_P(ReduceScatterCorrectnessTest, Correctness) { + // Adjust numElements to be multiple of numDevices + numElements = (numElements/numDevices)*numDevices; if (numDevices > numDevicesAvailable) return; if (numElements % numDevices != 0) return; @@ -61,7 +63,7 @@ namespace CorrectnessTests // Number of elements testing::Values(2520, 3026520), // Number of devices - testing::Values(2,3,4,5,6,7,8), + testing::Range(2,(GTESTS_NUM_GPUS+1)), // In-place or not testing::Values(false, true), testing::Values("")), diff --git a/test/test_Scatter.cpp b/test/test_Scatter.cpp index 82f4f9088b..8ceec37589 100644 --- a/test/test_Scatter.cpp +++ b/test/test_Scatter.cpp @@ -63,7 +63,7 @@ namespace CorrectnessTests // Number of elements testing::Values(1024, 1048576), // Number of devices - testing::Values(2,3,4,5,6,7,8), + testing::Range(2,(GTESTS_NUM_GPUS+1)), // In-place or not testing::Values(false), testing::Values("")), diff --git a/tools/TransferBench/TransferBench.cpp b/tools/TransferBench/TransferBench.cpp index e4c2a20b79..79a2e9bbe7 100644 --- a/tools/TransferBench/TransferBench.cpp +++ b/tools/TransferBench/TransferBench.cpp @@ -77,7 +77,8 @@ int main(int argc, char **argv) maxN = std::max(maxN, N); // Execute only peer to peer benchmark mode, similar to rocm-bandwidth-test - if 
(!strcmp(argv[1], "p2p")) + if (!strcmp(argv[1], "p2p") || !strcmp(argv[1], "p2p_rr") || + !strcmp(argv[1], "g2g") || !strcmp(argv[1], "g2g_rr")) { int numBlocksToUse = 0; if (argc > 3) @@ -85,8 +86,13 @@ int main(int argc, char **argv) else HIP_CALL(hipDeviceGetAttribute(&numBlocksToUse, hipDeviceAttributeMultiprocessorCount, 0)); + // Perform either local read (+remote write) [EXE = SRC] or + // remote read (+local write) [EXE = DST] + int readMode = (!strcmp(argv[1], "p2p_rr") || !strcmp(argv[1], "g2g_rr") ? 1 : 0); + int skipCpu = (!strcmp(argv[1], "g2g") || !strcmp(argv[1], "g2g_rr") ? 1 : 0); + // Execute peer to peer benchmark mode - RunPeerToPeerBenchmarks(ev, numBytesPerLink / sizeof(float), numBlocksToUse); + RunPeerToPeerBenchmarks(ev, numBytesPerLink / sizeof(float), numBlocksToUse, readMode, skipCpu); exit(0); } @@ -394,15 +400,20 @@ void DisplayUsage(char const* cmdName) HIP_CALL(hipGetDeviceCount(&numGpuDevices)); int const numCpuDevices = numa_num_configured_nodes(); - printf("Usage: %s configFile \n", cmdName); - - printf(" configFile: File containing Links to execute (see below for format)\n"); - printf(" Specifying \"p2p\" as the configFile will execute a peer to peer benchmark (3rd argument used as # CUs to use)\n"); - printf(" N : (Optional) Number of bytes to transfer per link.\n"); - printf(" If not specified, defaults to %lu bytes. 
Must be a multiple of 4 bytes\n", DEFAULT_BYTES_PER_LINK); - printf(" If 0 is specified, a range of Ns will be benchmarked\n"); - printf(" If a negative number is specified, a configFile gets generated with this number as default number of CUs per link\n"); - printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n"); + printf("Usage: %s config \n", cmdName); + printf(" config: Either:\n"); + printf(" - Filename of configFile containing Links to execute (see below for format)\n"); + printf(" - Name of preset benchmark:\n"); + printf(" p2p - All CPU/GPU pairs benchmark\n"); + printf(" p2p_rr - All CPU/GPU pairs benchmark with remote reads\n"); + printf(" g2g - All GPU/GPU pairs benchmark\n"); + printf(" g2g_rr - All GPU/GPU pairs benchmark with remote reads\n"); + printf(" - 3rd optional argument will be used as # of CUs to use (uses all by default)\n"); + printf(" N : (Optional) Number of bytes to transfer per link.\n"); + printf(" If not specified, defaults to %lu bytes. 
Must be a multiple of 4 bytes\n", DEFAULT_BYTES_PER_LINK); + printf(" If 0 is specified, a range of Ns will be benchmarked\n"); + printf(" If a negative number is specified, a configFile gets generated with this number as default number of CUs per link\n"); + printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n"); printf("\n"); printf("Configfile Format:\n"); printf("==================\n"); @@ -1104,7 +1115,7 @@ void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link) } } -void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse) +void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu) { // Collect the number of available CPUs/GPUs on this machine int numGpus; @@ -1122,42 +1133,44 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse) printf("Using %d CUs per transfer\n", numBlocksToUse); // Perform unidirectional / bidirectional - for (int readMode = 0; readMode < 2; readMode++) + for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++) { - for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++) + // Print header + printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write]\n", isBidirectional ? "Bi" : "Uni", + readMode == 0 ? "Local" : "Remote", + readMode == 0 ? "Remote" : "Local"); + printf("%10s", "D/D"); + if (!skipCpu) { - // Print header - printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write]\n", isBidirectional ? "Bi" : "Uni", - readMode == 0 ? "Local" : "Remote", - readMode == 0 ? 
"Remote" : "Local"); - printf("%10s", "D/D"); for (int i = 0; i < numCpus; i++) printf("%7s %02d", "CPU", i); - for (int i = 0; i < numGpus; i++) - printf("%7s %02d", "GPU", i); - printf("\n"); + } + for (int i = 0; i < numGpus; i++) + printf("%7s %02d", "GPU", i); + printf("\n"); - // Loop over all possible src/dst pairs - for (int src = 0; src < numDevices; src++) + // Loop over all possible src/dst pairs + for (int src = 0; src < numDevices; src++) + { + MemType const& srcMemType = (src < numCpus ? MEM_CPU : MEM_GPU); + if (skipCpu && srcMemType == MEM_CPU) continue; + int srcIndex = (srcMemType == MEM_CPU ? src : src - numCpus); + printf("%7s %02d", (srcMemType == MEM_CPU) ? "CPU" : "GPU", srcIndex); + for (int dst = 0; dst < numDevices; dst++) { - MemType const& srcMemType = (src < numCpus ? MEM_CPU : MEM_GPU); - int srcIndex = (srcMemType == MEM_CPU ? src : src - numCpus); - printf("%7s %02d", (srcMemType == MEM_CPU) ? "CPU" : "GPU", srcIndex); - for (int dst = 0; dst < numDevices; dst++) - { - MemType const& dstMemType = (dst < numCpus ? MEM_CPU : MEM_GPU); - int dstIndex = (dstMemType == MEM_CPU ? dst : dst - numCpus); - double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, srcMemType, srcIndex, dstMemType, dstIndex, readMode); - if (bandwidth == 0) - printf("%10s", "N/A"); - else - printf("%10.2f", bandwidth); - fflush(stdout); - } - printf("\n"); + MemType const& dstMemType = (dst < numCpus ? MEM_CPU : MEM_GPU); + if (skipCpu && dstMemType == MEM_CPU) continue; + int dstIndex = (dstMemType == MEM_CPU ? 
dst : dst - numCpus); + double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, srcMemType, srcIndex, dstMemType, dstIndex, readMode); + if (bandwidth == 0) + printf("%10s", "N/A"); + else + printf("%10.2f", bandwidth); + fflush(stdout); } printf("\n"); } + printf("\n"); } } diff --git a/tools/TransferBench/TransferBench.hpp b/tools/TransferBench/TransferBench.hpp index 88f91c8186..5328fdb74e 100644 --- a/tools/TransferBench/TransferBench.hpp +++ b/tools/TransferBench/TransferBench.hpp @@ -122,7 +122,7 @@ void DeallocateMemory(MemType memType, int devIndex, float* memPtr); void CheckPages(char* byteArray, size_t numBytes, int targetId); void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector const& fillPattern, float* ptr); void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link); -void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse); +void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu); double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional, MemType srcMemType, int srcIndex, MemType dstMemType, int dstIndex,