Merge remote-tracking branch 'origin/develop' into 2.11.4
This commit is contained in:
@@ -0,0 +1,84 @@
|
||||
arrayIndexThenCheck:src/bootstrap.cc:304
|
||||
arrayIndexThenCheck:src/debug.cc:88
|
||||
arrayIndexThenCheck:src/graph/search.cc:844
|
||||
arrayIndexThenCheck:src/graph/search.cc:916
|
||||
arrayIndexThenCheck:src/graph/search.cc:927
|
||||
clarifyCalculation:src/graph/topo.cc:702
|
||||
clarifyCalculation:src/graph/topo.cc:720
|
||||
clarifyCondition:src/enqueue.cc:416
|
||||
funcArgNamesDifferent:src/graph/topo.cc:135
|
||||
funcArgNamesDifferent:src/graph/topo.h:144
|
||||
nullPointerRedundantCheck:src/misc/utils.cc:102
|
||||
nullPointerRedundantCheck:src/misc/utils.cc:109
|
||||
nullPointerRedundantCheck:src/proxy.cc:143
|
||||
nullPointerRedundantCheck:src/proxy.cc:144
|
||||
nullPointerRedundantCheck:src/proxy.cc:147
|
||||
nullPointerRedundantCheck:src/proxy.cc:148
|
||||
nullPointerRedundantCheck:src/proxy.cc:149
|
||||
nullPointerRedundantCheck:src/proxy.cc:150
|
||||
nullPointerRedundantCheck:src/proxy.cc:151
|
||||
nullPointerRedundantCheck:src/proxy.cc:155
|
||||
nullPointerRedundantCheck:src/proxy.cc:159
|
||||
nullPointerRedundantCheck:src/proxy.cc:160
|
||||
nullPointerRedundantCheck:src/proxy.cc:161
|
||||
nullPointerRedundantCheck:src/proxy.cc:163
|
||||
nullPointerRedundantCheck:src/proxy.cc:165
|
||||
nullPointerRedundantCheck:src/proxy.cc:167
|
||||
nullPointerRedundantCheck:src/proxy.cc:168
|
||||
nullPointerRedundantCheck:src/proxy.cc:340
|
||||
nullPointerRedundantCheck:src/proxy.cc:342
|
||||
nullPointerRedundantCheck:src/proxy.cc:93
|
||||
nullPointerRedundantCheck:src/proxy.cc:94
|
||||
redundantAssignment:src/proxy.cc:161
|
||||
redundantAssignment:src/proxy.cc:163
|
||||
redundantCopy:src/graph/rings.cc:16
|
||||
redundantCopy:src/graph/rings.cc:17
|
||||
terminateStrncpy:src/misc/utils.cc:99
|
||||
terminateStrncpy:src/transport/net_socket.cc:245
|
||||
unreachableCode:src/transport/net.cc:555
|
||||
unreadVariable:src/graph/tuning.cc:109
|
||||
unreadVariable:src/graph/tuning.cc:110
|
||||
unreadVariable:src/graph/tuning.cc:113
|
||||
unusedFunction:src/graph/topo.cc:37
|
||||
unusedFunction:src/graph/topo.cc:836
|
||||
unusedFunction:src/misc/gdrwrap.cc:109
|
||||
unusedFunction:src/misc/gdrwrap.cc:117
|
||||
unusedFunction:src/misc/gdrwrap.cc:130
|
||||
unusedFunction:src/misc/gdrwrap.cc:144
|
||||
unusedFunction:src/misc/gdrwrap.cc:158
|
||||
unusedFunction:src/misc/gdrwrap.cc:172
|
||||
unusedFunction:src/misc/gdrwrap.cc:186
|
||||
unusedFunction:src/misc/gdrwrap.cc:200
|
||||
unusedFunction:src/misc/gdrwrap.cc:209
|
||||
unusedFunction:src/misc/gdrwrap.cc:218
|
||||
unusedFunction:src/misc/gdrwrap.cc:232
|
||||
unusedFunction:src/misc/gdrwrap.cc:52
|
||||
unusedFunction:src/misc/ibvwrap.cc:203
|
||||
unusedFunction:src/misc/ibvwrap.cc:239
|
||||
unusedFunction:src/misc/ibvwrap.cc:255
|
||||
unusedFunction:src/misc/nvmlwrap.cc:112
|
||||
unusedFunction:src/misc/nvmlwrap_stub.cc:31
|
||||
unusedFunction:src/misc/nvmlwrap_stub.cc:35
|
||||
unusedFunction:src/transport.cc:71
|
||||
unusedLabel:src/bootstrap.cc:349
|
||||
unusedLabel:src/clique/ShmObject.h:112
|
||||
unusedLabel:src/clique/ShmObject.h:204
|
||||
unusedLabel:src/enqueue.cc:108
|
||||
unusedLabel:src/enqueue.cc:1093
|
||||
unusedLabel:src/enqueue.cc:989
|
||||
unusedLabel:src/init.cc:1189
|
||||
unusedLabel:src/init.cc:1240
|
||||
unusedLabel:src/init.cc:1267
|
||||
unusedLabel:src/transport.cc:238
|
||||
unusedStructMember:src/graph/xml.cc:410
|
||||
unusedStructMember:src/graph/xml.cc:411
|
||||
unusedStructMember:src/graph/xml.cc:412
|
||||
unusedStructMember:src/graph/xml.cc:428
|
||||
unusedStructMember:src/graph/xml.cc:431
|
||||
unusedStructMember:src/graph/xml.cc:432
|
||||
unusedStructMember:src/graph/xml.cc:435
|
||||
unusedStructMember:src/graph/xml.cc:437
|
||||
variableScope:src/graph/search.cc:494
|
||||
variableScope:src/init.cc:240
|
||||
variableScope:src/transport/net_ib.cc:117
|
||||
variableScope:src/transport/net_socket.cc:431
|
||||
+2
-2
@@ -205,9 +205,9 @@ fi
|
||||
check_exit_code "$?"
|
||||
|
||||
if ($build_tests) || (($run_tests) && [[ ! -f ./test/UnitTests ]]); then
|
||||
CXX=$ROCM_BIN_PATH/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=$ROCM_PATH ../../.
|
||||
CXX=$ROCM_BIN_PATH/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=$ROCM_PATH -DROCM_PATH=$ROCM_PATH ../../.
|
||||
else
|
||||
CXX=$ROCM_BIN_PATH/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=$ROCM_PATH ../../.
|
||||
CXX=$ROCM_BIN_PATH/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=$ROCM_PATH -DROCM_PATH=$ROCM_PATH ../../.
|
||||
fi
|
||||
check_exit_code "$?"
|
||||
|
||||
|
||||
@@ -56,14 +56,26 @@ CliqueManager::CliqueManager(int const rank,
|
||||
cliqueMode_t const cliqueMode) :
|
||||
m_rank(rank),
|
||||
m_numRanks(numRanks),
|
||||
m_hash(0),
|
||||
m_cliqueMode(cliqueMode),
|
||||
m_opIndexHead(0),
|
||||
m_opIndexTail(0),
|
||||
m_init(false),
|
||||
m_gcnArch(0),
|
||||
m_allReduceByteLimit(0),
|
||||
m_pinnedCliquePtrs(NULL),
|
||||
m_fineGrainBarrierMem(NULL)
|
||||
{
|
||||
}
|
||||
m_gpuBarrierGlobalCount(NULL),
|
||||
m_gpuBarrierGlobalSense(NULL),
|
||||
m_gpuBarrierLocalSense(NULL),
|
||||
m_cpuBarrierCount(NULL),
|
||||
m_shmHandles(),
|
||||
m_ipcHandleSendCache(),
|
||||
m_ipcHandleRecvCache(),
|
||||
m_sharedCpuMemory(),
|
||||
m_sharedIpcHandle(),
|
||||
m_fineGrainBarrierMem(NULL),
|
||||
m_sharedBarrierCount(NULL)
|
||||
{}
|
||||
|
||||
CliqueManager::~CliqueManager()
|
||||
{
|
||||
@@ -128,11 +140,6 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
|
||||
WARN("Invalid rank specified. Expected 0 <= %d < %d for CliqueManager", m_rank, m_numRanks);
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
if (commId == NULL)
|
||||
{
|
||||
WARN("CommId should not be empty");
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
|
||||
// For now, opt-into clique based kernels via RCCL_ENABLE_CLIQUE env var
|
||||
if (!rcclParamEnableClique())
|
||||
@@ -350,7 +357,7 @@ ncclResult_t CliqueManager::GetNumChannelsToUse(ncclFunc_t const coll,
|
||||
ncclDataType_t const datatype,
|
||||
ncclRedOp_t const op,
|
||||
int const totalNumChannels,
|
||||
uint8_t* numChannelstoUse)
|
||||
uint8_t* numChannelstoUse) const
|
||||
{
|
||||
size_t const totalBytes = count * ncclTypeSize(datatype);
|
||||
*numChannelstoUse = 1;
|
||||
@@ -467,20 +474,6 @@ ncclResult_t CliqueManager::WaitForPointers(ncclWorkElem* args)
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
std::string HandleToString(hipIpcMemHandle_t handle)
|
||||
{
|
||||
char mapping[17] = "0123456789ABCDEF";
|
||||
std::string result;
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
unsigned char val = (unsigned char)handle.reserved[i];
|
||||
result += mapping[val / 16];
|
||||
result += mapping[val % 16];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
ncclResult_t CliqueManager::CheckCacheForPtr(void* devPtr,
|
||||
NcclIpcHandleSendCache* cache,
|
||||
int rank,
|
||||
|
||||
@@ -69,7 +69,7 @@ public:
|
||||
ncclDataType_t const datatype,
|
||||
ncclRedOp_t const op,
|
||||
int const totalNumChannels,
|
||||
uint8_t* numChannelstoUse);
|
||||
uint8_t* numChannelstoUse) const;
|
||||
|
||||
// Blocking call that only returns once the in-progress clique pointers are ready
|
||||
// This needs to be called in same order as DeclarePointers
|
||||
@@ -79,14 +79,14 @@ public:
|
||||
static ncclResult_t BootstrapRootInit(int pid, unsigned long hash);
|
||||
|
||||
protected:
|
||||
ncclResult_t CheckCacheForPtr(void* devPtr,
|
||||
NcclIpcHandleSendCache* cache,
|
||||
int rank,
|
||||
std::pair<hipIpcMemHandle_t, size_t>* handlePair);
|
||||
static ncclResult_t CheckCacheForPtr(void* devPtr,
|
||||
NcclIpcHandleSendCache* cache,
|
||||
int rank,
|
||||
std::pair<hipIpcMemHandle_t, size_t>* handlePair);
|
||||
|
||||
ncclResult_t CheckCacheForHandle(std::pair<hipIpcMemHandle_t, size_t> const& handlePair,
|
||||
NcclIpcHandleRecvCache* cache,
|
||||
void** ptr);
|
||||
static ncclResult_t CheckCacheForHandle(std::pair<hipIpcMemHandle_t, size_t> const& handlePair,
|
||||
NcclIpcHandleRecvCache* cache,
|
||||
void** ptr);
|
||||
|
||||
int m_rank; // Associated rank
|
||||
int m_numRanks; // Total number of ranks
|
||||
|
||||
@@ -114,7 +114,7 @@ private:
|
||||
|
||||
void updateHistory(const iterator& it)
|
||||
{
|
||||
if (m_lruHistory.size() > 0)
|
||||
if (!m_lruHistory.empty())
|
||||
{
|
||||
m_lruHistory.splice(m_lruHistory.end(), m_lruHistory, (it->second).second);
|
||||
}
|
||||
|
||||
@@ -28,7 +28,7 @@ THE SOFTWARE.
|
||||
#include "Hash.h"
|
||||
#include "shm.h"
|
||||
|
||||
NcclIpcHandleShm::NcclIpcHandleShm(int rank, int numRanks, int projid, int numHandlesPerRank, int capacity, std::string suffix) :
|
||||
NcclIpcHandleShm::NcclIpcHandleShm(int rank, int numRanks, int projid, int numHandlesPerRank, int capacity, std::string const& suffix) :
|
||||
ShmObject<std::pair<hipIpcMemHandle_t,size_t>>(numRanks * numHandlesPerRank * capacity * sizeof(std::pair<hipIpcMemHandle_t,size_t>),
|
||||
CliqueShmNames["IpcHandles"] + suffix,
|
||||
rank,
|
||||
@@ -39,7 +39,9 @@ NcclIpcHandleShm::NcclIpcHandleShm(int rank, int numRanks, int projid, int numHa
|
||||
{
|
||||
}
|
||||
|
||||
NcclIpcHandleShm::NcclIpcHandleShm()
|
||||
NcclIpcHandleShm::NcclIpcHandleShm() :
|
||||
m_numHandlesPerRank(0),
|
||||
m_numHandlesPerOpCount(0)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ THE SOFTWARE.
|
||||
class NcclIpcHandleShm : public ShmObject<std::pair<hipIpcMemHandle_t,size_t>>
|
||||
{
|
||||
public:
|
||||
NcclIpcHandleShm(int rank, int numRanks, int projid, int numHandlesPerRank, int capacity, std::string suffix);
|
||||
NcclIpcHandleShm(int rank, int numRanks, int projid, int numHandlesPerRank, int capacity, std::string const& suffix);
|
||||
|
||||
NcclIpcHandleShm();
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@ THE SOFTWARE.
|
||||
#define MSG_SIZE 1
|
||||
#define MSG_QUEUE_TIMEOUT 60
|
||||
|
||||
ncclResult_t MsgQueueGetId(std::string name, bool exclusive, mqd_t& mq_desc)
|
||||
ncclResult_t MsgQueueGetId(std::string const& name, bool exclusive, mqd_t& mq_desc)
|
||||
{
|
||||
int flag = (exclusive == true ? O_CREAT | O_EXCL : O_CREAT);
|
||||
struct mq_attr attr;
|
||||
@@ -83,7 +83,7 @@ ncclResult_t MsgQueueWaitUntilEmpty(mqd_t const& mq_desc)
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t MsgQueueClose(std::string name, mqd_t& mq_desc, bool unlink)
|
||||
ncclResult_t MsgQueueClose(std::string const& name, mqd_t& mq_desc, bool unlink)
|
||||
{
|
||||
if (unlink)
|
||||
{
|
||||
@@ -93,9 +93,9 @@ ncclResult_t MsgQueueClose(std::string name, mqd_t& mq_desc, bool unlink)
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t MsgQueueUnlink(std::string name)
|
||||
ncclResult_t MsgQueueUnlink(std::string const& name)
|
||||
{
|
||||
std::string mq_name = "/" + name;
|
||||
SYSCHECK(mq_unlink(mq_name.c_str()), "mq_unlink");
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -29,11 +29,11 @@ THE SOFTWARE.
|
||||
#include "nccl.h"
|
||||
#include "core.h"
|
||||
|
||||
ncclResult_t MsgQueueGetId(std::string name, bool exclusive, mqd_t& mq_desc);
|
||||
ncclResult_t MsgQueueGetId(std::string const& name, bool exclusive, mqd_t& mq_desc);
|
||||
ncclResult_t MsgQueueSend(mqd_t const& mq_desc, const char* msgp, size_t msgsz);
|
||||
ncclResult_t MsgQueueRecv(mqd_t const& mq_desc, char* msgp, size_t msgsz);
|
||||
ncclResult_t MsgQueueWaitUntilEmpty(mqd_t const& mq_desc);
|
||||
ncclResult_t MsgQueueClose(std::string name, mqd_t& mq_desc, bool unlink);
|
||||
ncclResult_t MsgQueueUnlink(std::string name);
|
||||
ncclResult_t MsgQueueClose(std::string const& name, mqd_t& mq_desc, bool unlink);
|
||||
ncclResult_t MsgQueueUnlink(std::string const& name);
|
||||
|
||||
#endif
|
||||
|
||||
+14
-10
@@ -53,8 +53,8 @@ template <typename T>
|
||||
class ShmObject
|
||||
{
|
||||
public:
|
||||
ShmObject(size_t size, std::string fileName, int rank, int numRanks, int projid) :
|
||||
m_shmSize(size),
|
||||
ShmObject(size_t size, std::string const& fileName, int rank, int numRanks, int projid) :
|
||||
m_shmSize(size),
|
||||
m_shmName(fileName),
|
||||
m_rank(rank),
|
||||
m_numRanks(numRanks),
|
||||
@@ -62,7 +62,14 @@ ShmObject(size_t size, std::string fileName, int rank, int numRanks, int projid)
|
||||
m_alloc(false),
|
||||
m_shmPtr(nullptr) {}
|
||||
|
||||
ShmObject() {}
|
||||
ShmObject() :
|
||||
m_shmSize(0),
|
||||
m_shmName(""),
|
||||
m_rank(0),
|
||||
m_numRanks(0),
|
||||
m_projid(0),
|
||||
m_alloc(false),
|
||||
m_shmPtr(nullptr) {}
|
||||
|
||||
~ShmObject() {}
|
||||
|
||||
@@ -82,7 +89,7 @@ ShmObject(size_t size, std::string fileName, int rank, int numRanks, int projid)
|
||||
return m_shmPtr;
|
||||
}
|
||||
protected:
|
||||
ncclResult_t BroadcastMessage(mqd_t& mq_desc, bool pass)
|
||||
ncclResult_t BroadcastMessage(mqd_t& mq_desc, bool pass) const
|
||||
{
|
||||
char msg_text[1];
|
||||
msg_text[0] = (pass == 0 ? 'F': 'P');
|
||||
@@ -112,11 +119,11 @@ dropback:
|
||||
template<class U>
|
||||
struct OpenTag{};
|
||||
|
||||
ncclResult_t InitIfSemaphore(OpenTag<int> tag);
|
||||
static ncclResult_t InitIfSemaphore(OpenTag<int> tag);
|
||||
ncclResult_t InitIfSemaphore(OpenTag<uint32_t> tag);
|
||||
ncclResult_t InitIfSemaphore(OpenTag<hipIpcMemHandle_t> tag);
|
||||
static ncclResult_t InitIfSemaphore(OpenTag<hipIpcMemHandle_t> tag);
|
||||
ncclResult_t InitIfSemaphore(OpenTag<sem_t> tag);
|
||||
ncclResult_t InitIfSemaphore(OpenTag<std::pair<hipIpcMemHandle_t,size_t>> tag);
|
||||
static ncclResult_t InitIfSemaphore(OpenTag<std::pair<hipIpcMemHandle_t,size_t>> tag);
|
||||
|
||||
size_t m_shmSize;
|
||||
std::string m_shmName;
|
||||
@@ -134,9 +141,6 @@ ncclResult_t ShmObject<T>::Open()
|
||||
if (m_alloc == false)
|
||||
{
|
||||
int shmFd;
|
||||
int protection = PROT_READ | PROT_WRITE;
|
||||
int visibility = MAP_SHARED;
|
||||
|
||||
INFO(NCCL_INIT, "Rank %d Initializing message queue for %s\n", m_rank, m_shmName.c_str());
|
||||
|
||||
NCCLCHECK(MsgQueueGetId(m_shmName, false, mq_desc));
|
||||
|
||||
@@ -391,7 +391,7 @@ static struct rcclRomeModel rome_model_59 = {
|
||||
.connMatrix = { 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, },
|
||||
.gdrLevel = { 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, },
|
||||
.pattern = "42424242",
|
||||
.ringBase = "N0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 N0|N1 3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 2 N1|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N3 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 N3|N4 8 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 N4|N5 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 N5|N6 12 13 5 4 0 1 3 2 6 7 15 14 10 11 9 8 N4|N7 14 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 N7|N3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 2 3 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N2 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 N2|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6|N5 10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N7 15 14 5 4 0 1 2 3 7 6 13 12 8 9 10 11 N5|N0 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 N0|N6 13 12 8 9 11 10 14 15 7 6 2 3 1 0 4 5 N2|",
|
||||
.ringBase = "N0 1 0 4 5 14 15 11 10 9 8 12 13 6 7 3 2 N1|N1 3 2 0 1 5 4 12 13 9 8 10 11 15 14 6 7 N3|N2 5 4 0 1 3 2 6 7 15 14 10 11 9 8 12 13 N6|N3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 2 3 N1|N5 11 10 8 9 13 12 4 5 1 0 2 3 7 6 14 15 N7|N6 13 12 8 9 11 10 14 15 7 6 2 3 1 0 4 5 N2|N2 4 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0 N0|N7 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 11 N5|N0 0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 N2|N1 2 3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 N0|N7 14 15 11 10 8 9 13 12 4 5 1 0 2 3 7 6 N3|N4 8 9 11 10 14 15 7 6 2 3 1 0 4 5 13 12 N6|N4 9 8 12 13 6 7 3 2 1 0 4 5 14 15 11 10 N5|N3 6 7 3 2 0 1 5 4 12 13 9 8 10 11 15 14 N7|N6 12 13 5 4 0 1 3 2 6 7 15 14 10 11 9 8 N4|N5 10 11 15 14 5 4 0 1 2 3 7 6 13 12 8 9 N4|",
|
||||
.netGdrLevel = -2,
|
||||
};
|
||||
|
||||
@@ -430,7 +430,7 @@ static struct rcclRomeModel rome_model_65 = {
|
||||
.connMatrix = { 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, },
|
||||
.gdrLevel = { 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, },
|
||||
.pattern = "42424242",
|
||||
.ringBase = "N0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 N0|N1 3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 2 N1|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N3 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 N3|N4 8 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 N4|N5 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 N5|N6 12 13 5 4 0 1 3 2 6 7 15 14 10 11 9 8 N4|N7 14 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 N7|N3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 2 3 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N2 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 N2|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6|N5 10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N7 15 14 5 4 0 1 2 3 7 6 13 12 8 9 10 11 N5|N0 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 N0|N6 13 12 8 9 11 10 14 15 7 6 2 3 1 0 4 5 N2|",
|
||||
.ringBase = "N0 1 0 4 5 14 15 11 10 9 8 12 13 6 7 3 2 N1|N1 3 2 0 1 5 4 12 13 9 8 10 11 15 14 6 7 N3|N2 5 4 0 1 3 2 6 7 15 14 10 11 9 8 12 13 N6|N3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 2 3 N1|N5 11 10 8 9 13 12 4 5 1 0 2 3 7 6 14 15 N7|N6 13 12 8 9 11 10 14 15 7 6 2 3 1 0 4 5 N2|N2 4 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0 N0|N7 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 11 N5|N0 0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 N2|N1 2 3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 N0|N7 14 15 11 10 8 9 13 12 4 5 1 0 2 3 7 6 N3|N4 8 9 11 10 14 15 7 6 2 3 1 0 4 5 13 12 N6|N4 9 8 12 13 6 7 3 2 1 0 4 5 14 15 11 10 N5|N3 6 7 3 2 0 1 5 4 12 13 9 8 10 11 15 14 N7|N6 12 13 5 4 0 1 3 2 6 7 15 14 10 11 9 8 N4|N5 10 11 15 14 5 4 0 1 2 3 7 6 13 12 8 9 N4|",
|
||||
.netGdrLevel = 5,
|
||||
};
|
||||
|
||||
|
||||
@@ -137,7 +137,6 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
if (a == NCCL_ALGO_RING && (p == NCCL_PROTO_LL || p == NCCL_PROTO_LL128)) busBw *= 0.05;
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE) (nNodes == 2) ? busBw *= 0.33 : busBw *= 0.11;
|
||||
if (a == NCCL_ALGO_TREE && (p == NCCL_PROTO_LL || p == NCCL_PROTO_LL128)) busBw *= 0.04;
|
||||
if (gcn == 910 && p == NCCL_PROTO_LL && nNodes == 1 && nRanks == 16) busBw *= 5.9;
|
||||
if (gcn == 910 && a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && nNodes == 2 && nRanks == 32) busBw *= 3.2;
|
||||
#else
|
||||
if (compCap80) busBw = std::min(busBw, 235.0f);
|
||||
|
||||
@@ -90,16 +90,6 @@ static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static bool hasFineGrainVramPcie() {
|
||||
int *ptr;
|
||||
if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) == hipSuccess) {
|
||||
CUDACHECK(hipFree(ptr));
|
||||
return true;
|
||||
}
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
|
||||
// allocated on separate pages as those pages will be marked DONTFORK
|
||||
// and if they are shared, that could cause a crash in a child process
|
||||
|
||||
@@ -43,7 +43,6 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
|
||||
NCCLCHECK(ncclNet->getProperties(dev, &props));
|
||||
if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
if (!hasFineGrainVramPcie()) continue;
|
||||
*gdrSupport = 1;
|
||||
break;
|
||||
#endif
|
||||
|
||||
@@ -29,6 +29,7 @@ struct ncclPeerInfo {
|
||||
int rank;
|
||||
int cudaDev;
|
||||
int gdrSupport;
|
||||
bool hasFineGrain;
|
||||
uint64_t hostHash;
|
||||
uint64_t pidHash;
|
||||
dev_t shmDev;
|
||||
|
||||
+12
-1
@@ -640,7 +640,18 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
|
||||
|
||||
info->busId = comm->busId;
|
||||
|
||||
NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport));
|
||||
// detect if fine grained memory is available on this GPU
|
||||
int *ptr;
|
||||
if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) == hipSuccess) {
|
||||
CUDACHECK(hipFree(ptr));
|
||||
info->hasFineGrain = true;
|
||||
NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport));
|
||||
}
|
||||
else {
|
||||
info->hasFineGrain = false;
|
||||
info->gdrSupport = 0;
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
+1
-12
@@ -58,7 +58,7 @@ int busIdToCudaDev(int64_t busId) {
|
||||
/* Determine if two peers can communicate through p2p */
|
||||
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
if (!hasFineGrainVramPcie()) {
|
||||
if (!info1->hasFineGrain || !info2->hasFineGrain) {
|
||||
*ret = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -90,17 +90,6 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
int dev;
|
||||
CUDACHECK(hipGetDevice(&dev));
|
||||
CUDACHECK(hipSetDevice(cudaDev2));
|
||||
if (!hasFineGrainVramPcie()) {
|
||||
*ret = 0;
|
||||
CUDACHECK(hipSetDevice(dev));
|
||||
return ncclSuccess;
|
||||
}
|
||||
CUDACHECK(hipSetDevice(dev));
|
||||
#endif
|
||||
// Check that CUDA can do P2P
|
||||
int p2p;
|
||||
if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess) {
|
||||
|
||||
+12
-5
@@ -67,6 +67,13 @@ if(BUILD_TESTS)
|
||||
target_link_libraries(UnitTestsMultiProcess PRIVATE ${GTEST_BOTH_LIBRARIES})
|
||||
target_link_libraries(UnitTestsMultiProcess PRIVATE hip::host hip::device)
|
||||
|
||||
find_program( rocminfo_executable rocminfo )
|
||||
execute_process(COMMAND bash "-c" "${rocminfo_executable} | grep 'Device Type' | grep GPU | wc -l | tr -d '\n'" OUTPUT_VARIABLE gtest_num_gpus)
|
||||
if(${gtest_num_gpus} MATCHES "0" OR ${gtest_num_gpus} MATCHES "1")
|
||||
set(gtest_num_gpus "2")
|
||||
endif()
|
||||
target_compile_options(UnitTests PRIVATE -DGTESTS_NUM_GPUS=${gtest_num_gpus})
|
||||
|
||||
# UnitTests using static library of rccl requires passing rccl
|
||||
# through -l and -L instead of command line input.
|
||||
if(BUILD_STATIC)
|
||||
@@ -81,13 +88,13 @@ if(BUILD_TESTS)
|
||||
# HIPCC adds /opt/rocm/lib as RPATH, even though the install process is supposed to
|
||||
# remove RPATH. It also occurs before any user-specified rpath, which effectively overrides the user rpath.
|
||||
# As a work-around, set the correct RPATH for the unit test executable as a post-install step
|
||||
if (CMAKE_INSTALL_PREFIX MATCHES "/opt/rocm*")
|
||||
if (CMAKE_INSTALL_PREFIX MATCHES "${ROCM_PATH}")
|
||||
# install_prefix/CMAKE_INSTALL_PREFIX was not explicitly specified, so look in build/release
|
||||
add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:/opt/rocm/lib ${CMAKE_BINARY_DIR}/test/UnitTests)
|
||||
add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:/opt/rocm/lib ${CMAKE_BINARY_DIR}/test/UnitTestsMultiProcess)
|
||||
add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:${ROCM_PATH}/lib ${CMAKE_BINARY_DIR}/test/UnitTests)
|
||||
add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:${ROCM_PATH}/lib ${CMAKE_BINARY_DIR}/test/UnitTestsMultiProcess)
|
||||
else()
|
||||
add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:/opt/rocm/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTests)
|
||||
add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:/opt/rocm/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTestsMultiProcess)
|
||||
add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTests)
|
||||
add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTestsMultiProcess)
|
||||
endif()
|
||||
else()
|
||||
message("Not building unit tests")
|
||||
|
||||
@@ -812,7 +812,7 @@ dropback:
|
||||
case ncclUint64: isMatch &= (outputU8[j] == expectedU8[j]); break;
|
||||
case ncclFloat32: isMatch &= (fabs(outputF4[j] - expectedF4[j]) < 1e-5); break;
|
||||
case ncclFloat64: isMatch &= (fabs(outputF8[j] - expectedF8[j]) < 1e-12); break;
|
||||
case ncclBfloat16: isMatch &= (fabs((float)outputB2[j] - (float)expectedB2[j]) < 5e-2); break;
|
||||
case ncclBfloat16: isMatch &= (fabs((float)outputB2[j] - (float)expectedB2[j]) < 9e-2); break;
|
||||
default:
|
||||
fprintf(stderr, "[ERROR] Unsupported datatype\n");
|
||||
exit(0);
|
||||
|
||||
@@ -9,6 +9,8 @@ namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(AllGatherCorrectnessTest, Correctness)
|
||||
{
|
||||
// Adjust numElements to be multiple of numDevices
|
||||
numElements = (numElements/numDevices)*numDevices;
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
if (numElements % numDevices != 0) return;
|
||||
|
||||
@@ -107,7 +109,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(2520, 3026520),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,5,6,7,8),
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
|
||||
@@ -46,7 +46,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,5,6,7,8),
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
|
||||
@@ -71,7 +71,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,5,6,7,8),
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
|
||||
|
||||
@@ -58,7 +58,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,5,6,7,8),
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
|
||||
@@ -74,7 +74,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,5,6,7,8),
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
|
||||
|
||||
@@ -59,7 +59,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,5,6,7,8),
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("")),
|
||||
|
||||
@@ -67,7 +67,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(2520, 3026520),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,5,6,7,8),
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("")),
|
||||
|
||||
@@ -63,7 +63,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,5,6,7,8),
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
|
||||
@@ -27,6 +27,8 @@ namespace CorrectnessTests
|
||||
ncclFuncs.push_back(ncclCollReduce);
|
||||
ncclFuncs.push_back(ncclCollReduceScatter);
|
||||
|
||||
// Adjust numElements to be multiple of numDevices
|
||||
numElements = (numElements/numDevices)*numDevices;
|
||||
for (int i = 0; i < datasets.size(); i++)
|
||||
{
|
||||
datasets[i].Initialize(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
|
||||
@@ -119,7 +121,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(2520, 3026520),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,5,6,7,8),
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1", "RCCL_P2P_NET_DISABLE=0", "RCCL_P2P_NET_DISABLE=1")),
|
||||
|
||||
@@ -63,7 +63,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,5,6,7,8),
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("")),
|
||||
|
||||
@@ -26,6 +26,8 @@ namespace CorrectnessTests
|
||||
ncclFuncs.push_back(ncclCollReduce);
|
||||
ncclFuncs.push_back(ncclCollReduceScatter);
|
||||
|
||||
// Adjust numElements to be multiple of numDevices
|
||||
numElements = (numElements/numDevices)*numDevices;
|
||||
for (int i = 0; i < datasets.size(); i++)
|
||||
{
|
||||
datasets[i].Initialize(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
|
||||
@@ -120,7 +122,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(2520, 3026520),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,5,6,7,8),
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
|
||||
|
||||
@@ -63,7 +63,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,5,6,7,8),
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
|
||||
@@ -10,6 +10,8 @@ namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(ReduceScatterCorrectnessTest, Correctness)
|
||||
{
|
||||
// Adjust numElements to be multiple of numDevices
|
||||
numElements = (numElements/numDevices)*numDevices;
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
if (numElements % numDevices != 0) return;
|
||||
|
||||
@@ -61,7 +63,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(2520, 3026520),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,5,6,7,8),
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
|
||||
@@ -63,7 +63,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,5,6,7,8),
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("")),
|
||||
|
||||
@@ -77,7 +77,8 @@ int main(int argc, char **argv)
|
||||
maxN = std::max(maxN, N);
|
||||
|
||||
// Execute only peer to peer benchmark mode, similar to rocm-bandwidth-test
|
||||
if (!strcmp(argv[1], "p2p"))
|
||||
if (!strcmp(argv[1], "p2p") || !strcmp(argv[1], "p2p_rr") ||
|
||||
!strcmp(argv[1], "g2g") || !strcmp(argv[1], "g2g_rr"))
|
||||
{
|
||||
int numBlocksToUse = 0;
|
||||
if (argc > 3)
|
||||
@@ -85,8 +86,13 @@ int main(int argc, char **argv)
|
||||
else
|
||||
HIP_CALL(hipDeviceGetAttribute(&numBlocksToUse, hipDeviceAttributeMultiprocessorCount, 0));
|
||||
|
||||
// Perform either local read (+remote write) [EXE = SRC] or
|
||||
// remote read (+local write) [EXE = DST]
|
||||
int readMode = (!strcmp(argv[1], "p2p_rr") || !strcmp(argv[1], "g2g_rr") ? 1 : 0);
|
||||
int skipCpu = (!strcmp(argv[1], "g2g") || !strcmp(argv[1], "g2g_rr") ? 1 : 0);
|
||||
|
||||
// Execute peer to peer benchmark mode
|
||||
RunPeerToPeerBenchmarks(ev, numBytesPerLink / sizeof(float), numBlocksToUse);
|
||||
RunPeerToPeerBenchmarks(ev, numBytesPerLink / sizeof(float), numBlocksToUse, readMode, skipCpu);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
@@ -394,15 +400,20 @@ void DisplayUsage(char const* cmdName)
|
||||
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
|
||||
int const numCpuDevices = numa_num_configured_nodes();
|
||||
|
||||
printf("Usage: %s configFile <N>\n", cmdName);
|
||||
|
||||
printf(" configFile: File containing Links to execute (see below for format)\n");
|
||||
printf(" Specifying \"p2p\" as the configFile will execute a peer to peer benchmark (3rd argument used as # CUs to use)\n");
|
||||
printf(" N : (Optional) Number of bytes to transfer per link.\n");
|
||||
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n", DEFAULT_BYTES_PER_LINK);
|
||||
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
|
||||
printf(" If a negative number is specified, a configFile gets generated with this number as default number of CUs per link\n");
|
||||
printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
|
||||
printf("Usage: %s config <N>\n", cmdName);
|
||||
printf(" config: Either:\n");
|
||||
printf(" - Filename of configFile containing Links to execute (see below for format)\n");
|
||||
printf(" - Name of preset benchmark:\n");
|
||||
printf(" p2p - All CPU/GPU pairs benchmark\n");
|
||||
printf(" p2p_rr - All CPU/GPU pairs benchmark with remote reads\n");
|
||||
printf(" g2g - All GPU/GPU pairs benchmark\n");
|
||||
printf(" g2g_rr - All GPU/GPU pairs benchmark with remote reads\n");
|
||||
printf(" - 3rd optional argument will be used as # of CUs to use (uses all by default)\n");
|
||||
printf(" N : (Optional) Number of bytes to transfer per link.\n");
|
||||
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n", DEFAULT_BYTES_PER_LINK);
|
||||
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
|
||||
printf(" If a negative number is specified, a configFile gets generated with this number as default number of CUs per link\n");
|
||||
printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
|
||||
printf("\n");
|
||||
printf("Configfile Format:\n");
|
||||
printf("==================\n");
|
||||
@@ -1104,7 +1115,7 @@ void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link)
|
||||
}
|
||||
}
|
||||
|
||||
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse)
|
||||
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu)
|
||||
{
|
||||
// Collect the number of available CPUs/GPUs on this machine
|
||||
int numGpus;
|
||||
@@ -1122,42 +1133,44 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse)
|
||||
printf("Using %d CUs per transfer\n", numBlocksToUse);
|
||||
|
||||
// Perform unidirectional / bidirectional
|
||||
for (int readMode = 0; readMode < 2; readMode++)
|
||||
for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++)
|
||||
{
|
||||
for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++)
|
||||
// Print header
|
||||
printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write]\n", isBidirectional ? "Bi" : "Uni",
|
||||
readMode == 0 ? "Local" : "Remote",
|
||||
readMode == 0 ? "Remote" : "Local");
|
||||
printf("%10s", "D/D");
|
||||
if (!skipCpu)
|
||||
{
|
||||
// Print header
|
||||
printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write]\n", isBidirectional ? "Bi" : "Uni",
|
||||
readMode == 0 ? "Local" : "Remote",
|
||||
readMode == 0 ? "Remote" : "Local");
|
||||
printf("%10s", "D/D");
|
||||
for (int i = 0; i < numCpus; i++)
|
||||
printf("%7s %02d", "CPU", i);
|
||||
for (int i = 0; i < numGpus; i++)
|
||||
printf("%7s %02d", "GPU", i);
|
||||
printf("\n");
|
||||
}
|
||||
for (int i = 0; i < numGpus; i++)
|
||||
printf("%7s %02d", "GPU", i);
|
||||
printf("\n");
|
||||
|
||||
// Loop over all possible src/dst pairs
|
||||
for (int src = 0; src < numDevices; src++)
|
||||
// Loop over all possible src/dst pairs
|
||||
for (int src = 0; src < numDevices; src++)
|
||||
{
|
||||
MemType const& srcMemType = (src < numCpus ? MEM_CPU : MEM_GPU);
|
||||
if (skipCpu && srcMemType == MEM_CPU) continue;
|
||||
int srcIndex = (srcMemType == MEM_CPU ? src : src - numCpus);
|
||||
printf("%7s %02d", (srcMemType == MEM_CPU) ? "CPU" : "GPU", srcIndex);
|
||||
for (int dst = 0; dst < numDevices; dst++)
|
||||
{
|
||||
MemType const& srcMemType = (src < numCpus ? MEM_CPU : MEM_GPU);
|
||||
int srcIndex = (srcMemType == MEM_CPU ? src : src - numCpus);
|
||||
printf("%7s %02d", (srcMemType == MEM_CPU) ? "CPU" : "GPU", srcIndex);
|
||||
for (int dst = 0; dst < numDevices; dst++)
|
||||
{
|
||||
MemType const& dstMemType = (dst < numCpus ? MEM_CPU : MEM_GPU);
|
||||
int dstIndex = (dstMemType == MEM_CPU ? dst : dst - numCpus);
|
||||
double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, srcMemType, srcIndex, dstMemType, dstIndex, readMode);
|
||||
if (bandwidth == 0)
|
||||
printf("%10s", "N/A");
|
||||
else
|
||||
printf("%10.2f", bandwidth);
|
||||
fflush(stdout);
|
||||
}
|
||||
printf("\n");
|
||||
MemType const& dstMemType = (dst < numCpus ? MEM_CPU : MEM_GPU);
|
||||
if (skipCpu && dstMemType == MEM_CPU) continue;
|
||||
int dstIndex = (dstMemType == MEM_CPU ? dst : dst - numCpus);
|
||||
double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, srcMemType, srcIndex, dstMemType, dstIndex, readMode);
|
||||
if (bandwidth == 0)
|
||||
printf("%10s", "N/A");
|
||||
else
|
||||
printf("%10.2f", bandwidth);
|
||||
fflush(stdout);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -122,7 +122,7 @@ void DeallocateMemory(MemType memType, int devIndex, float* memPtr);
|
||||
void CheckPages(char* byteArray, size_t numBytes, int targetId);
|
||||
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
|
||||
void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link);
|
||||
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse);
|
||||
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu);
|
||||
double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional,
|
||||
MemType srcMemType, int srcIndex,
|
||||
MemType dstMemType, int dstIndex,
|
||||
|
||||
Reference in New Issue
Block a user