Merge remote-tracking branch 'origin/develop' into 2.11.4

This commit is contained in:
Wenkai Du
2022-01-03 09:54:16 -08:00
melakukan 434ecb0e1f
33 mengubah file dengan 241 tambahan dan 141 penghapusan
+84
Melihat File
@@ -0,0 +1,84 @@
arrayIndexThenCheck:src/bootstrap.cc:304
arrayIndexThenCheck:src/debug.cc:88
arrayIndexThenCheck:src/graph/search.cc:844
arrayIndexThenCheck:src/graph/search.cc:916
arrayIndexThenCheck:src/graph/search.cc:927
clarifyCalculation:src/graph/topo.cc:702
clarifyCalculation:src/graph/topo.cc:720
clarifyCondition:src/enqueue.cc:416
funcArgNamesDifferent:src/graph/topo.cc:135
funcArgNamesDifferent:src/graph/topo.h:144
nullPointerRedundantCheck:src/misc/utils.cc:102
nullPointerRedundantCheck:src/misc/utils.cc:109
nullPointerRedundantCheck:src/proxy.cc:143
nullPointerRedundantCheck:src/proxy.cc:144
nullPointerRedundantCheck:src/proxy.cc:147
nullPointerRedundantCheck:src/proxy.cc:148
nullPointerRedundantCheck:src/proxy.cc:149
nullPointerRedundantCheck:src/proxy.cc:150
nullPointerRedundantCheck:src/proxy.cc:151
nullPointerRedundantCheck:src/proxy.cc:155
nullPointerRedundantCheck:src/proxy.cc:159
nullPointerRedundantCheck:src/proxy.cc:160
nullPointerRedundantCheck:src/proxy.cc:161
nullPointerRedundantCheck:src/proxy.cc:163
nullPointerRedundantCheck:src/proxy.cc:165
nullPointerRedundantCheck:src/proxy.cc:167
nullPointerRedundantCheck:src/proxy.cc:168
nullPointerRedundantCheck:src/proxy.cc:340
nullPointerRedundantCheck:src/proxy.cc:342
nullPointerRedundantCheck:src/proxy.cc:93
nullPointerRedundantCheck:src/proxy.cc:94
redundantAssignment:src/proxy.cc:161
redundantAssignment:src/proxy.cc:163
redundantCopy:src/graph/rings.cc:16
redundantCopy:src/graph/rings.cc:17
terminateStrncpy:src/misc/utils.cc:99
terminateStrncpy:src/transport/net_socket.cc:245
unreachableCode:src/transport/net.cc:555
unreadVariable:src/graph/tuning.cc:109
unreadVariable:src/graph/tuning.cc:110
unreadVariable:src/graph/tuning.cc:113
unusedFunction:src/graph/topo.cc:37
unusedFunction:src/graph/topo.cc:836
unusedFunction:src/misc/gdrwrap.cc:109
unusedFunction:src/misc/gdrwrap.cc:117
unusedFunction:src/misc/gdrwrap.cc:130
unusedFunction:src/misc/gdrwrap.cc:144
unusedFunction:src/misc/gdrwrap.cc:158
unusedFunction:src/misc/gdrwrap.cc:172
unusedFunction:src/misc/gdrwrap.cc:186
unusedFunction:src/misc/gdrwrap.cc:200
unusedFunction:src/misc/gdrwrap.cc:209
unusedFunction:src/misc/gdrwrap.cc:218
unusedFunction:src/misc/gdrwrap.cc:232
unusedFunction:src/misc/gdrwrap.cc:52
unusedFunction:src/misc/ibvwrap.cc:203
unusedFunction:src/misc/ibvwrap.cc:239
unusedFunction:src/misc/ibvwrap.cc:255
unusedFunction:src/misc/nvmlwrap.cc:112
unusedFunction:src/misc/nvmlwrap_stub.cc:31
unusedFunction:src/misc/nvmlwrap_stub.cc:35
unusedFunction:src/transport.cc:71
unusedLabel:src/bootstrap.cc:349
unusedLabel:src/clique/ShmObject.h:112
unusedLabel:src/clique/ShmObject.h:204
unusedLabel:src/enqueue.cc:108
unusedLabel:src/enqueue.cc:1093
unusedLabel:src/enqueue.cc:989
unusedLabel:src/init.cc:1189
unusedLabel:src/init.cc:1240
unusedLabel:src/init.cc:1267
unusedLabel:src/transport.cc:238
unusedStructMember:src/graph/xml.cc:410
unusedStructMember:src/graph/xml.cc:411
unusedStructMember:src/graph/xml.cc:412
unusedStructMember:src/graph/xml.cc:428
unusedStructMember:src/graph/xml.cc:431
unusedStructMember:src/graph/xml.cc:432
unusedStructMember:src/graph/xml.cc:435
unusedStructMember:src/graph/xml.cc:437
variableScope:src/graph/search.cc:494
variableScope:src/init.cc:240
variableScope:src/transport/net_ib.cc:117
variableScope:src/transport/net_socket.cc:431
+2 -2
Melihat File
@@ -205,9 +205,9 @@ fi
check_exit_code "$?"
if ($build_tests) || (($run_tests) && [[ ! -f ./test/UnitTests ]]); then
CXX=$ROCM_BIN_PATH/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=$ROCM_PATH ../../.
CXX=$ROCM_BIN_PATH/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=$ROCM_PATH -DROCM_PATH=$ROCM_PATH ../../.
else
CXX=$ROCM_BIN_PATH/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=$ROCM_PATH ../../.
CXX=$ROCM_BIN_PATH/$compiler $cmake_executable $cmake_common_options -DBUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=$ROCM_PATH -DROCM_PATH=$ROCM_PATH ../../.
fi
check_exit_code "$?"
+16 -23
Melihat File
@@ -56,14 +56,26 @@ CliqueManager::CliqueManager(int const rank,
cliqueMode_t const cliqueMode) :
m_rank(rank),
m_numRanks(numRanks),
m_hash(0),
m_cliqueMode(cliqueMode),
m_opIndexHead(0),
m_opIndexTail(0),
m_init(false),
m_gcnArch(0),
m_allReduceByteLimit(0),
m_pinnedCliquePtrs(NULL),
m_fineGrainBarrierMem(NULL)
{
}
m_gpuBarrierGlobalCount(NULL),
m_gpuBarrierGlobalSense(NULL),
m_gpuBarrierLocalSense(NULL),
m_cpuBarrierCount(NULL),
m_shmHandles(),
m_ipcHandleSendCache(),
m_ipcHandleRecvCache(),
m_sharedCpuMemory(),
m_sharedIpcHandle(),
m_fineGrainBarrierMem(NULL),
m_sharedBarrierCount(NULL)
{}
CliqueManager::~CliqueManager()
{
@@ -128,11 +140,6 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
WARN("Invalid rank specified. Expected 0 <= %d < %d for CliqueManager", m_rank, m_numRanks);
return ncclInvalidUsage;
}
if (commId == NULL)
{
WARN("CommId should not be empty");
return ncclInvalidUsage;
}
// For now, opt-into clique based kernels via RCCL_ENABLE_CLIQUE env var
if (!rcclParamEnableClique())
@@ -350,7 +357,7 @@ ncclResult_t CliqueManager::GetNumChannelsToUse(ncclFunc_t const coll,
ncclDataType_t const datatype,
ncclRedOp_t const op,
int const totalNumChannels,
uint8_t* numChannelstoUse)
uint8_t* numChannelstoUse) const
{
size_t const totalBytes = count * ncclTypeSize(datatype);
*numChannelstoUse = 1;
@@ -467,20 +474,6 @@ ncclResult_t CliqueManager::WaitForPointers(ncclWorkElem* args)
return ncclSuccess;
}
// Renders the first four bytes of the handle's `reserved` field as eight
// uppercase hexadecimal characters (two characters per byte, high nibble first).
std::string HandleToString(hipIpcMemHandle_t handle)
{
  static char const hexDigits[17] = "0123456789ABCDEF";
  int const numBytesShown = 4;

  std::string text;
  for (int byteIdx = 0; byteIdx < numBytesShown; ++byteIdx)
  {
    // Go through unsigned char so bytes >= 0x80 index the table correctly
    unsigned char const byteVal = (unsigned char)handle.reserved[byteIdx];
    text.push_back(hexDigits[byteVal >> 4]);    // high nibble
    text.push_back(hexDigits[byteVal & 0x0F]);  // low nibble
  }
  return text;
}
ncclResult_t CliqueManager::CheckCacheForPtr(void* devPtr,
NcclIpcHandleSendCache* cache,
int rank,
+8 -8
Melihat File
@@ -69,7 +69,7 @@ public:
ncclDataType_t const datatype,
ncclRedOp_t const op,
int const totalNumChannels,
uint8_t* numChannelstoUse);
uint8_t* numChannelstoUse) const;
// Blocking call that only returns once the in-progress clique pointers are ready
// This needs to be called in same order as DeclarePointers
@@ -79,14 +79,14 @@ public:
static ncclResult_t BootstrapRootInit(int pid, unsigned long hash);
protected:
ncclResult_t CheckCacheForPtr(void* devPtr,
NcclIpcHandleSendCache* cache,
int rank,
std::pair<hipIpcMemHandle_t, size_t>* handlePair);
static ncclResult_t CheckCacheForPtr(void* devPtr,
NcclIpcHandleSendCache* cache,
int rank,
std::pair<hipIpcMemHandle_t, size_t>* handlePair);
ncclResult_t CheckCacheForHandle(std::pair<hipIpcMemHandle_t, size_t> const& handlePair,
NcclIpcHandleRecvCache* cache,
void** ptr);
static ncclResult_t CheckCacheForHandle(std::pair<hipIpcMemHandle_t, size_t> const& handlePair,
NcclIpcHandleRecvCache* cache,
void** ptr);
int m_rank; // Associated rank
int m_numRanks; // Total number of ranks
+1 -1
Melihat File
@@ -114,7 +114,7 @@ private:
void updateHistory(const iterator& it)
{
if (m_lruHistory.size() > 0)
if (!m_lruHistory.empty())
{
m_lruHistory.splice(m_lruHistory.end(), m_lruHistory, (it->second).second);
}
+4 -2
Melihat File
@@ -28,7 +28,7 @@ THE SOFTWARE.
#include "Hash.h"
#include "shm.h"
NcclIpcHandleShm::NcclIpcHandleShm(int rank, int numRanks, int projid, int numHandlesPerRank, int capacity, std::string suffix) :
NcclIpcHandleShm::NcclIpcHandleShm(int rank, int numRanks, int projid, int numHandlesPerRank, int capacity, std::string const& suffix) :
ShmObject<std::pair<hipIpcMemHandle_t,size_t>>(numRanks * numHandlesPerRank * capacity * sizeof(std::pair<hipIpcMemHandle_t,size_t>),
CliqueShmNames["IpcHandles"] + suffix,
rank,
@@ -39,7 +39,9 @@ NcclIpcHandleShm::NcclIpcHandleShm(int rank, int numRanks, int projid, int numHa
{
}
NcclIpcHandleShm::NcclIpcHandleShm()
// Default constructor: both per-rank and per-op-count handle counters start at zero.
NcclIpcHandleShm::NcclIpcHandleShm()
  : m_numHandlesPerRank(0)
  , m_numHandlesPerOpCount(0)
{}
+1 -1
Melihat File
@@ -33,7 +33,7 @@ THE SOFTWARE.
class NcclIpcHandleShm : public ShmObject<std::pair<hipIpcMemHandle_t,size_t>>
{
public:
NcclIpcHandleShm(int rank, int numRanks, int projid, int numHandlesPerRank, int capacity, std::string suffix);
NcclIpcHandleShm(int rank, int numRanks, int projid, int numHandlesPerRank, int capacity, std::string const& suffix);
NcclIpcHandleShm();
+4 -4
Melihat File
@@ -28,7 +28,7 @@ THE SOFTWARE.
#define MSG_SIZE 1
#define MSG_QUEUE_TIMEOUT 60
ncclResult_t MsgQueueGetId(std::string name, bool exclusive, mqd_t& mq_desc)
ncclResult_t MsgQueueGetId(std::string const& name, bool exclusive, mqd_t& mq_desc)
{
int flag = (exclusive == true ? O_CREAT | O_EXCL : O_CREAT);
struct mq_attr attr;
@@ -83,7 +83,7 @@ ncclResult_t MsgQueueWaitUntilEmpty(mqd_t const& mq_desc)
return ncclSuccess;
}
ncclResult_t MsgQueueClose(std::string name, mqd_t& mq_desc, bool unlink)
ncclResult_t MsgQueueClose(std::string const& name, mqd_t& mq_desc, bool unlink)
{
if (unlink)
{
@@ -93,9 +93,9 @@ ncclResult_t MsgQueueClose(std::string name, mqd_t& mq_desc, bool unlink)
return ncclSuccess;
}
ncclResult_t MsgQueueUnlink(std::string name)
// Unlinks the message queue whose name is "/" followed by `name`.
// Returns ncclSuccess when SYSCHECK does not bail out on the mq_unlink call.
ncclResult_t MsgQueueUnlink(std::string const& name)
{
  // Build the slash-prefixed queue name handed to mq_unlink
  std::string mq_name("/");
  mq_name += name;

  SYSCHECK(mq_unlink(mq_name.c_str()), "mq_unlink");
  return ncclSuccess;
}
}
+3 -3
Melihat File
@@ -29,11 +29,11 @@ THE SOFTWARE.
#include "nccl.h"
#include "core.h"
ncclResult_t MsgQueueGetId(std::string name, bool exclusive, mqd_t& mq_desc);
ncclResult_t MsgQueueGetId(std::string const& name, bool exclusive, mqd_t& mq_desc);
ncclResult_t MsgQueueSend(mqd_t const& mq_desc, const char* msgp, size_t msgsz);
ncclResult_t MsgQueueRecv(mqd_t const& mq_desc, char* msgp, size_t msgsz);
ncclResult_t MsgQueueWaitUntilEmpty(mqd_t const& mq_desc);
ncclResult_t MsgQueueClose(std::string name, mqd_t& mq_desc, bool unlink);
ncclResult_t MsgQueueUnlink(std::string name);
ncclResult_t MsgQueueClose(std::string const& name, mqd_t& mq_desc, bool unlink);
ncclResult_t MsgQueueUnlink(std::string const& name);
#endif
+14 -10
Melihat File
@@ -53,8 +53,8 @@ template <typename T>
class ShmObject
{
public:
ShmObject(size_t size, std::string fileName, int rank, int numRanks, int projid) :
m_shmSize(size),
ShmObject(size_t size, std::string const& fileName, int rank, int numRanks, int projid) :
m_shmSize(size),
m_shmName(fileName),
m_rank(rank),
m_numRanks(numRanks),
@@ -62,7 +62,14 @@ ShmObject(size_t size, std::string fileName, int rank, int numRanks, int projid)
m_alloc(false),
m_shmPtr(nullptr) {}
ShmObject() {}
// Default constructor: zero size, empty name, rank / numRanks / projid of 0,
// m_alloc cleared and no shared-memory pointer.
ShmObject()
  : m_shmSize(0)
  , m_shmName()
  , m_rank(0)
  , m_numRanks(0)
  , m_projid(0)
  , m_alloc(false)
  , m_shmPtr(nullptr)
{}
~ShmObject() {}
@@ -82,7 +89,7 @@ ShmObject(size_t size, std::string fileName, int rank, int numRanks, int projid)
return m_shmPtr;
}
protected:
ncclResult_t BroadcastMessage(mqd_t& mq_desc, bool pass)
ncclResult_t BroadcastMessage(mqd_t& mq_desc, bool pass) const
{
char msg_text[1];
msg_text[0] = (pass == 0 ? 'F': 'P');
@@ -112,11 +119,11 @@ dropback:
template<class U>
struct OpenTag{};
ncclResult_t InitIfSemaphore(OpenTag<int> tag);
static ncclResult_t InitIfSemaphore(OpenTag<int> tag);
ncclResult_t InitIfSemaphore(OpenTag<uint32_t> tag);
ncclResult_t InitIfSemaphore(OpenTag<hipIpcMemHandle_t> tag);
static ncclResult_t InitIfSemaphore(OpenTag<hipIpcMemHandle_t> tag);
ncclResult_t InitIfSemaphore(OpenTag<sem_t> tag);
ncclResult_t InitIfSemaphore(OpenTag<std::pair<hipIpcMemHandle_t,size_t>> tag);
static ncclResult_t InitIfSemaphore(OpenTag<std::pair<hipIpcMemHandle_t,size_t>> tag);
size_t m_shmSize;
std::string m_shmName;
@@ -134,9 +141,6 @@ ncclResult_t ShmObject<T>::Open()
if (m_alloc == false)
{
int shmFd;
int protection = PROT_READ | PROT_WRITE;
int visibility = MAP_SHARED;
INFO(NCCL_INIT, "Rank %d Initializing message queue for %s\n", m_rank, m_shmName.c_str());
NCCLCHECK(MsgQueueGetId(m_shmName, false, mq_desc));
+2 -2
Melihat File
@@ -391,7 +391,7 @@ static struct rcclRomeModel rome_model_59 = {
.connMatrix = { 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, },
.gdrLevel = { 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, },
.pattern = "42424242",
.ringBase = "N0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 N0|N1 3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 2 N1|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N3 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 N3|N4 8 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 N4|N5 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 N5|N6 12 13 5 4 0 1 3 2 6 7 15 14 10 11 9 8 N4|N7 14 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 N7|N3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 2 3 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N2 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 N2|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6|N5 10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N7 15 14 5 4 0 1 2 3 7 6 13 12 8 9 10 11 N5|N0 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 N0|N6 13 12 8 9 11 10 14 15 7 6 2 3 1 0 4 5 N2|",
.ringBase = "N0 1 0 4 5 14 15 11 10 9 8 12 13 6 7 3 2 N1|N1 3 2 0 1 5 4 12 13 9 8 10 11 15 14 6 7 N3|N2 5 4 0 1 3 2 6 7 15 14 10 11 9 8 12 13 N6|N3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 2 3 N1|N5 11 10 8 9 13 12 4 5 1 0 2 3 7 6 14 15 N7|N6 13 12 8 9 11 10 14 15 7 6 2 3 1 0 4 5 N2|N2 4 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0 N0|N7 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 11 N5|N0 0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 N2|N1 2 3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 N0|N7 14 15 11 10 8 9 13 12 4 5 1 0 2 3 7 6 N3|N4 8 9 11 10 14 15 7 6 2 3 1 0 4 5 13 12 N6|N4 9 8 12 13 6 7 3 2 1 0 4 5 14 15 11 10 N5|N3 6 7 3 2 0 1 5 4 12 13 9 8 10 11 15 14 N7|N6 12 13 5 4 0 1 3 2 6 7 15 14 10 11 9 8 N4|N5 10 11 15 14 5 4 0 1 2 3 7 6 13 12 8 9 N4|",
.netGdrLevel = -2,
};
@@ -430,7 +430,7 @@ static struct rcclRomeModel rome_model_65 = {
.connMatrix = { 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, },
.gdrLevel = { 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, },
.pattern = "42424242",
.ringBase = "N0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 N0|N1 3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 2 N1|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N3 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 N3|N4 8 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 N4|N5 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 N5|N6 12 13 5 4 0 1 3 2 6 7 15 14 10 11 9 8 N4|N7 14 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 N7|N3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 2 3 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N2 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 N2|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6|N5 10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N7 15 14 5 4 0 1 2 3 7 6 13 12 8 9 10 11 N5|N0 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 N0|N6 13 12 8 9 11 10 14 15 7 6 2 3 1 0 4 5 N2|",
.ringBase = "N0 1 0 4 5 14 15 11 10 9 8 12 13 6 7 3 2 N1|N1 3 2 0 1 5 4 12 13 9 8 10 11 15 14 6 7 N3|N2 5 4 0 1 3 2 6 7 15 14 10 11 9 8 12 13 N6|N3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 2 3 N1|N5 11 10 8 9 13 12 4 5 1 0 2 3 7 6 14 15 N7|N6 13 12 8 9 11 10 14 15 7 6 2 3 1 0 4 5 N2|N2 4 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0 N0|N7 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 11 N5|N0 0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 N2|N1 2 3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 N0|N7 14 15 11 10 8 9 13 12 4 5 1 0 2 3 7 6 N3|N4 8 9 11 10 14 15 7 6 2 3 1 0 4 5 13 12 N6|N4 9 8 12 13 6 7 3 2 1 0 4 5 14 15 11 10 N5|N3 6 7 3 2 0 1 5 4 12 13 9 8 10 11 15 14 N7|N6 12 13 5 4 0 1 3 2 6 7 15 14 10 11 9 8 N4|N5 10 11 15 14 5 4 0 1 2 3 7 6 13 12 8 9 N4|",
.netGdrLevel = 5,
};
-1
Melihat File
@@ -137,7 +137,6 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (a == NCCL_ALGO_RING && (p == NCCL_PROTO_LL || p == NCCL_PROTO_LL128)) busBw *= 0.05;
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE) (nNodes == 2) ? busBw *= 0.33 : busBw *= 0.11;
if (a == NCCL_ALGO_TREE && (p == NCCL_PROTO_LL || p == NCCL_PROTO_LL128)) busBw *= 0.04;
if (gcn == 910 && p == NCCL_PROTO_LL && nNodes == 1 && nRanks == 16) busBw *= 5.9;
if (gcn == 910 && a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && nNodes == 2 && nRanks == 32) busBw *= 3.2;
#else
if (compCap80) busBw = std::min(busBw, 235.0f);
-10
Melihat File
@@ -90,16 +90,6 @@ static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
return ncclSuccess;
}
// Probes for fine-grained device memory by attempting a sizeof(int) allocation
// with the hipDeviceMallocFinegrained flag. The probe buffer is released
// immediately; only the success of the allocation is reported.
// NOTE(review): CUDACHECK is defined elsewhere. If it early-returns an error
// code when hipFree fails, that value would convert to `true` in this
// bool-returning function -- confirm against the macro definition.
static bool hasFineGrainVramPcie() {
  int *ptr;
  const bool allocated =
    (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) == hipSuccess);
  if (!allocated)
    return false;

  CUDACHECK(hipFree(ptr));
  return true;
}
// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
// allocated on separate pages as those pages will be marked DONTFORK
// and if they are shared, that could cause a crash in a child process
-1
Melihat File
@@ -43,7 +43,6 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
NCCLCHECK(ncclNet->getProperties(dev, &props));
if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if (!hasFineGrainVramPcie()) continue;
*gdrSupport = 1;
break;
#endif
+1
Melihat File
@@ -29,6 +29,7 @@ struct ncclPeerInfo {
int rank;
int cudaDev;
int gdrSupport;
bool hasFineGrain;
uint64_t hostHash;
uint64_t pidHash;
dev_t shmDev;
+12 -1
Melihat File
@@ -640,7 +640,18 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
info->busId = comm->busId;
NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport));
// detect if fine grained memory is available on this GPU
int *ptr;
if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) == hipSuccess) {
CUDACHECK(hipFree(ptr));
info->hasFineGrain = true;
NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport));
}
else {
info->hasFineGrain = false;
info->gdrSupport = 0;
}
return ncclSuccess;
}
+1 -12
Melihat File
@@ -58,7 +58,7 @@ int busIdToCudaDev(int64_t busId) {
/* Determine if two peers can communicate through p2p */
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if (!hasFineGrainVramPcie()) {
if (!info1->hasFineGrain || !info2->hasFineGrain) {
*ret = 0;
return ncclSuccess;
}
@@ -90,17 +90,6 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
#endif
}
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
int dev;
CUDACHECK(hipGetDevice(&dev));
CUDACHECK(hipSetDevice(cudaDev2));
if (!hasFineGrainVramPcie()) {
*ret = 0;
CUDACHECK(hipSetDevice(dev));
return ncclSuccess;
}
CUDACHECK(hipSetDevice(dev));
#endif
// Check that CUDA can do P2P
int p2p;
if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess) {
+12 -5
Melihat File
@@ -67,6 +67,13 @@ if(BUILD_TESTS)
target_link_libraries(UnitTestsMultiProcess PRIVATE ${GTEST_BOTH_LIBRARIES})
target_link_libraries(UnitTestsMultiProcess PRIVATE hip::host hip::device)
# Count the GPUs reported by rocminfo on the build machine; the result is passed
# to the UnitTests target as the GTESTS_NUM_GPUS compile definition.
find_program( rocminfo_executable rocminfo )
set(gtest_num_gpus "")
if(rocminfo_executable)
  execute_process(COMMAND bash "-c" "${rocminfo_executable} | grep 'Device Type' | grep GPU | wc -l | tr -d '\n'" OUTPUT_VARIABLE gtest_num_gpus)
endif()
# Fall back to 2 when rocminfo is unavailable, produced no numeric count, or
# reported fewer than two GPUs. A numeric comparison is used on purpose:
# MATCHES is a regex search, so MATCHES "1" would also catch 10-19 (and
# MATCHES "0" would catch 10, 20, ...), wrongly capping larger machines at 2.
if(NOT gtest_num_gpus MATCHES "^[0-9]+$" OR gtest_num_gpus LESS 2)
  set(gtest_num_gpus "2")
endif()
target_compile_options(UnitTests PRIVATE -DGTESTS_NUM_GPUS=${gtest_num_gpus})
# UnitTests using static library of rccl requires passing rccl
# through -l and -L instead of command line input.
if(BUILD_STATIC)
@@ -81,13 +88,13 @@ if(BUILD_TESTS)
# HIPCC adds /opt/rocm/lib as RPATH, even though the install process is supposed to
# remove RPATH. It also occurs before any user-specified rpath, which effectively overrides the user rpath.
# As a work-around, set the correct RPATH for the unit test executable as a post-install step
if (CMAKE_INSTALL_PREFIX MATCHES "/opt/rocm*")
if (CMAKE_INSTALL_PREFIX MATCHES "${ROCM_PATH}")
# install_prefix/CMAKE_INSTALL_PREFIX was not explicitly specified, so look in build/release
add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:/opt/rocm/lib ${CMAKE_BINARY_DIR}/test/UnitTests)
add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:/opt/rocm/lib ${CMAKE_BINARY_DIR}/test/UnitTestsMultiProcess)
add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:${ROCM_PATH}/lib ${CMAKE_BINARY_DIR}/test/UnitTests)
add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:${ROCM_PATH}/lib ${CMAKE_BINARY_DIR}/test/UnitTestsMultiProcess)
else()
add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:/opt/rocm/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTests)
add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:/opt/rocm/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTestsMultiProcess)
add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTests)
add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTestsMultiProcess)
endif()
else()
message("Not building unit tests")
+1 -1
Melihat File
@@ -812,7 +812,7 @@ dropback:
case ncclUint64: isMatch &= (outputU8[j] == expectedU8[j]); break;
case ncclFloat32: isMatch &= (fabs(outputF4[j] - expectedF4[j]) < 1e-5); break;
case ncclFloat64: isMatch &= (fabs(outputF8[j] - expectedF8[j]) < 1e-12); break;
case ncclBfloat16: isMatch &= (fabs((float)outputB2[j] - (float)expectedB2[j]) < 5e-2); break;
case ncclBfloat16: isMatch &= (fabs((float)outputB2[j] - (float)expectedB2[j]) < 9e-2); break;
default:
fprintf(stderr, "[ERROR] Unsupported datatype\n");
exit(0);
+3 -1
Melihat File
@@ -9,6 +9,8 @@ namespace CorrectnessTests
{
TEST_P(AllGatherCorrectnessTest, Correctness)
{
// Adjust numElements to be multiple of numDevices
numElements = (numElements/numDevices)*numDevices;
if (numDevices > numDevicesAvailable) return;
if (numElements % numDevices != 0) return;
@@ -107,7 +109,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(2520, 3026520),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("")),
+2 -2
Melihat File
@@ -46,7 +46,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
@@ -71,7 +71,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
+2 -2
Melihat File
@@ -58,7 +58,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
@@ -74,7 +74,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
+1 -1
Melihat File
@@ -59,7 +59,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false),
testing::Values("")),
+1 -1
Melihat File
@@ -67,7 +67,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(2520, 3026520),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false),
testing::Values("")),
+1 -1
Melihat File
@@ -63,7 +63,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("")),
+3 -1
Melihat File
@@ -27,6 +27,8 @@ namespace CorrectnessTests
ncclFuncs.push_back(ncclCollReduce);
ncclFuncs.push_back(ncclCollReduceScatter);
// Adjust numElements to be multiple of numDevices
numElements = (numElements/numDevices)*numDevices;
for (int i = 0; i < datasets.size(); i++)
{
datasets[i].Initialize(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
@@ -119,7 +121,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(2520, 3026520),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1", "RCCL_P2P_NET_DISABLE=0", "RCCL_P2P_NET_DISABLE=1")),
+1 -1
Melihat File
@@ -63,7 +63,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false),
testing::Values("")),
+3 -1
Melihat File
@@ -26,6 +26,8 @@ namespace CorrectnessTests
ncclFuncs.push_back(ncclCollReduce);
ncclFuncs.push_back(ncclCollReduceScatter);
// Adjust numElements to be multiple of numDevices
numElements = (numElements/numDevices)*numDevices;
for (int i = 0; i < datasets.size(); i++)
{
datasets[i].Initialize(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
@@ -120,7 +122,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(2520, 3026520),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
+1 -1
Melihat File
@@ -63,7 +63,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("")),
+3 -1
Melihat File
@@ -10,6 +10,8 @@ namespace CorrectnessTests
{
TEST_P(ReduceScatterCorrectnessTest, Correctness)
{
// Adjust numElements to be multiple of numDevices
numElements = (numElements/numDevices)*numDevices;
if (numDevices > numDevicesAvailable) return;
if (numElements % numDevices != 0) return;
@@ -61,7 +63,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(2520, 3026520),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("")),
+1 -1
Melihat File
@@ -63,7 +63,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,5,6,7,8),
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false),
testing::Values("")),
+52 -39
Melihat File
@@ -77,7 +77,8 @@ int main(int argc, char **argv)
maxN = std::max(maxN, N);
// Execute only peer to peer benchmark mode, similar to rocm-bandwidth-test
if (!strcmp(argv[1], "p2p"))
if (!strcmp(argv[1], "p2p") || !strcmp(argv[1], "p2p_rr") ||
!strcmp(argv[1], "g2g") || !strcmp(argv[1], "g2g_rr"))
{
int numBlocksToUse = 0;
if (argc > 3)
@@ -85,8 +86,13 @@ int main(int argc, char **argv)
else
HIP_CALL(hipDeviceGetAttribute(&numBlocksToUse, hipDeviceAttributeMultiprocessorCount, 0));
// Perform either local read (+remote write) [EXE = SRC] or
// remote read (+local write) [EXE = DST]
int readMode = (!strcmp(argv[1], "p2p_rr") || !strcmp(argv[1], "g2g_rr") ? 1 : 0);
int skipCpu = (!strcmp(argv[1], "g2g") || !strcmp(argv[1], "g2g_rr") ? 1 : 0);
// Execute peer to peer benchmark mode
RunPeerToPeerBenchmarks(ev, numBytesPerLink / sizeof(float), numBlocksToUse);
RunPeerToPeerBenchmarks(ev, numBytesPerLink / sizeof(float), numBlocksToUse, readMode, skipCpu);
exit(0);
}
@@ -394,15 +400,20 @@ void DisplayUsage(char const* cmdName)
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
int const numCpuDevices = numa_num_configured_nodes();
printf("Usage: %s configFile <N>\n", cmdName);
printf(" configFile: File containing Links to execute (see below for format)\n");
printf(" Specifying \"p2p\" as the configFile will execute a peer to peer benchmark (3rd argument used as # CUs to use)\n");
printf(" N : (Optional) Number of bytes to transfer per link.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n", DEFAULT_BYTES_PER_LINK);
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
printf(" If a negative number is specified, a configFile gets generated with this number as default number of CUs per link\n");
printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
printf("Usage: %s config <N>\n", cmdName);
printf(" config: Either:\n");
printf(" - Filename of configFile containing Links to execute (see below for format)\n");
printf(" - Name of preset benchmark:\n");
printf(" p2p - All CPU/GPU pairs benchmark\n");
printf(" p2p_rr - All CPU/GPU pairs benchmark with remote reads\n");
printf(" g2g - All GPU/GPU pairs benchmark\n");
printf(" g2g_rr - All GPU/GPU pairs benchmark with remote reads\n");
printf(" - 3rd optional argument will be used as # of CUs to use (uses all by default)\n");
printf(" N : (Optional) Number of bytes to transfer per link.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n", DEFAULT_BYTES_PER_LINK);
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
printf(" If a negative number is specified, a configFile gets generated with this number as default number of CUs per link\n");
printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
printf("\n");
printf("Configfile Format:\n");
printf("==================\n");
@@ -1104,7 +1115,7 @@ void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link)
}
}
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse)
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu)
{
// Collect the number of available CPUs/GPUs on this machine
int numGpus;
@@ -1122,42 +1133,44 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse)
printf("Using %d CUs per transfer\n", numBlocksToUse);
// Perform unidirectional / bidirectional
for (int readMode = 0; readMode < 2; readMode++)
for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++)
{
for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++)
// Print header
printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write]\n", isBidirectional ? "Bi" : "Uni",
readMode == 0 ? "Local" : "Remote",
readMode == 0 ? "Remote" : "Local");
printf("%10s", "D/D");
if (!skipCpu)
{
// Print header
printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write]\n", isBidirectional ? "Bi" : "Uni",
readMode == 0 ? "Local" : "Remote",
readMode == 0 ? "Remote" : "Local");
printf("%10s", "D/D");
for (int i = 0; i < numCpus; i++)
printf("%7s %02d", "CPU", i);
for (int i = 0; i < numGpus; i++)
printf("%7s %02d", "GPU", i);
printf("\n");
}
for (int i = 0; i < numGpus; i++)
printf("%7s %02d", "GPU", i);
printf("\n");
// Loop over all possible src/dst pairs
for (int src = 0; src < numDevices; src++)
// Loop over all possible src/dst pairs
for (int src = 0; src < numDevices; src++)
{
MemType const& srcMemType = (src < numCpus ? MEM_CPU : MEM_GPU);
if (skipCpu && srcMemType == MEM_CPU) continue;
int srcIndex = (srcMemType == MEM_CPU ? src : src - numCpus);
printf("%7s %02d", (srcMemType == MEM_CPU) ? "CPU" : "GPU", srcIndex);
for (int dst = 0; dst < numDevices; dst++)
{
MemType const& srcMemType = (src < numCpus ? MEM_CPU : MEM_GPU);
int srcIndex = (srcMemType == MEM_CPU ? src : src - numCpus);
printf("%7s %02d", (srcMemType == MEM_CPU) ? "CPU" : "GPU", srcIndex);
for (int dst = 0; dst < numDevices; dst++)
{
MemType const& dstMemType = (dst < numCpus ? MEM_CPU : MEM_GPU);
int dstIndex = (dstMemType == MEM_CPU ? dst : dst - numCpus);
double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, srcMemType, srcIndex, dstMemType, dstIndex, readMode);
if (bandwidth == 0)
printf("%10s", "N/A");
else
printf("%10.2f", bandwidth);
fflush(stdout);
}
printf("\n");
MemType const& dstMemType = (dst < numCpus ? MEM_CPU : MEM_GPU);
if (skipCpu && dstMemType == MEM_CPU) continue;
int dstIndex = (dstMemType == MEM_CPU ? dst : dst - numCpus);
double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, srcMemType, srcIndex, dstMemType, dstIndex, readMode);
if (bandwidth == 0)
printf("%10s", "N/A");
else
printf("%10.2f", bandwidth);
fflush(stdout);
}
printf("\n");
}
printf("\n");
}
}
+1 -1
Melihat File
@@ -122,7 +122,7 @@ void DeallocateMemory(MemType memType, int devIndex, float* memPtr);
void CheckPages(char* byteArray, size_t numBytes, int targetId);
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link);
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse);
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu);
double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional,
MemType srcMemType, int srcIndex,
MemType dstMemType, int dstIndex,