Add peer2peer bandwidth and latency test
Change-Id: I6d88e4aa9f6e64096af16579eebef4740734203e
[ROCm/hip-tests commit: 5c530e7c32]
This commit is contained in:
کامیت شده توسط
Maneesh Gupta
والد
a51e393772
کامیت
ef65747f38
@@ -16,13 +16,15 @@ int p_iterations = 10;
|
||||
int p_beatsperiteration=1;
|
||||
int p_device = 0;
|
||||
int p_detailed = 0;
|
||||
bool p_async = 0;
|
||||
bool p_async = 0;
|
||||
int p_alignedhost = 0; // align host allocs to this granularity, in bytes. 64 or 4096 are good values to try.
|
||||
int p_onesize = 0;
|
||||
int p_onesize = 0;
|
||||
|
||||
bool p_h2d = true;
|
||||
bool p_d2h = true;
|
||||
bool p_bidir = true;
|
||||
bool p_p2p = false;
|
||||
|
||||
|
||||
//#define NO_CHECK
|
||||
|
||||
@@ -70,7 +72,7 @@ std::string sizeToString(int size)
|
||||
|
||||
|
||||
// ****************************************************************************
|
||||
hipError_t memcopy(void * dst, const void *src, size_t sizeBytes, enum hipMemcpyKind kind)
|
||||
hipError_t memcopy(void * dst, const void *src, size_t sizeBytes, enum hipMemcpyKind kind )
|
||||
{
|
||||
if (p_async) {
|
||||
return hipMemcpyAsync(dst, src, sizeBytes, kind, NULL);
|
||||
@@ -632,6 +634,9 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#define failed(...) \
|
||||
printf ("error: ");\
|
||||
printf (__VA_ARGS__);\
|
||||
@@ -646,6 +651,326 @@ int parseInt(const char *str, int *output)
|
||||
}
|
||||
|
||||
|
||||
void checkPeer2PeerSupport()
|
||||
{
|
||||
int deviceCnt;
|
||||
hipGetDeviceCount(&deviceCnt);
|
||||
std::cout << "Total no. of available gpu #" << deviceCnt << "\n" << std::endl;
|
||||
|
||||
for(int deviceId=0; deviceId<deviceCnt; deviceId++)
|
||||
{
|
||||
hipDeviceProp_t props;
|
||||
hipGetDeviceProperties(&props, deviceId);
|
||||
std::cout << "for gpu#" << deviceId << " " << props.name << std::endl;
|
||||
std::cout << " peer2peer supported : ";
|
||||
int PeerCnt=0;
|
||||
for (int i=0; i<deviceCnt; i++) {
|
||||
int isPeer;
|
||||
hipDeviceCanAccessPeer(&isPeer, i, deviceId);
|
||||
if (isPeer) {
|
||||
std::cout << "gpu#" << i << " ";
|
||||
++PeerCnt;
|
||||
}
|
||||
}
|
||||
if (PeerCnt==0)
|
||||
std::cout << "NONE" << " ";
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << " peer2peer not supported : ";
|
||||
int nonPeerCnt=0;
|
||||
for (int i=0; i<deviceCnt; i++) {
|
||||
int isPeer;
|
||||
hipDeviceCanAccessPeer(&isPeer, i, deviceId);
|
||||
if (!isPeer && (i!=deviceId)) {
|
||||
std::cout << "gpu#" << i << " ";
|
||||
++nonPeerCnt;
|
||||
}
|
||||
}
|
||||
if (nonPeerCnt==0)
|
||||
std::cout << "NONE" << " ";
|
||||
|
||||
std::cout <<"\n"<<std::endl;
|
||||
}
|
||||
|
||||
std::cout << "\nNote: For non-supported peer2peer devices, memcopy will use/follow the normal behaviour (GPU1-->host then host-->GPU2)\n\n" << std::endl;
|
||||
}
|
||||
|
||||
void enablePeer2Peer(int currentGpu, int peerGpu)
|
||||
{
|
||||
int canAccessPeer;
|
||||
|
||||
hipSetDevice(currentGpu);
|
||||
hipDeviceCanAccessPeer(&canAccessPeer, currentGpu, peerGpu);
|
||||
|
||||
if(canAccessPeer==1){
|
||||
hipDeviceEnablePeerAccess(peerGpu, 0);
|
||||
}
|
||||
}
|
||||
|
||||
void disablePeer2Peer(int currentGpu, int peerGpu)
|
||||
{
|
||||
int canAccessPeer;
|
||||
|
||||
hipSetDevice(currentGpu);
|
||||
hipDeviceCanAccessPeer(&canAccessPeer, currentGpu, peerGpu);
|
||||
|
||||
if(canAccessPeer==1){
|
||||
hipDeviceDisablePeerAccess(peerGpu);
|
||||
}
|
||||
}
|
||||
|
||||
std::string gpuIDToString(int gpuID)
|
||||
{
|
||||
using namespace std;
|
||||
stringstream ss;
|
||||
ss << gpuID;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
void RunBenchmark_P2P_Unidir(ResultDatabase &resultDB)
|
||||
{
|
||||
int gpuCount;
|
||||
hipGetDeviceCount(&gpuCount);
|
||||
|
||||
int currentGpu, peerGpu;
|
||||
|
||||
long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
|
||||
|
||||
for (currentGpu=0; currentGpu<gpuCount; currentGpu++) {
|
||||
|
||||
for (peerGpu=0; peerGpu<gpuCount; peerGpu++){
|
||||
|
||||
if (currentGpu == peerGpu)
|
||||
continue;
|
||||
|
||||
float *currentGpuMem, *peerGpuMem;
|
||||
|
||||
hipSetDevice(currentGpu);
|
||||
hipMalloc((void**)¤tGpuMem, sizeof(float) * numMaxFloats);
|
||||
|
||||
hipSetDevice(peerGpu);
|
||||
hipMalloc((void**)&peerGpuMem, sizeof(float) * numMaxFloats);
|
||||
|
||||
enablePeer2Peer(currentGpu, peerGpu);
|
||||
|
||||
hipEvent_t start, stop;
|
||||
hipEventCreate(&start);
|
||||
hipEventCreate(&stop);
|
||||
CHECK_HIP_ERROR();
|
||||
|
||||
// Three passes, forward and backward both
|
||||
for (int pass = 0; pass < p_iterations; pass++)
|
||||
{
|
||||
// store the times temporarily to estimate latency
|
||||
//float times[nSizes];
|
||||
// Step through sizes forward on even passes and backward on odd
|
||||
for (int i = 0; i < nSizes; i++)
|
||||
{
|
||||
int sizeIndex;
|
||||
if ((pass % 2) == 0)
|
||||
sizeIndex = i;
|
||||
else
|
||||
sizeIndex = (nSizes - 1) - i;
|
||||
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
|
||||
hipDeviceSynchronize();
|
||||
|
||||
hipEventRecord(start, 0);
|
||||
|
||||
for (int j=0;j<p_beatsperiteration;j++) {
|
||||
hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
|
||||
}
|
||||
|
||||
hipEventRecord(stop, 0);
|
||||
|
||||
hipEventSynchronize(stop);
|
||||
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
//times[sizeIndex] = t;
|
||||
|
||||
// Convert to GB/sec
|
||||
if (p_verbose)
|
||||
{
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed = (double(sizeToBytes(thisSize) * p_beatsperiteration) / (1000*1000)) / t;
|
||||
char sizeStr[256];
|
||||
if (p_beatsperiteration>1) {
|
||||
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
|
||||
string cGpu, pGpu;
|
||||
cGpu = gpuIDToString(currentGpu);
|
||||
pGpu = gpuIDToString(peerGpu);
|
||||
|
||||
resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "ms", t);
|
||||
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (p_onesize) {
|
||||
numMaxFloats = sizeToBytes(p_onesize) / sizeof(float);
|
||||
}
|
||||
|
||||
disablePeer2Peer(currentGpu, peerGpu);
|
||||
|
||||
hipEventDestroy(start);
|
||||
hipEventDestroy(stop);
|
||||
|
||||
// Cleanup
|
||||
hipFree((void*)currentGpuMem);
|
||||
hipFree((void*)peerGpuMem);
|
||||
CHECK_HIP_ERROR();
|
||||
|
||||
hipSetDevice(peerGpu);
|
||||
hipDeviceReset();
|
||||
|
||||
hipSetDevice(currentGpu);
|
||||
hipDeviceReset();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void RunBenchmark_P2P_Bidir(ResultDatabase &resultDB) {
|
||||
|
||||
int gpuCount;
|
||||
hipGetDeviceCount(&gpuCount);
|
||||
|
||||
hipStream_t stream[2];
|
||||
|
||||
int currentGpu, peerGpu;
|
||||
|
||||
long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
|
||||
|
||||
for (currentGpu=0; currentGpu<gpuCount; currentGpu++) {
|
||||
|
||||
for (peerGpu=0; peerGpu<gpuCount; peerGpu++){
|
||||
|
||||
if (currentGpu == peerGpu)
|
||||
continue;
|
||||
|
||||
float *currentGpuMem[2], *peerGpuMem[2];
|
||||
|
||||
hipSetDevice(currentGpu);
|
||||
hipMalloc((void**)¤tGpuMem[0], sizeof(float) * numMaxFloats);
|
||||
hipMalloc((void**)¤tGpuMem[1], sizeof(float) * numMaxFloats);
|
||||
|
||||
hipSetDevice(peerGpu);
|
||||
hipMalloc((void**)&peerGpuMem[0], sizeof(float) * numMaxFloats);
|
||||
hipMalloc((void**)&peerGpuMem[1], sizeof(float) * numMaxFloats);
|
||||
|
||||
enablePeer2Peer(currentGpu, peerGpu);
|
||||
|
||||
hipEvent_t start, stop;
|
||||
hipEventCreate(&start);
|
||||
hipEventCreate(&stop);
|
||||
CHECK_HIP_ERROR();
|
||||
|
||||
hipStreamCreate(&stream[0]);
|
||||
hipStreamCreate(&stream[1]);
|
||||
|
||||
// Three passes, forward and backward both
|
||||
for (int pass = 0; pass < p_iterations; pass++)
|
||||
{
|
||||
// store the times temporarily to estimate latency
|
||||
//float times[nSizes];
|
||||
// Step through sizes forward on even passes and backward on odd
|
||||
for (int i = 0; i < nSizes; i++)
|
||||
{
|
||||
int sizeIndex;
|
||||
if ((pass % 2) == 0)
|
||||
sizeIndex = i;
|
||||
else
|
||||
sizeIndex = (nSizes - 1) - i;
|
||||
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
|
||||
hipDeviceSynchronize();
|
||||
|
||||
hipEventRecord(start, 0);
|
||||
|
||||
for (int j=0;j<p_beatsperiteration;j++) {
|
||||
hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes, hipMemcpyDeviceToDevice, stream[0]);
|
||||
hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes, hipMemcpyDeviceToDevice, stream[1]);
|
||||
}
|
||||
|
||||
hipEventRecord(stop, 0);
|
||||
|
||||
hipEventSynchronize(stop);
|
||||
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
//times[sizeIndex] = t;
|
||||
|
||||
// Convert to GB/sec
|
||||
if (p_verbose)
|
||||
{
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed = (double(sizeToBytes(thisSize) * p_beatsperiteration) / (1000*1000)) / t;
|
||||
char sizeStr[256];
|
||||
if (p_beatsperiteration>1) {
|
||||
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
|
||||
string cGpu, pGpu;
|
||||
cGpu = gpuIDToString(currentGpu);
|
||||
pGpu = gpuIDToString(peerGpu);
|
||||
|
||||
resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "ms", t);
|
||||
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (p_onesize) {
|
||||
numMaxFloats = sizeToBytes(p_onesize) / sizeof(float);
|
||||
}
|
||||
|
||||
disablePeer2Peer(currentGpu, peerGpu);
|
||||
|
||||
hipEventDestroy(start);
|
||||
hipEventDestroy(stop);
|
||||
|
||||
for (int i=0; i<2; i++) {
|
||||
hipStreamDestroy(stream[i]);
|
||||
|
||||
hipFree((void*)currentGpuMem[i]);
|
||||
hipFree((void*)peerGpuMem[i]);
|
||||
CHECK_HIP_ERROR();
|
||||
}
|
||||
|
||||
hipSetDevice(peerGpu);
|
||||
hipDeviceReset();
|
||||
|
||||
hipSetDevice(currentGpu);
|
||||
hipDeviceReset();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void printConfig() {
|
||||
hipDeviceProp_t props;
|
||||
hipGetDeviceProperties(&props, p_device);
|
||||
@@ -662,9 +987,9 @@ void help() {
|
||||
printf (" --d2h : Run only device-to-host test.\n");
|
||||
printf (" --h2d : Run only host-to-device test.\n");
|
||||
printf (" --bidir : Run only bidir copy test.\n");
|
||||
printf (" --p2p : Run only peer2peer unidir and bidir copy tests.\n");
|
||||
printf (" --verbose : Print verbose status messages as test is run.\n");
|
||||
printf (" --detailed : Print detailed report (including all trials).\n");
|
||||
|
||||
printf (" --async : Use hipMemcpyAsync(with NULL stream) for H2D/D2H. Default uses hipMemcpy.\n");
|
||||
printf (" --onesize, -o : Only run one measurement, at specified size (in KB, or if negative in bytes)\n");
|
||||
|
||||
@@ -712,6 +1037,12 @@ int parseStandardArguments(int argc, char *argv[])
|
||||
p_d2h = false;
|
||||
p_bidir = true;
|
||||
|
||||
} else if (!strcmp(arg, "--p2p")) {
|
||||
p_h2d = false;
|
||||
p_d2h = false;
|
||||
p_bidir = false;
|
||||
p_p2p = true;
|
||||
|
||||
} else if (!strcmp(arg, "--help") || (!strcmp(arg, "-h"))) {
|
||||
help();
|
||||
exit(EXIT_SUCCESS);
|
||||
@@ -737,39 +1068,57 @@ int main(int argc, char *argv[])
|
||||
{
|
||||
parseStandardArguments(argc, argv);
|
||||
|
||||
printConfig();
|
||||
if (p_p2p) {
|
||||
checkPeer2PeerSupport();
|
||||
|
||||
if (p_h2d) {
|
||||
ResultDatabase resultDB;
|
||||
RunBenchmark_H2D(resultDB);
|
||||
ResultDatabase resultDB_Unidir, resultDB_Bidir;
|
||||
|
||||
resultDB.DumpSummary(std::cout);
|
||||
RunBenchmark_P2P_Unidir(resultDB_Unidir);
|
||||
RunBenchmark_P2P_Bidir(resultDB_Bidir);
|
||||
|
||||
resultDB_Unidir.DumpSummary(std::cout);
|
||||
resultDB_Bidir.DumpSummary(std::cout);
|
||||
|
||||
if (p_detailed) {
|
||||
resultDB.DumpDetailed(std::cout);
|
||||
resultDB_Unidir.DumpDetailed(std::cout);
|
||||
resultDB_Bidir.DumpDetailed(std::cout);
|
||||
}
|
||||
}
|
||||
else {
|
||||
printConfig();
|
||||
|
||||
if (p_d2h) {
|
||||
ResultDatabase resultDB;
|
||||
RunBenchmark_D2H(resultDB);
|
||||
if (p_h2d) {
|
||||
ResultDatabase resultDB;
|
||||
RunBenchmark_H2D(resultDB);
|
||||
|
||||
resultDB.DumpSummary(std::cout);
|
||||
resultDB.DumpSummary(std::cout);
|
||||
|
||||
if (p_detailed) {
|
||||
resultDB.DumpDetailed(std::cout);
|
||||
if (p_detailed) {
|
||||
resultDB.DumpDetailed(std::cout);
|
||||
}
|
||||
}
|
||||
|
||||
if (p_d2h) {
|
||||
ResultDatabase resultDB;
|
||||
RunBenchmark_D2H(resultDB);
|
||||
|
||||
resultDB.DumpSummary(std::cout);
|
||||
|
||||
if (p_detailed) {
|
||||
resultDB.DumpDetailed(std::cout);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (p_bidir) {
|
||||
ResultDatabase resultDB;
|
||||
RunBenchmark_Bidir(resultDB);
|
||||
if (p_bidir) {
|
||||
ResultDatabase resultDB;
|
||||
RunBenchmark_Bidir(resultDB);
|
||||
|
||||
resultDB.DumpSummary(std::cout);
|
||||
resultDB.DumpSummary(std::cout);
|
||||
|
||||
if (p_detailed) {
|
||||
resultDB.DumpDetailed(std::cout);
|
||||
if (p_detailed) {
|
||||
resultDB.DumpDetailed(std::cout);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
مرجع در شماره جدید
Block a user