Add peer2peer bandwidth and latency test

Change-Id: I6d88e4aa9f6e64096af16579eebef4740734203e


[ROCm/hip-tests commit: 5c530e7c32]
This commit is contained in:
Sandeep Kumar
2017-06-12 17:14:12 +05:30
کامیت شده توسط Maneesh Gupta
والد a51e393772
کامیت ef65747f38
@@ -16,13 +16,15 @@ int p_iterations = 10;
int p_beatsperiteration=1;
int p_device = 0;
int p_detailed = 0;
bool p_async = 0;
bool p_async = 0;
int p_alignedhost = 0; // align host allocs to this granularity, in bytes. 64 or 4096 are good values to try.
int p_onesize = 0;
int p_onesize = 0;
bool p_h2d = true;
bool p_d2h = true;
bool p_bidir = true;
bool p_p2p = false;
//#define NO_CHECK
@@ -70,7 +72,7 @@ std::string sizeToString(int size)
// ****************************************************************************
hipError_t memcopy(void * dst, const void *src, size_t sizeBytes, enum hipMemcpyKind kind)
hipError_t memcopy(void * dst, const void *src, size_t sizeBytes, enum hipMemcpyKind kind )
{
if (p_async) {
return hipMemcpyAsync(dst, src, sizeBytes, kind, NULL);
@@ -632,6 +634,9 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
}
#define failed(...) \
printf ("error: ");\
printf (__VA_ARGS__);\
@@ -646,6 +651,326 @@ int parseInt(const char *str, int *output)
}
void checkPeer2PeerSupport()
{
int deviceCnt;
hipGetDeviceCount(&deviceCnt);
std::cout << "Total no. of available gpu #" << deviceCnt << "\n" << std::endl;
for(int deviceId=0; deviceId<deviceCnt; deviceId++)
{
hipDeviceProp_t props;
hipGetDeviceProperties(&props, deviceId);
std::cout << "for gpu#" << deviceId << " " << props.name << std::endl;
std::cout << " peer2peer supported : ";
int PeerCnt=0;
for (int i=0; i<deviceCnt; i++) {
int isPeer;
hipDeviceCanAccessPeer(&isPeer, i, deviceId);
if (isPeer) {
std::cout << "gpu#" << i << " ";
++PeerCnt;
}
}
if (PeerCnt==0)
std::cout << "NONE" << " ";
std::cout << std::endl;
std::cout << " peer2peer not supported : ";
int nonPeerCnt=0;
for (int i=0; i<deviceCnt; i++) {
int isPeer;
hipDeviceCanAccessPeer(&isPeer, i, deviceId);
if (!isPeer && (i!=deviceId)) {
std::cout << "gpu#" << i << " ";
++nonPeerCnt;
}
}
if (nonPeerCnt==0)
std::cout << "NONE" << " ";
std::cout <<"\n"<<std::endl;
}
std::cout << "\nNote: For non-supported peer2peer devices, memcopy will use/follow the normal behaviour (GPU1-->host then host-->GPU2)\n\n" << std::endl;
}
void enablePeer2Peer(int currentGpu, int peerGpu)
{
int canAccessPeer;
hipSetDevice(currentGpu);
hipDeviceCanAccessPeer(&canAccessPeer, currentGpu, peerGpu);
if(canAccessPeer==1){
hipDeviceEnablePeerAccess(peerGpu, 0);
}
}
void disablePeer2Peer(int currentGpu, int peerGpu)
{
int canAccessPeer;
hipSetDevice(currentGpu);
hipDeviceCanAccessPeer(&canAccessPeer, currentGpu, peerGpu);
if(canAccessPeer==1){
hipDeviceDisablePeerAccess(peerGpu);
}
}
std::string gpuIDToString(int gpuID)
{
using namespace std;
stringstream ss;
ss << gpuID;
return ss.str();
}
void RunBenchmark_P2P_Unidir(ResultDatabase &resultDB)
{
int gpuCount;
hipGetDeviceCount(&gpuCount);
int currentGpu, peerGpu;
long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
for (currentGpu=0; currentGpu<gpuCount; currentGpu++) {
for (peerGpu=0; peerGpu<gpuCount; peerGpu++){
if (currentGpu == peerGpu)
continue;
float *currentGpuMem, *peerGpuMem;
hipSetDevice(currentGpu);
hipMalloc((void**)&currentGpuMem, sizeof(float) * numMaxFloats);
hipSetDevice(peerGpu);
hipMalloc((void**)&peerGpuMem, sizeof(float) * numMaxFloats);
enablePeer2Peer(currentGpu, peerGpu);
hipEvent_t start, stop;
hipEventCreate(&start);
hipEventCreate(&stop);
CHECK_HIP_ERROR();
// Three passes, forward and backward both
for (int pass = 0; pass < p_iterations; pass++)
{
// store the times temporarily to estimate latency
//float times[nSizes];
// Step through sizes forward on even passes and backward on odd
for (int i = 0; i < nSizes; i++)
{
int sizeIndex;
if ((pass % 2) == 0)
sizeIndex = i;
else
sizeIndex = (nSizes - 1) - i;
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
hipDeviceSynchronize();
hipEventRecord(start, 0);
for (int j=0;j<p_beatsperiteration;j++) {
hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
}
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
//times[sizeIndex] = t;
// Convert to GB/sec
if (p_verbose)
{
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed = (double(sizeToBytes(thisSize) * p_beatsperiteration) / (1000*1000)) / t;
char sizeStr[256];
if (p_beatsperiteration>1) {
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
string cGpu, pGpu;
cGpu = gpuIDToString(currentGpu);
pGpu = gpuIDToString(peerGpu);
resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "ms", t);
if (p_onesize) {
break;
}
}
}
if (p_onesize) {
numMaxFloats = sizeToBytes(p_onesize) / sizeof(float);
}
disablePeer2Peer(currentGpu, peerGpu);
hipEventDestroy(start);
hipEventDestroy(stop);
// Cleanup
hipFree((void*)currentGpuMem);
hipFree((void*)peerGpuMem);
CHECK_HIP_ERROR();
hipSetDevice(peerGpu);
hipDeviceReset();
hipSetDevice(currentGpu);
hipDeviceReset();
}
}
}
void RunBenchmark_P2P_Bidir(ResultDatabase &resultDB) {
int gpuCount;
hipGetDeviceCount(&gpuCount);
hipStream_t stream[2];
int currentGpu, peerGpu;
long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
for (currentGpu=0; currentGpu<gpuCount; currentGpu++) {
for (peerGpu=0; peerGpu<gpuCount; peerGpu++){
if (currentGpu == peerGpu)
continue;
float *currentGpuMem[2], *peerGpuMem[2];
hipSetDevice(currentGpu);
hipMalloc((void**)&currentGpuMem[0], sizeof(float) * numMaxFloats);
hipMalloc((void**)&currentGpuMem[1], sizeof(float) * numMaxFloats);
hipSetDevice(peerGpu);
hipMalloc((void**)&peerGpuMem[0], sizeof(float) * numMaxFloats);
hipMalloc((void**)&peerGpuMem[1], sizeof(float) * numMaxFloats);
enablePeer2Peer(currentGpu, peerGpu);
hipEvent_t start, stop;
hipEventCreate(&start);
hipEventCreate(&stop);
CHECK_HIP_ERROR();
hipStreamCreate(&stream[0]);
hipStreamCreate(&stream[1]);
// Three passes, forward and backward both
for (int pass = 0; pass < p_iterations; pass++)
{
// store the times temporarily to estimate latency
//float times[nSizes];
// Step through sizes forward on even passes and backward on odd
for (int i = 0; i < nSizes; i++)
{
int sizeIndex;
if ((pass % 2) == 0)
sizeIndex = i;
else
sizeIndex = (nSizes - 1) - i;
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
hipDeviceSynchronize();
hipEventRecord(start, 0);
for (int j=0;j<p_beatsperiteration;j++) {
hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes, hipMemcpyDeviceToDevice, stream[0]);
hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes, hipMemcpyDeviceToDevice, stream[1]);
}
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
//times[sizeIndex] = t;
// Convert to GB/sec
if (p_verbose)
{
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed = (double(sizeToBytes(thisSize) * p_beatsperiteration) / (1000*1000)) / t;
char sizeStr[256];
if (p_beatsperiteration>1) {
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
string cGpu, pGpu;
cGpu = gpuIDToString(currentGpu);
pGpu = gpuIDToString(peerGpu);
resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "ms", t);
if (p_onesize) {
break;
}
}
}
if (p_onesize) {
numMaxFloats = sizeToBytes(p_onesize) / sizeof(float);
}
disablePeer2Peer(currentGpu, peerGpu);
hipEventDestroy(start);
hipEventDestroy(stop);
for (int i=0; i<2; i++) {
hipStreamDestroy(stream[i]);
hipFree((void*)currentGpuMem[i]);
hipFree((void*)peerGpuMem[i]);
CHECK_HIP_ERROR();
}
hipSetDevice(peerGpu);
hipDeviceReset();
hipSetDevice(currentGpu);
hipDeviceReset();
}
}
}
void printConfig() {
hipDeviceProp_t props;
hipGetDeviceProperties(&props, p_device);
@@ -662,9 +987,9 @@ void help() {
printf (" --d2h : Run only device-to-host test.\n");
printf (" --h2d : Run only host-to-device test.\n");
printf (" --bidir : Run only bidir copy test.\n");
printf (" --p2p : Run only peer2peer unidir and bidir copy tests.\n");
printf (" --verbose : Print verbose status messages as test is run.\n");
printf (" --detailed : Print detailed report (including all trials).\n");
printf (" --async : Use hipMemcpyAsync(with NULL stream) for H2D/D2H. Default uses hipMemcpy.\n");
printf (" --onesize, -o : Only run one measurement, at specified size (in KB, or if negative in bytes)\n");
@@ -712,6 +1037,12 @@ int parseStandardArguments(int argc, char *argv[])
p_d2h = false;
p_bidir = true;
} else if (!strcmp(arg, "--p2p")) {
p_h2d = false;
p_d2h = false;
p_bidir = false;
p_p2p = true;
} else if (!strcmp(arg, "--help") || (!strcmp(arg, "-h"))) {
help();
exit(EXIT_SUCCESS);
@@ -737,39 +1068,57 @@ int main(int argc, char *argv[])
{
parseStandardArguments(argc, argv);
printConfig();
if (p_p2p) {
checkPeer2PeerSupport();
if (p_h2d) {
ResultDatabase resultDB;
RunBenchmark_H2D(resultDB);
ResultDatabase resultDB_Unidir, resultDB_Bidir;
resultDB.DumpSummary(std::cout);
RunBenchmark_P2P_Unidir(resultDB_Unidir);
RunBenchmark_P2P_Bidir(resultDB_Bidir);
resultDB_Unidir.DumpSummary(std::cout);
resultDB_Bidir.DumpSummary(std::cout);
if (p_detailed) {
resultDB.DumpDetailed(std::cout);
resultDB_Unidir.DumpDetailed(std::cout);
resultDB_Bidir.DumpDetailed(std::cout);
}
}
else {
printConfig();
if (p_d2h) {
ResultDatabase resultDB;
RunBenchmark_D2H(resultDB);
if (p_h2d) {
ResultDatabase resultDB;
RunBenchmark_H2D(resultDB);
resultDB.DumpSummary(std::cout);
resultDB.DumpSummary(std::cout);
if (p_detailed) {
resultDB.DumpDetailed(std::cout);
if (p_detailed) {
resultDB.DumpDetailed(std::cout);
}
}
if (p_d2h) {
ResultDatabase resultDB;
RunBenchmark_D2H(resultDB);
resultDB.DumpSummary(std::cout);
if (p_detailed) {
resultDB.DumpDetailed(std::cout);
}
}
}
if (p_bidir) {
ResultDatabase resultDB;
RunBenchmark_Bidir(resultDB);
if (p_bidir) {
ResultDatabase resultDB;
RunBenchmark_Bidir(resultDB);
resultDB.DumpSummary(std::cout);
resultDB.DumpSummary(std::cout);
if (p_detailed) {
resultDB.DumpDetailed(std::cout);
if (p_detailed) {
resultDB.DumpDetailed(std::cout);
}
}
}
}