Add peer2peer bandwidth and latency test

Change-Id: I6d88e4aa9f6e64096af16579eebef4740734203e [ROCm/hip-tests commit: 5c530e7c32]
2017-06-12 17:14:12 +05:30
@@ -16,13 +16,15 @@ int           p_iterations   = 10;
 int           p_beatsperiteration=1;
 int           p_device  = 0;
 int           p_detailed  = 0;
-bool          p_async = 0; 
+bool          p_async = 0;
 int           p_alignedhost = 0;  // align host allocs to this granularity, in bytes. 64 or 4096 are good values to try.
-int           p_onesize = 0;  
+int           p_onesize = 0;

 bool          p_h2d   = true;
 bool          p_d2h   = true;
 bool          p_bidir = true;
+bool          p_p2p   = false;
+

 //#define NO_CHECK

@@ -70,7 +72,7 @@ std::string sizeToString(int size)


 // ****************************************************************************
-hipError_t memcopy(void * dst, const void *src, size_t sizeBytes, enum hipMemcpyKind kind)
+hipError_t memcopy(void * dst, const void *src, size_t sizeBytes, enum hipMemcpyKind kind )
 {
    if (p_async) {
        return hipMemcpyAsync(dst, src, sizeBytes, kind, NULL);
@@ -632,6 +634,9 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB)
 }


+
+
+
 #define failed(...) \
    printf ("error: ");\
    printf (__VA_ARGS__);\
@@ -646,6 +651,326 @@ int parseInt(const char *str, int *output)
 }


+void checkPeer2PeerSupport()
+{
+    int deviceCnt;
+    hipGetDeviceCount(&deviceCnt);
+    std::cout << "Total no. of  available gpu #" << deviceCnt << "\n" << std::endl;
+
+    for(int deviceId=0; deviceId<deviceCnt; deviceId++)
+    {
+        hipDeviceProp_t props;
+        hipGetDeviceProperties(&props, deviceId);
+        std::cout << "for gpu#" << deviceId << " " << props.name << std::endl;
+        std::cout << "    peer2peer supported : ";
+        int PeerCnt=0;
+        for (int i=0; i<deviceCnt; i++) {
+            int isPeer;
+            hipDeviceCanAccessPeer(&isPeer, i, deviceId);
+            if (isPeer) {
+                std::cout << "gpu#" << i << " ";
+                ++PeerCnt;
+            }
+        }
+        if (PeerCnt==0)
+            std::cout << "NONE" << " ";
+
+        std::cout << std::endl;
+        std::cout << "    peer2peer not supported : ";
+        int nonPeerCnt=0;
+        for (int i=0; i<deviceCnt; i++) {
+            int isPeer;
+            hipDeviceCanAccessPeer(&isPeer, i, deviceId);
+            if (!isPeer && (i!=deviceId)) {
+                std::cout << "gpu#" << i << " ";
+                ++nonPeerCnt;
+                }
+            }
+        if (nonPeerCnt==0)
+            std::cout << "NONE" << " ";
+
+        std::cout <<"\n"<<std::endl;
+    }
+
+    std::cout << "\nNote: For non-supported peer2peer devices, memcopy will use/follow the normal behaviour (GPU1-->host then host-->GPU2)\n\n" << std::endl;
+}
+
+void enablePeer2Peer(int currentGpu, int peerGpu)
+{
+    int canAccessPeer;
+
+    hipSetDevice(currentGpu);
+    hipDeviceCanAccessPeer(&canAccessPeer, currentGpu, peerGpu);
+
+    if(canAccessPeer==1){
+        hipDeviceEnablePeerAccess(peerGpu, 0);
+    }
+}
+
+void disablePeer2Peer(int currentGpu, int peerGpu)
+{
+    int canAccessPeer;
+
+    hipSetDevice(currentGpu);
+    hipDeviceCanAccessPeer(&canAccessPeer, currentGpu, peerGpu);
+
+    if(canAccessPeer==1){
+        hipDeviceDisablePeerAccess(peerGpu);
+    }
+}
+
+std::string gpuIDToString(int gpuID)
+{
+    using namespace std;
+    stringstream ss;
+        ss << gpuID;
+    return ss.str();
+}
+
+void RunBenchmark_P2P_Unidir(ResultDatabase &resultDB)
+{
+    int gpuCount;
+    hipGetDeviceCount(&gpuCount);
+
+    int currentGpu, peerGpu;
+
+    long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
+
+    for (currentGpu=0; currentGpu<gpuCount; currentGpu++) {
+
+        for (peerGpu=0; peerGpu<gpuCount; peerGpu++){
+
+            if (currentGpu == peerGpu)
+                continue;
+
+            float *currentGpuMem, *peerGpuMem;
+
+            hipSetDevice(currentGpu);
+            hipMalloc((void**)&currentGpuMem, sizeof(float) * numMaxFloats);
+
+            hipSetDevice(peerGpu);
+            hipMalloc((void**)&peerGpuMem, sizeof(float) * numMaxFloats);
+
+            enablePeer2Peer(currentGpu, peerGpu);
+
+            hipEvent_t start, stop;
+            hipEventCreate(&start);
+            hipEventCreate(&stop);
+            CHECK_HIP_ERROR();
+
+            // Three passes, forward and backward both
+            for (int pass = 0; pass < p_iterations; pass++)
+            {
+                // store the times temporarily to estimate latency
+                //float times[nSizes];
+                // Step through sizes forward on even passes and backward on odd
+                for (int i = 0; i < nSizes; i++)
+                {
+                    int sizeIndex;
+                    if ((pass % 2) == 0)
+                        sizeIndex = i;
+                    else
+                        sizeIndex = (nSizes - 1) - i;
+
+                    const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+                    const int nbytes = sizeToBytes(thisSize);
+
+                    hipDeviceSynchronize();
+
+                    hipEventRecord(start, 0);
+
+                    for (int j=0;j<p_beatsperiteration;j++) {
+                        hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
+                    }
+
+                    hipEventRecord(stop, 0);
+
+                    hipEventSynchronize(stop);
+
+                    float t = 0;
+                    hipEventElapsedTime(&t, start, stop);
+                    //times[sizeIndex] = t;
+
+                    // Convert to GB/sec
+                    if (p_verbose)
+                    {
+                        std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+                    }
+
+                    double speed = (double(sizeToBytes(thisSize) * p_beatsperiteration) / (1000*1000)) / t;
+                    char sizeStr[256];
+                    if (p_beatsperiteration>1) {
+                        sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
+                    } else {
+                        sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+                    }
+
+                    string cGpu, pGpu;
+                    cGpu = gpuIDToString(currentGpu);
+                    pGpu = gpuIDToString(peerGpu);
+
+                    resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "GB/sec", speed);
+                    resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "ms", t);
+
+                    if (p_onesize) {
+                        break;
+                    }
+                }
+
+            }
+
+            if (p_onesize) {
+                numMaxFloats = sizeToBytes(p_onesize) / sizeof(float);
+            }
+
+            disablePeer2Peer(currentGpu, peerGpu);
+
+            hipEventDestroy(start);
+            hipEventDestroy(stop);
+
+            // Cleanup
+            hipFree((void*)currentGpuMem);
+            hipFree((void*)peerGpuMem);
+            CHECK_HIP_ERROR();
+
+            hipSetDevice(peerGpu);
+            hipDeviceReset();
+
+            hipSetDevice(currentGpu);
+            hipDeviceReset();
+        }
+
+    }
+
+}
+
+void RunBenchmark_P2P_Bidir(ResultDatabase &resultDB) {
+
+    int gpuCount;
+    hipGetDeviceCount(&gpuCount);
+
+    hipStream_t stream[2];
+
+    int currentGpu, peerGpu;
+
+    long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4;
+
+    for (currentGpu=0; currentGpu<gpuCount; currentGpu++) {
+
+        for (peerGpu=0; peerGpu<gpuCount; peerGpu++){
+
+            if (currentGpu == peerGpu)
+                continue;
+
+            float *currentGpuMem[2], *peerGpuMem[2];
+
+            hipSetDevice(currentGpu);
+            hipMalloc((void**)&currentGpuMem[0], sizeof(float) * numMaxFloats);
+            hipMalloc((void**)&currentGpuMem[1], sizeof(float) * numMaxFloats);
+
+            hipSetDevice(peerGpu);
+            hipMalloc((void**)&peerGpuMem[0], sizeof(float) * numMaxFloats);
+            hipMalloc((void**)&peerGpuMem[1], sizeof(float) * numMaxFloats);
+
+            enablePeer2Peer(currentGpu, peerGpu);
+
+            hipEvent_t start, stop;
+            hipEventCreate(&start);
+            hipEventCreate(&stop);
+            CHECK_HIP_ERROR();
+
+            hipStreamCreate(&stream[0]);
+            hipStreamCreate(&stream[1]);
+
+            // Three passes, forward and backward both
+            for (int pass = 0; pass < p_iterations; pass++)
+            {
+                // store the times temporarily to estimate latency
+                //float times[nSizes];
+                // Step through sizes forward on even passes and backward on odd
+                for (int i = 0; i < nSizes; i++)
+                {
+                    int sizeIndex;
+                    if ((pass % 2) == 0)
+                        sizeIndex = i;
+                    else
+                        sizeIndex = (nSizes - 1) - i;
+
+                    const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
+                    const int nbytes = sizeToBytes(thisSize);
+
+                    hipDeviceSynchronize();
+
+                    hipEventRecord(start, 0);
+
+                    for (int j=0;j<p_beatsperiteration;j++) {
+                        hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes, hipMemcpyDeviceToDevice, stream[0]);
+                        hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes, hipMemcpyDeviceToDevice, stream[1]);
+                    }
+
+                    hipEventRecord(stop, 0);
+
+                    hipEventSynchronize(stop);
+
+                    float t = 0;
+                    hipEventElapsedTime(&t, start, stop);
+                    //times[sizeIndex] = t;
+
+                    // Convert to GB/sec
+                    if (p_verbose)
+                    {
+                        std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
+                    }
+
+                    double speed = (double(sizeToBytes(thisSize) * p_beatsperiteration) / (1000*1000)) / t;
+                    char sizeStr[256];
+                    if (p_beatsperiteration>1) {
+                        sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
+                    } else {
+                        sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
+                    }
+
+                    string cGpu, pGpu;
+                    cGpu = gpuIDToString(currentGpu);
+                    pGpu = gpuIDToString(peerGpu);
+
+                    resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "GB/sec", speed);
+                    resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "ms", t);
+
+                    if (p_onesize) {
+                        break;
+                    }
+                }
+
+            }
+
+            if (p_onesize) {
+                numMaxFloats = sizeToBytes(p_onesize) / sizeof(float);
+            }
+
+            disablePeer2Peer(currentGpu, peerGpu);
+
+            hipEventDestroy(start);
+            hipEventDestroy(stop);
+
+            for (int i=0; i<2; i++) {
+                hipStreamDestroy(stream[i]);
+
+                hipFree((void*)currentGpuMem[i]);
+                hipFree((void*)peerGpuMem[i]);
+                CHECK_HIP_ERROR();
+            }
+
+            hipSetDevice(peerGpu);
+            hipDeviceReset();
+
+            hipSetDevice(currentGpu);
+            hipDeviceReset();
+        }
+    }
+}
+
+
 void printConfig() {
    hipDeviceProp_t props;
    hipGetDeviceProperties(&props, p_device);
@@ -662,9 +987,9 @@ void help() {
    printf ("  --d2h                    : Run only device-to-host test.\n");
    printf ("  --h2d                    : Run only host-to-device test.\n");
    printf ("  --bidir                  : Run only bidir copy test.\n");
+    printf ("  --p2p                    : Run only peer2peer unidir and bidir copy tests.\n");
    printf ("  --verbose                : Print verbose status messages as test is run.\n");
    printf ("  --detailed               : Print detailed report (including all trials).\n");
-
    printf ("  --async                  : Use hipMemcpyAsync(with NULL stream) for H2D/D2H.  Default uses hipMemcpy.\n");
    printf ("  --onesize, -o            : Only run one measurement, at specified size (in KB, or if negative in bytes)\n");

@@ -712,6 +1037,12 @@ int parseStandardArguments(int argc, char *argv[])
            p_d2h   = false;
            p_bidir = true;

+        } else if (!strcmp(arg, "--p2p")) {
+            p_h2d   = false;
+            p_d2h   = false;
+            p_bidir = false;
+            p_p2p   = true;
+
        } else if (!strcmp(arg, "--help")  || (!strcmp(arg, "-h"))) {
            help();
            exit(EXIT_SUCCESS);
@@ -737,39 +1068,57 @@ int main(int argc, char *argv[])
 {
    parseStandardArguments(argc, argv);

-    printConfig();
+    if (p_p2p) {
+        checkPeer2PeerSupport();

-    if (p_h2d) {
-        ResultDatabase resultDB;
-        RunBenchmark_H2D(resultDB);
+        ResultDatabase resultDB_Unidir, resultDB_Bidir;

-        resultDB.DumpSummary(std::cout);
+        RunBenchmark_P2P_Unidir(resultDB_Unidir);
+        RunBenchmark_P2P_Bidir(resultDB_Bidir);
+
+        resultDB_Unidir.DumpSummary(std::cout);
+        resultDB_Bidir.DumpSummary(std::cout);

        if (p_detailed) {
-            resultDB.DumpDetailed(std::cout);
+            resultDB_Unidir.DumpDetailed(std::cout);
+            resultDB_Bidir.DumpDetailed(std::cout);
        }
    }
+    else {
+        printConfig();

-    if (p_d2h) {
-        ResultDatabase resultDB;
-        RunBenchmark_D2H(resultDB);
+        if (p_h2d) {
+            ResultDatabase resultDB;
+            RunBenchmark_H2D(resultDB);

-        resultDB.DumpSummary(std::cout);
+            resultDB.DumpSummary(std::cout);

-        if (p_detailed) {
-            resultDB.DumpDetailed(std::cout);
+            if (p_detailed) {
+                resultDB.DumpDetailed(std::cout);
+            }
+        }
+
+        if (p_d2h) {
+            ResultDatabase resultDB;
+            RunBenchmark_D2H(resultDB);
+
+            resultDB.DumpSummary(std::cout);
+
+            if (p_detailed) {
+                resultDB.DumpDetailed(std::cout);
+            }
        }
-    }


-    if (p_bidir) {
-        ResultDatabase resultDB;
-        RunBenchmark_Bidir(resultDB);
+        if (p_bidir) {
+            ResultDatabase resultDB;
+            RunBenchmark_Bidir(resultDB);

-        resultDB.DumpSummary(std::cout);
+            resultDB.DumpSummary(std::cout);

-        if (p_detailed) {
-            resultDB.DumpDetailed(std::cout);
+            if (p_detailed) {
+                resultDB.DumpDetailed(std::cout);
+            }
        }
    }
 }