From f9ef1553aacb80eef86a8c8e5d7a2aab1e18bd58 Mon Sep 17 00:00:00 2001
From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com>
Date: Fri, 1 Nov 2019 10:18:25 -0600
Subject: [PATCH] Single Sync Timing mode (#144)

* Adding single sync timing mode to emulate timing reported by rccl-prim-test / rccl-tests
* Adding duration / overhead info


[ROCm/rccl commit: 2f9edd24327e266fb6cffc3787bcad38746531dd]
---
 .../tools/TransferBench/TransferBench.cpp     | 113 +++++++++++++-----
 1 file changed, 83 insertions(+), 30 deletions(-)

diff --git a/projects/rccl/tools/TransferBench/TransferBench.cpp b/projects/rccl/tools/TransferBench/TransferBench.cpp
index 2ec45376e5..d7bd289fc1 100644
--- a/projects/rccl/tools/TransferBench/TransferBench.cpp
+++ b/projects/rccl/tools/TransferBench/TransferBench.cpp
@@ -49,9 +49,10 @@ int main(int argc, char **argv)
         printf("\n");
         printf("Environment variables:\n");
         printf("======================\n");
-        printf(" USE_HIP_CALL   - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n");
-        printf(" USE_MEMSET     - Write constant value (instead of doing a copy)\n");
-        printf(" USE_COARSE_MEM - Use coarse-grained dst GPU memory (instead of fine-grained)\n");
+        printf(" USE_HIP_CALL    - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n");
+        printf(" USE_MEMSET      - Write constant value (instead of doing a copy)\n");
+        printf(" USE_COARSE_MEM  - Use coarse-grained dst GPU memory (instead of fine-grained)\n");
+        printf(" USE_SINGLE_SYNC - Only synchronize once at end of iterations (disables GPU times)\n");
         exit(0);
     }
 
@@ -68,6 +69,8 @@ int main(int argc, char **argv)
     bool useHipCall = getenv("USE_HIP_CALL");
     bool useMemset  = getenv("USE_MEMSET");
     bool useCoarseMem = getenv("USE_COARSE_MEM");
+    bool useSingleSync = getenv("USE_SINGLE_SYNC");
+
     printf("Running %s%s tests (control using USE_HIP_CALL/USE_MEMSET)\n",
            useHipCall ? "hipMem" : "mem",
            useMemset ? "set" : "cpy");
@@ -80,6 +83,10 @@ int main(int argc, char **argv)
       else
         printf("Using DMA copy engines (disable by setting HSA_ENABLE_SDMA=0)\n");
     }
+    if (useSingleSync)
+      printf("Synchronizing only once, after all iterations (disables GPU timers)\n");
+    else
+      printf("Synchronizing per iteration  (disable via USE_SINGLE_SYNC)\n");
 
     // Currently an environment variable is required in order to enable fine-grained VRAM allocations
     if (!useCoarseMem && !getenv("HSA_FORCE_FINE_GRAIN_PCIE"))
@@ -98,12 +105,24 @@ int main(int argc, char **argv)
     }
 
     // Print header
-    printf("%-*s(GB/s)", MAX_NAME_LEN - 6, "Configuration");
+    printf("%*s", MAX_NAME_LEN, "");
+    printf("%*s | ", 8*(numDevices+1), "Bandwidth (GB/s)");
+    printf("%*s", 8*(numDevices+1), "Duration (msec)");
+    printf(" | Overhead\n");
+    printf("%-*s", MAX_NAME_LEN, "Configuration");
     for (int i = 0; i < numDevices; i++)
-        printf("  GPU %02d", i);
-    printf("   Total\n");
-    for (int i = 0; i < MAX_NAME_LEN + 8 * (numDevices + 1); i++) printf("=");
-    printf("\n");
+      printf("  GPU %02d", i);
+    printf("   Total");
+    printf(" | ");
+    for (int i = 0; i < numDevices; i++)
+      printf("  GPU %02d", i);
+    printf(" CpuTime");
+    printf(" |   (msec)\n");
+
+    for (int i = 0; i < MAX_NAME_LEN + (8 * (numDevices + 1)); i++) printf("=");
+    printf("=|=");
+    for (int i = 0; i < (8 * (numDevices + 1)); i++) printf("=");
+    printf("=|=========\n");
 
     // Read configuration file
     FILE* fp = fopen(argv[1], "r");
@@ -224,7 +243,9 @@ int main(int argc, char **argv)
             for (int i = 0; i < numLinks; i++)
             {
                 HIP_CALL(hipSetDevice(links[i].srcGpu));
+
                 HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
+
                 if (useHipCall)
                 {
                   if (useMemset)
@@ -262,8 +283,13 @@ int main(int argc, char **argv)
                 HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
             }
 
-            for (int i = 0; i < numLinks; i++)
+            // Synchronize after every iteration, unless in single sync mode, in which case
+            // synchronize only after the last warmup (iteration == -1) and the final timed iteration
+            if (!useSingleSync || iteration == -1  || iteration == numIterations - 1)
+            {
+              for (int i = 0; i < numLinks; i++)
                 hipStreamSynchronize(streams[i]);
+            }
 
             auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
             double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
@@ -274,26 +300,35 @@ int main(int argc, char **argv)
 
                 for (int i = 0; i < numDevices; i++)
                 {
+                  // In single sync mode, collect GPU event timings on the last iteration only (earlier iterations reset the total to zero)
+                  if (useSingleSync && iteration != numIterations - 1)
+                  {
+                    totalGpuTime[i] = 0.00;
+                  }
+                  else
+                  {
                     // Multiple links running on the same device may be running simultaneously
                     // so try to figure out the first/last event across all links
                     float maxTime = 0.0f;
                     for (int j = 0; j < numLinks; j++)
                     {
-                        if (links[j].srcGpu != i) continue;
-                        for (int k = 0; k < numLinks; k++)
-                        {
-                            if (links[k].srcGpu != i) continue;
+                      if (links[j].srcGpu != i) continue;
+                      for (int k = 0; k < numLinks; k++)
+                      {
+                        if (links[k].srcGpu != i) continue;
 
-                            float gpuDeltaMsec;
-                            HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvents[j], stopEvents[k]));
-                            maxTime = std::max(maxTime, gpuDeltaMsec);
-                        }
+                        float gpuDeltaMsec;
+                        HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvents[j], stopEvents[k]));
+                        maxTime = std::max(maxTime, gpuDeltaMsec);
+                      }
                     }
                     totalGpuTime[i] += maxTime / 1000.0;
+                  }
                 }
             }
         }
 
+
         // Validate that each link has transferred correctly
         for (int i = 0; i < numLinks; i++)
           CheckOrFill(N, linkDstMem[i], true, useMemset, useHipCall);
@@ -302,20 +337,36 @@ int main(int argc, char **argv)
         printf("%-*s", MAX_NAME_LEN, name);
         for (int i = 0; i < numDevices; i++)
         {
-            if (linkCount[i] == 0)
-            {
-                printf("%8.3f", 0.0);
-            }
-            else
-            {
-                totalGpuTime[i] /= (1.0 * numIterations);
-                printf("%8.3f", (linkCount[i] * numBytesPerLink / 1.0E9) / totalGpuTime[i]);
-            }
+          if (linkCount[i] == 0)
+          {
+            printf("%8.3f", 0.0f);
+          }
+          else
+          {
+            if (!useSingleSync)
+              totalGpuTime[i] /= (1.0 * numIterations);
+            printf("%8.3f", (linkCount[i] * numBytesPerLink / 1.0E9) / totalGpuTime[i]);
+          }
         }
-
         // Print off bandwidth (based on CPU wall-time timer)
         totalCpuTime /= numIterations;
-        printf("%8.3f\n", (numLinks * numBytesPerLink / 1.0E9) / totalCpuTime);
+        printf("%8.3f", (numLinks * numBytesPerLink / 1.0E9) / totalCpuTime);
+        printf(" | ");
+
+        double maxGpuTime = 0;
+        for (int i = 0; i < numDevices; i++)
+        {
+          if (linkCount[i] == 0)
+          {
+            printf("%8.3f", 0.0f);
+          }
+          else
+          {
+            printf("%8.3f", totalGpuTime[i] * 1000.0f);
+            maxGpuTime = std::max(maxGpuTime, totalGpuTime[i]);
+          }
+        }
+        printf("%8.3f | %8.3f\n", totalCpuTime * 1000.0f, (totalCpuTime - maxGpuTime) * 1000.0f);
 
         // Release GPU memory
         for (int i = 0; i < numLinks; i++)
@@ -332,8 +383,10 @@ int main(int argc, char **argv)
     fclose(fp);
 
     // Print link information
-    for (int i = 0; i < MAX_NAME_LEN + 8 * (numDevices + 1); i++) printf("=");
-    printf("\n");
+    for (int i = 0; i < MAX_NAME_LEN + (8 * (numDevices + 1)); i++) printf("=");
+    printf("=|=");
+    for (int i = 0; i < (8 * (numDevices + 1)); i++) printf("=");
+    printf("=|=========\n");
     printf("Link topology:\n");
     uint32_t linkType;
     uint32_t hopCount;