Single Sync Timing mode (#144)

* Adding single sync timing mode to emulate timing reported by rccl-prim-test / rccl-tests
* Adding duration / overhead info

[ROCm/rccl commit: 2f9edd2432]
2019-11-01 10:18:25 -06:00
@@ -49,9 +49,10 @@ int main(int argc, char **argv)
        printf("\n");
        printf("Environment variables:\n");
        printf("======================\n");
-        printf(" USE_HIP_CALL   - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n");
-        printf(" USE_MEMSET     - Write constant value (instead of doing a copy)\n");
-        printf(" USE_COARSE_MEM - Use coarse-grained dst GPU memory (instead of fine-grained)\n");
+        printf(" USE_HIP_CALL    - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n");
+        printf(" USE_MEMSET      - Write constant value (instead of doing a copy)\n");
+        printf(" USE_COARSE_MEM  - Use coarse-grained dst GPU memory (instead of fine-grained)\n");
+        printf(" USE_SINGLE_SYNC - Only synchronize once at end of iterations (disables GPU times)\n");
        exit(0);
    }

@@ -68,6 +69,8 @@ int main(int argc, char **argv)
    bool useHipCall = getenv("USE_HIP_CALL");
    bool useMemset  = getenv("USE_MEMSET");
    bool useCoarseMem = getenv("USE_COARSE_MEM");
+    bool useSingleSync = getenv("USE_SINGLE_SYNC");
+
    printf("Running %s%s tests (control using USE_HIP_CALL/USE_MEMSET)\n",
           useHipCall ? "hipMem" : "mem",
           useMemset ? "set" : "cpy");
@@ -80,6 +83,10 @@ int main(int argc, char **argv)
      else
        printf("Using DMA copy engines (disable by setting HSA_ENABLE_SDMA=0)\n");
    }
+    if (useSingleSync)
+      printf("Synchronizing only once, after all iterations (disables GPU timers)\n");
+    else
+      printf("Synchronizing per iteration  (disable via USE_SINGLE_SYNC)\n");

    // Currently an environment variable is required in order to enable fine-grained VRAM allocations
    if (!useCoarseMem && !getenv("HSA_FORCE_FINE_GRAIN_PCIE"))
@@ -98,12 +105,24 @@ int main(int argc, char **argv)
    }

    // Print header
-    printf("%-*s(GB/s)", MAX_NAME_LEN - 6, "Configuration");
+    printf("%*s", MAX_NAME_LEN, "");
+    printf("%*s | ", 8*(numDevices+1), "Bandwidth (GB/s)");
+    printf("%*s", 8*(numDevices+1), "Duration (msec)");
+    printf(" | Overhead\n");
+    printf("%-*s", MAX_NAME_LEN, "Configuration");
    for (int i = 0; i < numDevices; i++)
-        printf("  GPU %02d", i);
-    printf("   Total\n");
-    for (int i = 0; i < MAX_NAME_LEN + 8 * (numDevices + 1); i++) printf("=");
-    printf("\n");
+      printf("  GPU %02d", i);
+    printf("   Total");
+    printf(" | ");
+    for (int i = 0; i < numDevices; i++)
+      printf("  GPU %02d", i);
+    printf(" CpuTime");
+    printf(" |   (msec)\n");
+
+    for (int i = 0; i < MAX_NAME_LEN + (8 * (numDevices + 1)); i++) printf("=");
+    printf("=|=");
+    for (int i = 0; i < (8 * (numDevices + 1)); i++) printf("=");
+    printf("=|=========\n");

    // Read configuration file
    FILE* fp = fopen(argv[1], "r");
@@ -224,7 +243,9 @@ int main(int argc, char **argv)
            for (int i = 0; i < numLinks; i++)
            {
                HIP_CALL(hipSetDevice(links[i].srcGpu));
+
                HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
+
                if (useHipCall)
                {
                  if (useMemset)
@@ -262,8 +283,13 @@ int main(int argc, char **argv)
                HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
            }

-            for (int i = 0; i < numLinks; i++)
+            // Synchronize per iteration, unless in single sync mode, in which case
+            // synchronize during last warmup / last actual iteration
+            if (!useSingleSync || iteration == -1  || iteration == numIterations - 1)
+            {
+              for (int i = 0; i < numLinks; i++)
                hipStreamSynchronize(streams[i]);
+            }

            auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
            double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
@@ -274,26 +300,35 @@ int main(int argc, char **argv)

                for (int i = 0; i < numDevices; i++)
                {
+                  // Collect GPU information only if this is the last iteration for single sync mode
+                  if (useSingleSync && iteration != numIterations - 1)
+                  {
+                    totalGpuTime[i] = 0.00;
+                  }
+                  else
+                  {
                    // Multiple links running on the same device may be running simultaneously
                    // so try to figure out the first/last event across all links
                    float maxTime = 0.0f;
                    for (int j = 0; j < numLinks; j++)
                    {
-                        if (links[j].srcGpu != i) continue;
-                        for (int k = 0; k < numLinks; k++)
-                        {
-                            if (links[k].srcGpu != i) continue;
+                      if (links[j].srcGpu != i) continue;
+                      for (int k = 0; k < numLinks; k++)
+                      {
+                        if (links[k].srcGpu != i) continue;

-                            float gpuDeltaMsec;
-                            HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvents[j], stopEvents[k]));
-                            maxTime = std::max(maxTime, gpuDeltaMsec);
-                        }
+                        float gpuDeltaMsec;
+                        HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvents[j], stopEvents[k]));
+                        maxTime = std::max(maxTime, gpuDeltaMsec);
+                      }
                    }
                    totalGpuTime[i] += maxTime / 1000.0;
+                  }
                }
            }
        }

+
        // Validate that each link has transferred correctly
        for (int i = 0; i < numLinks; i++)
          CheckOrFill(N, linkDstMem[i], true, useMemset, useHipCall);
@@ -302,20 +337,36 @@ int main(int argc, char **argv)
        printf("%-*s", MAX_NAME_LEN, name);
        for (int i = 0; i < numDevices; i++)
        {
-            if (linkCount[i] == 0)
-            {
-                printf("%8.3f", 0.0);
-            }
-            else
-            {
-                totalGpuTime[i] /= (1.0 * numIterations);
-                printf("%8.3f", (linkCount[i] * numBytesPerLink / 1.0E9) / totalGpuTime[i]);
-            }
+          if (linkCount[i] == 0)
+          {
+            printf("%8.3f", 0.0f);
+          }
+          else
+          {
+            if (!useSingleSync)
+              totalGpuTime[i] /= (1.0 * numIterations);
+            printf("%8.3f", (linkCount[i] * numBytesPerLink / 1.0E9) / totalGpuTime[i]);
+          }
        }
-
        // Print off bandwidth (based on CPU wall-time timer)
        totalCpuTime /= numIterations;
-        printf("%8.3f\n", (numLinks * numBytesPerLink / 1.0E9) / totalCpuTime);
+        printf("%8.3f", (numLinks * numBytesPerLink / 1.0E9) / totalCpuTime);
+        printf(" | ");
+
+        double maxGpuTime = 0;
+        for (int i = 0; i < numDevices; i++)
+        {
+          if (linkCount[i] == 0)
+          {
+            printf("%8.3f", 0.0f);
+          }
+          else
+          {
+            printf("%8.3f", totalGpuTime[i] * 1000.0f);
+            maxGpuTime = std::max(maxGpuTime, totalGpuTime[i]);
+          }
+        }
+        printf("%8.3f | %8.3f\n", totalCpuTime * 1000.0f, (totalCpuTime - maxGpuTime) * 1000.0f);

        // Release GPU memory
        for (int i = 0; i < numLinks; i++)
@@ -332,8 +383,10 @@ int main(int argc, char **argv)
    fclose(fp);

    // Print link information
-    for (int i = 0; i < MAX_NAME_LEN + 8 * (numDevices + 1); i++) printf("=");
-    printf("\n");
+    for (int i = 0; i < MAX_NAME_LEN + (8 * (numDevices + 1)); i++) printf("=");
+    printf("=|=");
+    for (int i = 0; i < (8 * (numDevices + 1)); i++) printf("=");
+    printf("=|=========\n");
    printf("Link topology:\n");
    uint32_t linkType;
    uint32_t hopCount;