From f9ef1553aacb80eef86a8c8e5d7a2aab1e18bd58 Mon Sep 17 00:00:00 2001 From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com> Date: Fri, 1 Nov 2019 10:18:25 -0600 Subject: [PATCH] Single Sync Timing mode (#144) * Adding single sync timing mode to emulate timing reported by rccl-prim-test / rccl-tests * Adding duration / overhead info [ROCm/rccl commit: 2f9edd24327e266fb6cffc3787bcad38746531dd] --- .../tools/TransferBench/TransferBench.cpp | 113 +++++++++++++----- 1 file changed, 83 insertions(+), 30 deletions(-) diff --git a/projects/rccl/tools/TransferBench/TransferBench.cpp b/projects/rccl/tools/TransferBench/TransferBench.cpp index 2ec45376e5..d7bd289fc1 100644 --- a/projects/rccl/tools/TransferBench/TransferBench.cpp +++ b/projects/rccl/tools/TransferBench/TransferBench.cpp @@ -49,9 +49,10 @@ int main(int argc, char **argv) printf("\n"); printf("Environment variables:\n"); printf("======================\n"); - printf(" USE_HIP_CALL - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n"); - printf(" USE_MEMSET - Write constant value (instead of doing a copy)\n"); - printf(" USE_COARSE_MEM - Use coarse-grained dst GPU memory (instead of fine-grained)\n"); + printf(" USE_HIP_CALL - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n"); + printf(" USE_MEMSET - Write constant value (instead of doing a copy)\n"); + printf(" USE_COARSE_MEM - Use coarse-grained dst GPU memory (instead of fine-grained)\n"); + printf(" USE_SINGLE_SYNC - Only synchronize once at end of iterations (disables GPU times)\n"); exit(0); } @@ -68,6 +69,8 @@ int main(int argc, char **argv) bool useHipCall = getenv("USE_HIP_CALL"); bool useMemset = getenv("USE_MEMSET"); bool useCoarseMem = getenv("USE_COARSE_MEM"); + bool useSingleSync = getenv("USE_SINGLE_SYNC"); + printf("Running %s%s tests (control using USE_HIP_CALL/USE_MEMSET)\n", useHipCall ? "hipMem" : "mem", useMemset ? 
"set" : "cpy"); @@ -80,6 +83,10 @@ int main(int argc, char **argv) else printf("Using DMA copy engines (disable by setting HSA_ENABLE_SDMA=0)\n"); } + if (useSingleSync) + printf("Synchronizing only once, after all iterations (disables GPU timers)\n"); + else + printf("Synchronizing per iteration (disable via USE_SINGLE_SYNC)\n"); // Currently an environment variable is required in order to enable fine-grained VRAM allocations if (!useCoarseMem && !getenv("HSA_FORCE_FINE_GRAIN_PCIE")) @@ -98,12 +105,24 @@ int main(int argc, char **argv) } // Print header - printf("%-*s(GB/s)", MAX_NAME_LEN - 6, "Configuration"); + printf("%*s", MAX_NAME_LEN, ""); + printf("%*s | ", 8*(numDevices+1), "Bandwidth (GB/s)"); + printf("%*s", 8*(numDevices+1), "Duration (msec)"); + printf(" | Overhead\n"); + printf("%-*s", MAX_NAME_LEN, "Configuration"); for (int i = 0; i < numDevices; i++) - printf(" GPU %02d", i); - printf(" Total\n"); - for (int i = 0; i < MAX_NAME_LEN + 8 * (numDevices + 1); i++) printf("="); - printf("\n"); + printf(" GPU %02d", i); + printf(" Total"); + printf(" | "); + for (int i = 0; i < numDevices; i++) + printf(" GPU %02d", i); + printf(" CpuTime"); + printf(" | (msec)\n"); + + for (int i = 0; i < MAX_NAME_LEN + (8 * (numDevices + 1)); i++) printf("="); + printf("=|="); + for (int i = 0; i < (8 * (numDevices + 1)); i++) printf("="); + printf("=|=========\n"); // Read configuration file FILE* fp = fopen(argv[1], "r"); @@ -224,7 +243,9 @@ int main(int argc, char **argv) for (int i = 0; i < numLinks; i++) { HIP_CALL(hipSetDevice(links[i].srcGpu)); + HIP_CALL(hipEventRecord(startEvents[i], streams[i])); + if (useHipCall) { if (useMemset) @@ -262,8 +283,13 @@ int main(int argc, char **argv) HIP_CALL(hipEventRecord(stopEvents[i], streams[i])); } - for (int i = 0; i < numLinks; i++) + // Synchronize per iteration, unless in single sync mode, in which case + // synchronize during last warmup / last actual iteration + if (!useSingleSync || iteration == -1 || iteration == 
numIterations - 1) + { + for (int i = 0; i < numLinks; i++) hipStreamSynchronize(streams[i]); + } auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count(); @@ -274,26 +300,35 @@ int main(int argc, char **argv) for (int i = 0; i < numDevices; i++) { + // Collect GPU information only if this is the last iteration for single sync mode + if (useSingleSync && iteration != numIterations - 1) + { + totalGpuTime[i] = 0.00; + } + else + { // Multiple links running on the same device may be running simultaneously // so try to figure out the first/last event across all links float maxTime = 0.0f; for (int j = 0; j < numLinks; j++) { - if (links[j].srcGpu != i) continue; - for (int k = 0; k < numLinks; k++) - { - if (links[k].srcGpu != i) continue; + if (links[j].srcGpu != i) continue; + for (int k = 0; k < numLinks; k++) + { + if (links[k].srcGpu != i) continue; - float gpuDeltaMsec; - HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvents[j], stopEvents[k])); - maxTime = std::max(maxTime, gpuDeltaMsec); - } + float gpuDeltaMsec; + HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvents[j], stopEvents[k])); + maxTime = std::max(maxTime, gpuDeltaMsec); + } } totalGpuTime[i] += maxTime / 1000.0; + } } } } + // Validate that each link has transferred correctly for (int i = 0; i < numLinks; i++) CheckOrFill(N, linkDstMem[i], true, useMemset, useHipCall); @@ -302,20 +337,36 @@ int main(int argc, char **argv) printf("%-*s", MAX_NAME_LEN, name); for (int i = 0; i < numDevices; i++) { - if (linkCount[i] == 0) - { - printf("%8.3f", 0.0); - } - else - { - totalGpuTime[i] /= (1.0 * numIterations); - printf("%8.3f", (linkCount[i] * numBytesPerLink / 1.0E9) / totalGpuTime[i]); - } + if (linkCount[i] == 0) + { + printf("%8.3f", 0.0f); + } + else + { + if (!useSingleSync) + totalGpuTime[i] /= (1.0 * numIterations); + printf("%8.3f", (linkCount[i] * numBytesPerLink / 1.0E9) / totalGpuTime[i]); + } } - // Print off
bandwidth (based on CPU wall-time timer) totalCpuTime /= numIterations; - printf("%8.3f\n", (numLinks * numBytesPerLink / 1.0E9) / totalCpuTime); + printf("%8.3f", (numLinks * numBytesPerLink / 1.0E9) / totalCpuTime); + printf(" | "); + + double maxGpuTime = 0; + for (int i = 0; i < numDevices; i++) + { + if (linkCount[i] == 0) + { + printf("%8.3f", 0.0f); + } + else + { + printf("%8.3f", totalGpuTime[i] * 1000.0f); + maxGpuTime = std::max(maxGpuTime, totalGpuTime[i]); + } + } + printf("%8.3f | %8.3f\n", totalCpuTime * 1000.0f, (totalCpuTime - maxGpuTime) * 1000.0f); // Release GPU memory for (int i = 0; i < numLinks; i++) @@ -332,8 +383,10 @@ int main(int argc, char **argv) fclose(fp); // Print link information - for (int i = 0; i < MAX_NAME_LEN + 8 * (numDevices + 1); i++) printf("="); - printf("\n"); + for (int i = 0; i < MAX_NAME_LEN + (8 * (numDevices + 1)); i++) printf("="); + printf("=|="); + for (int i = 0; i < (8 * (numDevices + 1)); i++) printf("="); + printf("=|=========\n"); printf("Link topology:\n"); uint32_t linkType; uint32_t hopCount;