SWDEV-269604 - Revert "Update dispatch latency with wall time measurements"

This reverts commit 416d2371e6. Hold wall time related updates till direct dispatch is ready. Change-Id: I53b232f6f51bc2fc71b6b639fe0081e2907e9707 [ROCm/hip-tests commit: eff5d6cf6c]
2021-02-25 04:48:40 +00:00
@@ -25,97 +25,33 @@ THE SOFTWARE.
 #include <chrono>
 #include <algorithm>

-#include <fstream>
-#include <vector>
-#include <iterator>
-#include <string>
 #define NUM_GROUPS 1
 #define GROUP_SIZE 1
-#define WARMUP_RUN_COUNT 100
-#define TIMING_RUN_COUNT 1000
+#define WARMUP_RUN_COUNT 10
+#define TIMING_RUN_COUNT 100
 #define TOTAL_RUN_COUNT WARMUP_RUN_COUNT + TIMING_RUN_COUNT
 #define BATCH_SIZE 1000

 #define FILE_NAME "test_kernel.code"
 #define KERNEL_NAME "test"

-#define HIPCHECK(error)                                                         \
-{                                                                               \
-  hipError_t localError = error;                                                \
-  if (localError != hipSuccess)  {                                              \
-    printf("error: '%s'(%d) from %s at %s:%d\n",hipGetErrorString(localError),  \
-                   localError, #error, __FILE__, __LINE__);                     \
-    fflush(NULL);                                                               \
-    abort();                                                                    \
-  }                                                                             \
-}
 __global__ void EmptyKernel() { }

-class CSVDump
-{
-    std::string fName;
-    std::string delimeter;
-    int linesCount;
-public:
-    CSVDump(std::string filename, std::string delm = ",") :
-            fName(filename), delimeter(delm), linesCount(1)
-    {}
-    template<typename T>
-    void addRow(std::string test, T first, T last);
-    void addStats(std::string test, float mean, float std, float min, float max);
-};
-template<typename T>
-void CSVDump::addRow(std::string test, T first, T last)
-{
-    std::fstream file;
-    file.open(fName, std::ios::out | (linesCount ? std::ios::app : std::ios::trunc));
-    file << test;
-    file <<delimeter;
-    for (; first != last; )
-    {
-        file << *first;
-        if (++first != last)
-            file << delimeter;
-    }
-    file << "\n";
-    linesCount++;
-    file.close();
-}
-
-void CSVDump::addStats(std::string test, float mean, float std, float min, float max)
-{
-    std::fstream file;
-    file.open(fName, std::ios::out | (linesCount ? std::ios::app : std::ios::trunc));
-    file << test;
-    file <<delimeter;
-    file.precision(3);
-    file << mean <<delimeter <<std<<delimeter<<min<<delimeter<<max;
-    file << "\n";
-    linesCount++;
-    file.close();
-}
-
 void print_timing(std::string test, const std::array<float, TOTAL_RUN_COUNT> &results, int batch = 1) {
-    CSVDump writer("LaunchLatency.csv");
-    float total_us = 0.0f, mean_us = 0.0f, stddev_us = 0.0f, min_us = 0.0f, max_us = 0.0f;
-
+    
+    float total_us = 0.0f, mean_us = 0.0f, stddev_us = 0.0f;
+    
    // skip warm-up runs
    auto start_iter = std::next(results.begin(), WARMUP_RUN_COUNT);
    auto end_iter = results.end();
-    //writer.addRow(test, start_iter, end_iter);
+
    // mean
-    float min = std::numeric_limits<float>::max();
-    float max = std::numeric_limits<float>::min();
    std::for_each(start_iter, end_iter, [&](const float &run_ms) {
        total_us += (run_ms * 1000) / batch;
-        min = std::min(run_ms, min);
-        max = std::max(run_ms, max);
-    });
+    });   
    mean_us = total_us  / TIMING_RUN_COUNT;
-    min_us = (min *1000) / batch;
-    max_us = (max *1000) / batch;

-    // stddev
+   // stddev
    total_us = 0;
    std::for_each(start_iter, end_iter, [&](const float &run_ms) {
        float dev_us = ((run_ms * 1000) / batch) - mean_us;
@@ -123,43 +59,40 @@ void print_timing(std::string test, const std::array<float, TOTAL_RUN_COUNT> &re
    });
    stddev_us = sqrt(total_us / TIMING_RUN_COUNT);

-    writer.addStats(test, mean_us, stddev_us, min_us, max_us);
    // display
-    printf("\n %s: %.1f us, std: %.1f us max: %.1f us min:%.1f us\n", test.c_str(), mean_us, stddev_us, max_us, min_us);
+    printf("\n %s: %.1f us, std: %.1f us\n", test.c_str(), mean_us, stddev_us);
 }

-int main() {
+int main() {   
    hipStream_t stream0 = 0;
    hipDevice_t device;
-    HIPCHECK(hipDeviceGet(&device, 0));
-    hipCtx_t context;
-    HIPCHECK(hipCtxCreate(&context, 0, device));
+    hipDeviceGet(&device, 0);
+    hipCtx_t context;     
+    hipCtxCreate(&context, 0, device); 
    hipModule_t module;
    hipFunction_t function;
-    HIPCHECK(hipModuleLoad(&module, FILE_NAME));
-    HIPCHECK(hipModuleGetFunction(&function, module, KERNEL_NAME));
+    hipModuleLoad(&module, FILE_NAME);
+    hipModuleGetFunction(&function, module, KERNEL_NAME);
    void* params = nullptr;
-
+    
    std::array<float, TOTAL_RUN_COUNT> results;
    hipEvent_t start, stop;
-    HIPCHECK(hipEventCreate(&start));
-    HIPCHECK(hipEventCreate(&stop));
+    hipEventCreate(&start);
+    hipEventCreate(&stop);

    /************************************************************************************/
    /* HIP kernel launch enqueue rate:                                                  */
    /* Measure time taken to enqueue a kernel on the GPU                                */
-    /************************************************************************************/
+    /************************************************************************************/ 

    // Timing hipModuleLaunchKernel
    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
        auto start = std::chrono::high_resolution_clock::now();
-        HIPCHECK(hipModuleLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, &params, nullptr));
+        hipModuleLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, &params, nullptr);
        auto stop = std::chrono::high_resolution_clock::now();
        results[i] = std::chrono::duration<float, std::milli>(stop - start).count();
    }
-    print_timing("hipModuleLaunchKernel enqueue time", results);
-
-    HIPCHECK(hipDeviceSynchronize());
+    print_timing("hipModuleLaunchKernel enqueue rate", results);

    // Timing hipLaunchKernelGGL
    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
@@ -168,131 +101,41 @@ int main() {
        auto stop = std::chrono::high_resolution_clock::now();
        results[i] = std::chrono::duration<float, std::milli>(stop - start).count();
    }
-    print_timing("hipLaunchKernelGGL enqueue time", results);
-
-    HIPCHECK(hipDeviceSynchronize());
-
-#ifdef __HIP_PLATFORM_AMD__
-    //Timing hipExtLaunchKernelGGL
-    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
-        auto start_chrono = std::chrono::high_resolution_clock::now();
-        hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, start, stop, 0);
-        auto stop_chrono = std::chrono::high_resolution_clock::now();
-        results[i] = std::chrono::duration<float, std::milli>(stop_chrono - start_chrono).count();
-    }
-    print_timing("hipExtLaunchKernelGGL enqueue time", results);
-
-    HIPCHECK(hipDeviceSynchronize());
-#endif
-
-    //Timing hipExtLaunchKernelGGL
-    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
-        auto start_chrono = std::chrono::high_resolution_clock::now();
-        hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, nullptr, nullptr, 0);
-        auto stop_chrono = std::chrono::high_resolution_clock::now();
-        results[i] = std::chrono::duration<float, std::milli>(stop_chrono - start_chrono).count();
-    }
-    print_timing("hipExtLaunchKernelGGL w/o events enqueue time", results);
-
-    HIPCHECK(hipDeviceSynchronize());
+    print_timing("hipLaunchKernelGGL enqueue rate", results);

    /***********************************************************************************/
-    /* Single dispatch execution latency using HIP events:                             */
-    /* Measures latency to start & finish executing a kernel with GPU-scope visibility    */
+    /* Single dispatch execution latency using HIP events:                             */   
+    /* Measures latency to start & finish executing a kernel with GPU-scope visibility    */ 
    /***********************************************************************************/

-    //Timing directly the dispatch
-#ifdef __HIP_PLATFORM_AMD__
-    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
-        hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, start, stop, 0);
-        HIPCHECK(hipEventSynchronize(stop));
-        HIPCHECK(hipEventElapsedTime(&results[i], start, stop));
-    }
-    print_timing("Timing directly single dispatch latency", results);
-#endif
-
    //Timing around the dispatch
    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
-        HIPCHECK(hipEventRecord(start, 0));
+        hipEventRecord(start, 0);
        hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
-        HIPCHECK(hipEventRecord(stop, 0));
-        HIPCHECK(hipEventSynchronize(stop));
-        HIPCHECK(hipEventElapsedTime(&results[i], start, stop));
+        hipEventRecord(stop, 0);
+        hipEventSynchronize(stop);
+        hipEventElapsedTime(&results[i], start, stop);
    }
    print_timing("Timing around single dispatch latency", results);

-    //Timing around the dispatch
-    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
-        auto start_chrono = std::chrono::high_resolution_clock::now();
-        HIPCHECK(hipEventRecord(start, 0));
-        hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
-        HIPCHECK(hipEventRecord(stop, 0));
-        HIPCHECK(hipEventSynchronize(stop));
-        auto stop_chrono = std::chrono::high_resolution_clock::now();
-        results[i] = std::chrono::duration<float, std::milli>(stop_chrono - start_chrono).count();
-    }
-    print_timing("Wall timing around single dispatch with events", results);
-
-    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
-        auto start_chrono = std::chrono::high_resolution_clock::now();
-        hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
-        HIPCHECK(hipStreamSynchronize(stream0));
-        auto stop_chrono = std::chrono::high_resolution_clock::now();
-        results[i] = std::chrono::duration<float, std::milli>(stop_chrono - start_chrono).count();
-    }
-    print_timing("Wall timing around single dispatch without events", results);
-
-#ifdef __HIP_PLATFORM_AMD__
-    //Timing around the dispatch with hipExtLaunchKernelGGL
-    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
-        auto start_chrono = std::chrono::high_resolution_clock::now();
-        HIPCHECK(hipEventRecord(start, 0));
-        hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, start, stop, 0);
-        HIPCHECK(hipEventSynchronize(stop));
-        auto stop_chrono = std::chrono::high_resolution_clock::now();
-        results[i] = std::chrono::duration<float, std::milli>(stop_chrono - start_chrono).count();
-    }
-    print_timing("Wall timing around single dispatch ExtLaunch with events", results);
-
-    //Timing around the dispatch with hipExtLaunchKernelGGL without events
-    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
-        auto start_chrono = std::chrono::high_resolution_clock::now();
-        hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, nullptr, nullptr, 0);
-        HIPCHECK(hipStreamSynchronize(stream0));
-        auto stop_chrono = std::chrono::high_resolution_clock::now();
-        results[i] = std::chrono::duration<float, std::milli>(stop_chrono - start_chrono).count();
-    }
-    print_timing("Wall timing around single dispatch ExtLaunch w/o events", results);
-#endif
-
    /*********************************************************************************/
    /* Batch dispatch execution latency using HIP events:                            */
-    /* Measures latency to start & finish executing each dispatch in a batch    */
+    /* Measures latency to start & finish executing each dispatch in a batch    */ 
    /*********************************************************************************/

    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
-        HIPCHECK(hipEventRecord(start, 0));
-        for (int j = 0; j < BATCH_SIZE; j++) {
-            hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
-        }
-        HIPCHECK(hipEventRecord(stop, 0));
-        HIPCHECK(hipEventSynchronize(stop));
-        HIPCHECK(hipEventElapsedTime(&results[i], start, stop));
+         hipEventRecord(start, 0);
+         for (int j = 0; j < BATCH_SIZE; j++) {
+             hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
+         }
+         hipEventRecord(stop, 0);
+         hipEventSynchronize(stop);
+         hipEventElapsedTime(&results[i], start, stop);
    }
    print_timing("Batch dispatch latency", results, BATCH_SIZE);

-    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
-        auto start_chrono = std::chrono::high_resolution_clock::now();
-        for (int j = 0; j < BATCH_SIZE; j++) {
-            hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
-        }
-        HIPCHECK(hipStreamSynchronize(stream0));
-        auto stop_chrono = std::chrono::high_resolution_clock::now();
-        results[i] = std::chrono::duration<float, std::milli>(stop_chrono - start_chrono).count();
-    }
-    print_timing("Wall timing for batch dispatch latency", results, BATCH_SIZE);
-
-    HIPCHECK(hipEventDestroy(start));
-    HIPCHECK(hipEventDestroy(stop));
-    HIPCHECK(hipCtxDestroy(context));
+    hipEventDestroy(start);
+    hipEventDestroy(stop);
+    hipCtxDestroy(context);
 }
+