diff --git a/hipamd/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp b/hipamd/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp index f34c6852bd..7e82352977 100644 --- a/hipamd/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp +++ b/hipamd/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp @@ -25,33 +25,97 @@ THE SOFTWARE. #include #include +#include +#include +#include +#include #define NUM_GROUPS 1 #define GROUP_SIZE 1 -#define WARMUP_RUN_COUNT 10 -#define TIMING_RUN_COUNT 100 +#define WARMUP_RUN_COUNT 100 +#define TIMING_RUN_COUNT 1000 #define TOTAL_RUN_COUNT WARMUP_RUN_COUNT + TIMING_RUN_COUNT #define BATCH_SIZE 1000 #define FILE_NAME "test_kernel.code" #define KERNEL_NAME "test" +#define HIPCHECK(error) \ +{ \ + hipError_t localError = error; \ + if (localError != hipSuccess) { \ + printf("error: '%s'(%d) from %s at %s:%d\n",hipGetErrorString(localError), \ + localError, #error, __FILE__, __LINE__); \ + fflush(NULL); \ + abort(); \ + } \ +} __global__ void EmptyKernel() { } +class CSVDump +{ + std::string fName; + std::string delimeter; + int linesCount; +public: + CSVDump(std::string filename, std::string delm = ",") : + fName(filename), delimeter(delm), linesCount(1) + {} + template + void addRow(std::string test, T first, T last); + void addStats(std::string test, float mean, float std, float min, float max); +}; +template +void CSVDump::addRow(std::string test, T first, T last) +{ + std::fstream file; + file.open(fName, std::ios::out | (linesCount ? std::ios::app : std::ios::trunc)); + file << test; + file < &results, int batch = 1) { - - float total_us = 0.0f, mean_us = 0.0f, stddev_us = 0.0f; - + CSVDump writer("LaunchLatency.csv"); + float total_us = 0.0f, mean_us = 0.0f, stddev_us = 0.0f, min_us = 0.0f, max_us = 0.0f; + // skip warm-up runs auto start_iter = std::next(results.begin(), WARMUP_RUN_COUNT); auto end_iter = results.end(); - + //writer.addRow(test, start_iter, end_iter); // mean + float min = std::numeric_limits::max(); + float max = std::numeric_limits::min(); std::for_each(start_iter, end_iter, [&](const float &run_ms) { total_us += (run_ms * 1000) / batch; - }); + min = std::min(run_ms, min); + max = std::max(run_ms, max); + }); mean_us = total_us / TIMING_RUN_COUNT; + min_us = (min *1000) / batch; + max_us = (max *1000) / batch; - // stddev + // stddev total_us = 0; std::for_each(start_iter, end_iter, [&](const float &run_ms) { float dev_us = ((run_ms * 1000) / batch) - mean_us; @@ -59,40 +123,43 @@ void print_timing(std::string test, const std::array &re }); stddev_us = sqrt(total_us / TIMING_RUN_COUNT); + writer.addStats(test, mean_us, stddev_us, min_us, max_us); // display - printf("\n %s: %.1f us, std: %.1f us\n", test.c_str(), mean_us, stddev_us); + printf("\n %s: %.1f us, std: %.1f us max: %.1f us min:%.1f us\n", test.c_str(), mean_us, stddev_us, max_us, min_us); } -int main() { +int main() { hipStream_t stream0 = 0; hipDevice_t device; - hipDeviceGet(&device, 0); - hipCtx_t context; - hipCtxCreate(&context, 0, device); + HIPCHECK(hipDeviceGet(&device, 0)); + hipCtx_t context; + HIPCHECK(hipCtxCreate(&context, 0, device)); hipModule_t module; hipFunction_t function; - hipModuleLoad(&module, FILE_NAME); - hipModuleGetFunction(&function, module, KERNEL_NAME); + HIPCHECK(hipModuleLoad(&module, FILE_NAME)); + HIPCHECK(hipModuleGetFunction(&function, module, KERNEL_NAME)); void* params = nullptr; - + std::array results; hipEvent_t start, stop; - hipEventCreate(&start); - hipEventCreate(&stop); + HIPCHECK(hipEventCreate(&start)); + HIPCHECK(hipEventCreate(&stop)); /************************************************************************************/ /* HIP kernel launch enqueue rate: */ /* Measure time taken to enqueue a kernel on the GPU */ - /************************************************************************************/ + /************************************************************************************/ // Timing hipModuleLaunchKernel for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { auto start = std::chrono::high_resolution_clock::now(); - hipModuleLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, ¶ms, nullptr); + HIPCHECK(hipModuleLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, ¶ms, nullptr)); auto stop = std::chrono::high_resolution_clock::now(); results[i] = std::chrono::duration(stop - start).count(); } - print_timing("hipModuleLaunchKernel enqueue rate", results); + print_timing("hipModuleLaunchKernel enqueue time", results); + + HIPCHECK(hipDeviceSynchronize()); // Timing hipLaunchKernelGGL for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { @@ -101,41 +168,131 @@ int main() { auto stop = std::chrono::high_resolution_clock::now(); results[i] = std::chrono::duration(stop - start).count(); } - print_timing("hipLaunchKernelGGL enqueue rate", results); + print_timing("hipLaunchKernelGGL enqueue time", results); + + HIPCHECK(hipDeviceSynchronize()); + +#ifdef __HIP_PLATFORM_AMD__ + //Timing hipExtLaunchKernelGGL + for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { + auto start_chrono = std::chrono::high_resolution_clock::now(); + hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, start, stop, 0); + auto stop_chrono = std::chrono::high_resolution_clock::now(); + results[i] = std::chrono::duration(stop_chrono - start_chrono).count(); + } + print_timing("hipExtLaunchKernelGGL enqueue time", results); + + HIPCHECK(hipDeviceSynchronize()); +#endif + + //Timing hipExtLaunchKernelGGL + for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { + auto start_chrono = std::chrono::high_resolution_clock::now(); + hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, nullptr, nullptr, 0); + auto stop_chrono = std::chrono::high_resolution_clock::now(); + results[i] = std::chrono::duration(stop_chrono - start_chrono).count(); + } + print_timing("hipExtLaunchKernelGGL w/o events enqueue time", results); + + HIPCHECK(hipDeviceSynchronize()); /***********************************************************************************/ - /* Single dispatch execution latency using HIP events: */ - /* Measures latency to start & finish executing a kernel with GPU-scope visibility */ + /* Single dispatch execution latency using HIP events: */ + /* Measures latency to start & finish executing a kernel with GPU-scope visibility */ /***********************************************************************************/ + //Timing directly the dispatch +#ifdef __HIP_PLATFORM_AMD__ + for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { + hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, start, stop, 0); + HIPCHECK(hipEventSynchronize(stop)); + HIPCHECK(hipEventElapsedTime(&results[i], start, stop)); + } + print_timing("Timing directly single dispatch latency", results); +#endif + //Timing around the dispatch for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { - hipEventRecord(start, 0); + HIPCHECK(hipEventRecord(start, 0)); hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0); - hipEventRecord(stop, 0); - hipEventSynchronize(stop); - hipEventElapsedTime(&results[i], start, stop); + HIPCHECK(hipEventRecord(stop, 0)); + HIPCHECK(hipEventSynchronize(stop)); + HIPCHECK(hipEventElapsedTime(&results[i], start, stop)); } print_timing("Timing around single dispatch latency", results); + //Timing around the dispatch + for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { + auto start_chrono = std::chrono::high_resolution_clock::now(); + HIPCHECK(hipEventRecord(start, 0)); + hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0); + HIPCHECK(hipEventRecord(stop, 0)); + HIPCHECK(hipEventSynchronize(stop)); + auto stop_chrono = std::chrono::high_resolution_clock::now(); + results[i] = std::chrono::duration(stop_chrono - start_chrono).count(); + } + print_timing("Wall timing around single dispatch with events", results); + + for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { + auto start_chrono = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0); + HIPCHECK(hipStreamSynchronize(stream0)); + auto stop_chrono = std::chrono::high_resolution_clock::now(); + results[i] = std::chrono::duration(stop_chrono - start_chrono).count(); + } + print_timing("Wall timing around single dispatch without events", results); + +#ifdef __HIP_PLATFORM_AMD__ + //Timing around the dispatch with hipExtLaunchKernelGGL + for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { + auto start_chrono = std::chrono::high_resolution_clock::now(); + HIPCHECK(hipEventRecord(start, 0)); + hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, start, stop, 0); + HIPCHECK(hipEventSynchronize(stop)); + auto stop_chrono = std::chrono::high_resolution_clock::now(); + results[i] = std::chrono::duration(stop_chrono - start_chrono).count(); + } + print_timing("Wall timing around single dispatch ExtLaunch with events", results); + + //Timing around the dispatch with hipExtLaunchKernelGGL without events + for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { + auto start_chrono = std::chrono::high_resolution_clock::now(); + hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, nullptr, nullptr, 0); + HIPCHECK(hipStreamSynchronize(stream0)); + auto stop_chrono = std::chrono::high_resolution_clock::now(); + results[i] = std::chrono::duration(stop_chrono - start_chrono).count(); + } + print_timing("Wall timing around single dispatch ExtLaunch w/o events", results); +#endif + /*********************************************************************************/ /* Batch dispatch execution latency using HIP events: */ - /* Measures latency to start & finish executing each dispatch in a batch */ + /* Measures latency to start & finish executing each dispatch in a batch */ /*********************************************************************************/ for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { - hipEventRecord(start, 0); - for (int j = 0; j < BATCH_SIZE; j++) { - hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0); - } - hipEventRecord(stop, 0); - hipEventSynchronize(stop); - hipEventElapsedTime(&results[i], start, stop); + HIPCHECK(hipEventRecord(start, 0)); + for (int j = 0; j < BATCH_SIZE; j++) { + hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0); + } + HIPCHECK(hipEventRecord(stop, 0)); + HIPCHECK(hipEventSynchronize(stop)); + HIPCHECK(hipEventElapsedTime(&results[i], start, stop)); } print_timing("Batch dispatch latency", results, BATCH_SIZE); - hipEventDestroy(start); - hipEventDestroy(stop); - hipCtxDestroy(context); -} + for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { + auto start_chrono = std::chrono::high_resolution_clock::now(); + for (int j = 0; j < BATCH_SIZE; j++) { + hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0); + } + HIPCHECK(hipStreamSynchronize(stream0)); + auto stop_chrono = std::chrono::high_resolution_clock::now(); + results[i] = std::chrono::duration(stop_chrono - start_chrono).count(); + } + print_timing("Wall timing for batch dispatch latency", results, BATCH_SIZE); + HIPCHECK(hipEventDestroy(start)); + HIPCHECK(hipEventDestroy(stop)); + HIPCHECK(hipCtxDestroy(context)); +}