diff --git a/projects/hip-tests/samples/1_Utils/hipDispatchLatency/Makefile b/projects/hip-tests/samples/1_Utils/hipDispatchLatency/Makefile index 0616f01f0d..74945dc515 100644 --- a/projects/hip-tests/samples/1_Utils/hipDispatchLatency/Makefile +++ b/projects/hip-tests/samples/1_Utils/hipDispatchLatency/Makefile @@ -4,16 +4,17 @@ ifeq (,$(HIP_PATH)) endif HIPCC=$(HIP_PATH)/bin/hipcc -std=c++11 -EXE=hipDispatchLatency - CXXFLAGS = -O3 -all: test_kernel.code ${EXE} +all: test_kernel.code hipDispatchLatency.out hipDispatchEnqueueRateMT.out -$(EXE): hipDispatchLatency.cpp +hipDispatchLatency.out: hipDispatchLatency.cpp $(HIPCC) $(CXXFLAGS) hipDispatchLatency.cpp -o $@ +hipDispatchEnqueueRateMT.out: hipDispatchEnqueueRateMT.cpp + $(HIPCC) $(CXXFLAGS) hipDispatchEnqueueRateMT.cpp -o $@ + test_kernel.code: test_kernel.cpp $(HIP_PATH)/bin/hipcc --genco $(GENCO_FLAGS) $^ -o $@ clean: - rm -f *.o $(EXE) + rm -f *.o *.out diff --git a/projects/hip-tests/samples/1_Utils/hipDispatchLatency/hipDispatchEnqueueRateMT.cpp b/projects/hip-tests/samples/1_Utils/hipDispatchLatency/hipDispatchEnqueueRateMT.cpp new file mode 100644 index 0000000000..d1b5c2f3b5 --- /dev/null +++ b/projects/hip-tests/samples/1_Utils/hipDispatchLatency/hipDispatchEnqueueRateMT.cpp @@ -0,0 +1,167 @@ +/* +Copyright (c) 2020-present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include "hip/hip_runtime.h" +#ifdef __HIP_PLATFORM_HCC__ +#include "hip/hip_ext.h" +#endif +#include +#include +#include +#include +#include +#include +#include + +#define NUM_GROUPS 1 +#define GROUP_SIZE 1 +#define WARMUP_RUN_COUNT 10 +#define TIMING_RUN_COUNT 100 +#define TOTAL_RUN_COUNT WARMUP_RUN_COUNT + TIMING_RUN_COUNT + +__global__ void EmptyKernel() {} + +// Helper to print various timing metrics +void print_timing(std::string test, std::array &results, int batch = 1) +{ + + float total_us = 0.0f, mean_us = 0.0f, stddev_us = 0.0f; + + // remove top outliers due to nature of variability across large number of multi-threaded runs + std::sort(results.begin(), results.end(), std::greater()); + auto start_iter = std::next(results.begin(), WARMUP_RUN_COUNT); + auto end_iter = results.end(); + + // mean + std::for_each(start_iter, end_iter, [&](const float &run_ms) { + total_us += (run_ms * 1000) / batch; + }); + mean_us = total_us / TIMING_RUN_COUNT; + + // stddev + total_us = 0; + std::for_each(start_iter, end_iter, [&](const float &run_ms) { + float dev_us = ((run_ms * 1000) / batch) - mean_us; + total_us += dev_us * dev_us; + }); + stddev_us = sqrt(total_us / TIMING_RUN_COUNT); + + printf("\n %s: %.1f us, std: %.1f us\n", test.c_str(), mean_us, stddev_us); +} + +// Measure time taken to enqueue a kernel on the GPU using hipModuleLaunchKernel +void hipModuleLaunchKernel_enqueue_rate(std::atomic_int* shared, int max_threads) +{ + //resources necessary for this thread + hipStream_t stream; + hipStreamCreate(&stream); + hipModule_t module; + hipFunction_t function; + hipModuleLoad(&module, "test_kernel.code"); + hipModuleGetFunction(&function, module, "test"); + void* kernel_params = nullptr; + std::array results; + + //synchronize all threads, before running + int tid = shared->fetch_add(1, std::memory_order_release); + while (max_threads != shared->load(std::memory_order_acquire)) {} + + for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + hipModuleLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, stream, &kernel_params, nullptr); + auto stop = std::chrono::high_resolution_clock::now(); + results[i] = std::chrono::duration(stop - start).count(); + } + print_timing("Thread ID : " + std::to_string(tid) + " , " + "hipModuleLaunchKernel enqueue rate", results); +} + +// Measure time taken to enqueue a kernel on the GPU using hipLaunchKernelGGL +void hipLaunchKernelGGL_enqueue_rate(std::atomic_int* shared, int max_threads) +{ + //resources necessary for this thread + hipStream_t stream; + hipStreamCreate(&stream); + std::array results; + + //synchronize all threads, before running + int tid = shared->fetch_add(1, std::memory_order_release); + while (max_threads != shared->load(std::memory_order_acquire)) {} + + for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream); + auto stop = std::chrono::high_resolution_clock::now(); + results[i] = std::chrono::duration(stop - start).count(); + } + print_timing("Thread ID : " + std::to_string(tid) + " , " + "hipLaunchKernelGGL enqueue rate", results); +} + +// Simple thread pool +struct thread_pool { + thread_pool(int total_threads) : max_threads(total_threads) {} + void start(std::function f) { + for (int i = 0; i < max_threads; ++i) { + threads.push_back(std::async(std::launch::async, f, &shared, max_threads)); + } + } + void finish() { + for (auto&&thread : threads) { + thread.get(); + } + threads.clear(); + shared = {0}; + } + ~thread_pool() { + finish(); + } +private: + std::atomic_int shared {0}; + std::vector> threads; + int max_threads = 1; +}; + + +int main(int argc, char* argv[]) +{ + if (argc != 3) { + std::cerr << "Run test as 'hipDispatchEnqueueRateMT <0-hipModuleLaunchKernel /1-hipLaunchKernelGGL>'\n"; + return -1; + } + + int max_threads = atoi(argv[1]); + int run_module_test = atoi(argv[2]); + if(max_threads < 1 || run_module_test < 0 || run_module_test > 1) { + std::cerr << "Invalid Input.\n"; + std::cerr << "Run test as 'hipDispatchEnqueueRateMT <0-hipModuleLaunchKernel /1-hipLaunchKernelGGL>'\n"; + return -1; + } + thread_pool task(max_threads); + + if(run_module_test == 0) { + task.start(hipModuleLaunchKernel_enqueue_rate); + task.finish(); + } else { + task.start(hipLaunchKernelGGL_enqueue_rate); + task.finish(); + } + + return 0; +} +