SWDEV-269604 - Revert "Update dispatch latency with wall time measurements"

This reverts commit 416d2371e6.

Hold wall time related updates till direct dispatch is ready.

Change-Id: I53b232f6f51bc2fc71b6b639fe0081e2907e9707


[ROCm/hip-tests commit: eff5d6cf6c]
Этот коммит содержится в:
Rahul Garg
2021-02-25 04:48:40 +00:00
родитель 4ca662b406
Коммит 034cb582d8
+40 -197
Просмотреть файл
@@ -25,97 +25,33 @@ THE SOFTWARE.
#include <chrono>
#include <algorithm>
#include <fstream>
#include <vector>
#include <iterator>
#include <string>
#define NUM_GROUPS 1
#define GROUP_SIZE 1
#define WARMUP_RUN_COUNT 100
#define TIMING_RUN_COUNT 1000
#define WARMUP_RUN_COUNT 10
#define TIMING_RUN_COUNT 100
#define TOTAL_RUN_COUNT WARMUP_RUN_COUNT + TIMING_RUN_COUNT
#define BATCH_SIZE 1000
#define FILE_NAME "test_kernel.code"
#define KERNEL_NAME "test"
#define HIPCHECK(error) \
{ \
hipError_t localError = error; \
if (localError != hipSuccess) { \
printf("error: '%s'(%d) from %s at %s:%d\n",hipGetErrorString(localError), \
localError, #error, __FILE__, __LINE__); \
fflush(NULL); \
abort(); \
} \
}
__global__ void EmptyKernel() { }
class CSVDump
{
std::string fName;
std::string delimeter;
int linesCount;
public:
CSVDump(std::string filename, std::string delm = ",") :
fName(filename), delimeter(delm), linesCount(1)
{}
template<typename T>
void addRow(std::string test, T first, T last);
void addStats(std::string test, float mean, float std, float min, float max);
};
template<typename T>
void CSVDump::addRow(std::string test, T first, T last)
{
std::fstream file;
file.open(fName, std::ios::out | (linesCount ? std::ios::app : std::ios::trunc));
file << test;
file <<delimeter;
for (; first != last; )
{
file << *first;
if (++first != last)
file << delimeter;
}
file << "\n";
linesCount++;
file.close();
}
void CSVDump::addStats(std::string test, float mean, float std, float min, float max)
{
std::fstream file;
file.open(fName, std::ios::out | (linesCount ? std::ios::app : std::ios::trunc));
file << test;
file <<delimeter;
file.precision(3);
file << mean <<delimeter <<std<<delimeter<<min<<delimeter<<max;
file << "\n";
linesCount++;
file.close();
}
void print_timing(std::string test, const std::array<float, TOTAL_RUN_COUNT> &results, int batch = 1) {
CSVDump writer("LaunchLatency.csv");
float total_us = 0.0f, mean_us = 0.0f, stddev_us = 0.0f, min_us = 0.0f, max_us = 0.0f;
float total_us = 0.0f, mean_us = 0.0f, stddev_us = 0.0f;
// skip warm-up runs
auto start_iter = std::next(results.begin(), WARMUP_RUN_COUNT);
auto end_iter = results.end();
//writer.addRow(test, start_iter, end_iter);
// mean
float min = std::numeric_limits<float>::max();
float max = std::numeric_limits<float>::min();
std::for_each(start_iter, end_iter, [&](const float &run_ms) {
total_us += (run_ms * 1000) / batch;
min = std::min(run_ms, min);
max = std::max(run_ms, max);
});
});
mean_us = total_us / TIMING_RUN_COUNT;
min_us = (min *1000) / batch;
max_us = (max *1000) / batch;
// stddev
// stddev
total_us = 0;
std::for_each(start_iter, end_iter, [&](const float &run_ms) {
float dev_us = ((run_ms * 1000) / batch) - mean_us;
@@ -123,43 +59,40 @@ void print_timing(std::string test, const std::array<float, TOTAL_RUN_COUNT> &re
});
stddev_us = sqrt(total_us / TIMING_RUN_COUNT);
writer.addStats(test, mean_us, stddev_us, min_us, max_us);
// display
printf("\n %s: %.1f us, std: %.1f us max: %.1f us min:%.1f us\n", test.c_str(), mean_us, stddev_us, max_us, min_us);
printf("\n %s: %.1f us, std: %.1f us\n", test.c_str(), mean_us, stddev_us);
}
int main() {
int main() {
hipStream_t stream0 = 0;
hipDevice_t device;
HIPCHECK(hipDeviceGet(&device, 0));
hipCtx_t context;
HIPCHECK(hipCtxCreate(&context, 0, device));
hipDeviceGet(&device, 0);
hipCtx_t context;
hipCtxCreate(&context, 0, device);
hipModule_t module;
hipFunction_t function;
HIPCHECK(hipModuleLoad(&module, FILE_NAME));
HIPCHECK(hipModuleGetFunction(&function, module, KERNEL_NAME));
hipModuleLoad(&module, FILE_NAME);
hipModuleGetFunction(&function, module, KERNEL_NAME);
void* params = nullptr;
std::array<float, TOTAL_RUN_COUNT> results;
hipEvent_t start, stop;
HIPCHECK(hipEventCreate(&start));
HIPCHECK(hipEventCreate(&stop));
hipEventCreate(&start);
hipEventCreate(&stop);
/************************************************************************************/
/* HIP kernel launch enqueue rate: */
/* Measure time taken to enqueue a kernel on the GPU */
/************************************************************************************/
/************************************************************************************/
// Timing hipModuleLaunchKernel
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
auto start = std::chrono::high_resolution_clock::now();
HIPCHECK(hipModuleLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, &params, nullptr));
hipModuleLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, &params, nullptr);
auto stop = std::chrono::high_resolution_clock::now();
results[i] = std::chrono::duration<float, std::milli>(stop - start).count();
}
print_timing("hipModuleLaunchKernel enqueue time", results);
HIPCHECK(hipDeviceSynchronize());
print_timing("hipModuleLaunchKernel enqueue rate", results);
// Timing hipLaunchKernelGGL
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
@@ -168,131 +101,41 @@ int main() {
auto stop = std::chrono::high_resolution_clock::now();
results[i] = std::chrono::duration<float, std::milli>(stop - start).count();
}
print_timing("hipLaunchKernelGGL enqueue time", results);
HIPCHECK(hipDeviceSynchronize());
#ifdef __HIP_PLATFORM_AMD__
//Timing hipExtLaunchKernelGGL
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
auto start_chrono = std::chrono::high_resolution_clock::now();
hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, start, stop, 0);
auto stop_chrono = std::chrono::high_resolution_clock::now();
results[i] = std::chrono::duration<float, std::milli>(stop_chrono - start_chrono).count();
}
print_timing("hipExtLaunchKernelGGL enqueue time", results);
HIPCHECK(hipDeviceSynchronize());
#endif
//Timing hipExtLaunchKernelGGL
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
auto start_chrono = std::chrono::high_resolution_clock::now();
hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, nullptr, nullptr, 0);
auto stop_chrono = std::chrono::high_resolution_clock::now();
results[i] = std::chrono::duration<float, std::milli>(stop_chrono - start_chrono).count();
}
print_timing("hipExtLaunchKernelGGL w/o events enqueue time", results);
HIPCHECK(hipDeviceSynchronize());
print_timing("hipLaunchKernelGGL enqueue rate", results);
/***********************************************************************************/
/* Single dispatch execution latency using HIP events: */
/* Measures latency to start & finish executing a kernel with GPU-scope visibility */
/* Single dispatch execution latency using HIP events: */
/* Measures latency to start & finish executing a kernel with GPU-scope visibility */
/***********************************************************************************/
//Timing directly the dispatch
#ifdef __HIP_PLATFORM_AMD__
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, start, stop, 0);
HIPCHECK(hipEventSynchronize(stop));
HIPCHECK(hipEventElapsedTime(&results[i], start, stop));
}
print_timing("Timing directly single dispatch latency", results);
#endif
//Timing around the dispatch
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
HIPCHECK(hipEventRecord(start, 0));
hipEventRecord(start, 0);
hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
HIPCHECK(hipEventRecord(stop, 0));
HIPCHECK(hipEventSynchronize(stop));
HIPCHECK(hipEventElapsedTime(&results[i], start, stop));
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
hipEventElapsedTime(&results[i], start, stop);
}
print_timing("Timing around single dispatch latency", results);
//Timing around the dispatch
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
auto start_chrono = std::chrono::high_resolution_clock::now();
HIPCHECK(hipEventRecord(start, 0));
hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
HIPCHECK(hipEventRecord(stop, 0));
HIPCHECK(hipEventSynchronize(stop));
auto stop_chrono = std::chrono::high_resolution_clock::now();
results[i] = std::chrono::duration<float, std::milli>(stop_chrono - start_chrono).count();
}
print_timing("Wall timing around single dispatch with events", results);
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
auto start_chrono = std::chrono::high_resolution_clock::now();
hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
HIPCHECK(hipStreamSynchronize(stream0));
auto stop_chrono = std::chrono::high_resolution_clock::now();
results[i] = std::chrono::duration<float, std::milli>(stop_chrono - start_chrono).count();
}
print_timing("Wall timing around single dispatch without events", results);
#ifdef __HIP_PLATFORM_AMD__
//Timing around the dispatch with hipExtLaunchKernelGGL
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
auto start_chrono = std::chrono::high_resolution_clock::now();
HIPCHECK(hipEventRecord(start, 0));
hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, start, stop, 0);
HIPCHECK(hipEventSynchronize(stop));
auto stop_chrono = std::chrono::high_resolution_clock::now();
results[i] = std::chrono::duration<float, std::milli>(stop_chrono - start_chrono).count();
}
print_timing("Wall timing around single dispatch ExtLaunch with events", results);
//Timing around the dispatch with hipExtLaunchKernelGGL without events
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
auto start_chrono = std::chrono::high_resolution_clock::now();
hipExtLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, nullptr, nullptr, 0);
HIPCHECK(hipStreamSynchronize(stream0));
auto stop_chrono = std::chrono::high_resolution_clock::now();
results[i] = std::chrono::duration<float, std::milli>(stop_chrono - start_chrono).count();
}
print_timing("Wall timing around single dispatch ExtLaunch w/o events", results);
#endif
/*********************************************************************************/
/* Batch dispatch execution latency using HIP events: */
/* Measures latency to start & finish executing each dispatch in a batch */
/* Measures latency to start & finish executing each dispatch in a batch */
/*********************************************************************************/
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
HIPCHECK(hipEventRecord(start, 0));
for (int j = 0; j < BATCH_SIZE; j++) {
hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
}
HIPCHECK(hipEventRecord(stop, 0));
HIPCHECK(hipEventSynchronize(stop));
HIPCHECK(hipEventElapsedTime(&results[i], start, stop));
hipEventRecord(start, 0);
for (int j = 0; j < BATCH_SIZE; j++) {
hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
}
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
hipEventElapsedTime(&results[i], start, stop);
}
print_timing("Batch dispatch latency", results, BATCH_SIZE);
for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
auto start_chrono = std::chrono::high_resolution_clock::now();
for (int j = 0; j < BATCH_SIZE; j++) {
hipLaunchKernelGGL((EmptyKernel), dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0);
}
HIPCHECK(hipStreamSynchronize(stream0));
auto stop_chrono = std::chrono::high_resolution_clock::now();
results[i] = std::chrono::duration<float, std::milli>(stop_chrono - start_chrono).count();
}
print_timing("Wall timing for batch dispatch latency", results, BATCH_SIZE);
HIPCHECK(hipEventDestroy(start));
HIPCHECK(hipEventDestroy(stop));
HIPCHECK(hipCtxDestroy(context));
hipEventDestroy(start);
hipEventDestroy(stop);
hipCtxDestroy(context);
}