/**
 * Computes the population standard deviation of a set of samples.
 *
 * @tparam T          Arithmetic element type of the sample container.
 * @param  allDeltaMs Samples to analyse.
 * @param  mean       Precomputed arithmetic mean of the samples.
 * @return Square root of the mean squared deviation from `mean`, in the same
 *         unit as the samples. The sum is divided by N (not N-1). Returns 0
 *         when there are no samples.
 */
template <typename T>
float calStdDev(const std::vector<T>& allDeltaMs, float mean)
{
  // An empty sample set would otherwise evaluate sqrt(0.0 / 0) and yield NaN.
  if (allDeltaMs.empty()) return 0.0f;

  // Accumulate the squared deviations in double precision in a single pass,
  // so no scratch vector is needed.
  const double center = mean;
  const double sqSum = std::accumulate(
      allDeltaMs.begin(), allDeltaMs.end(), 0.0,
      [center](double acc, const T& sample) {
        const double delta = static_cast<double>(sample) - center;
        return acc + delta * delta;
      });

  // The narrowing to the float return type is intentional, so make it explicit.
  return static_cast<float>(std::sqrt(sqSum / static_cast<double>(allDeltaMs.size())));
}
atoi(argv[1]) : 10); @@ -59,6 +101,8 @@ int main(int argc, char **argv) EmptyKernel<<>>(); } HIP_CALL(hipStreamSynchronize(stream)); + std::vector allGpuDeltaMsec(numIterations); + std::vector allCpuDeltaMsec(numIterations); // Launch empty kernel // NOTE: Timing is done per-iteration, instead of batching multiple iterations @@ -83,6 +127,8 @@ int main(int argc, char **argv) // Report timing printf("Iteration %03d Kernel Launch Time (usec) %10.5f (CPU) %10.5f (GPU)\n", iteration, cpuDeltaMsec *1000.0, gpuDeltaMsec * 1000.0); + allGpuDeltaMsec[iteration] = gpuDeltaMsec * 1000.0; + allCpuDeltaMsec[iteration] = cpuDeltaMsec * 1000.0; cpuSum += cpuDeltaMsec * 1000.0; gpuSum += gpuDeltaMsec * 1000.0; } @@ -91,8 +137,17 @@ int main(int argc, char **argv) // Report averages double avgCpuUsec = cpuSum / numIterations; double avgGpuUsec = gpuSum / numIterations; - printf("Average Kernel Launch time (usec) %10.5f (CPU) %10.5f (GPU)\n", avgCpuUsec, avgGpuUsec); + auto minCpuUsec = std::min_element(std::begin(allCpuDeltaMsec), std::end(allCpuDeltaMsec)); + auto minGpuUsec = std::min_element(std::begin(allGpuDeltaMsec), std::end(allGpuDeltaMsec)); + auto maxCpuUsec = std::max_element(std::begin(allCpuDeltaMsec), std::end(allCpuDeltaMsec)); + auto maxGpuUsec = std::max_element(std::begin(allGpuDeltaMsec), std::end(allGpuDeltaMsec)); + auto varCpuUsec = calStdDev(allCpuDeltaMsec, avgCpuUsec); + auto varGpuUsec = calStdDev(allGpuDeltaMsec, avgGpuUsec); + printf("Average Kernel Launch time (usec) %10.5f (CPU) %10.5f (GPU)\n", avgCpuUsec, avgGpuUsec); + printf("Minimum Kernel Launch time (usec) %10.5f (CPU) %10.5f (GPU)\n", *minCpuUsec, *minGpuUsec); + printf("Maximum Kernel Launch time (usec) %10.5f (CPU) %10.5f (GPU)\n", *maxCpuUsec, *maxGpuUsec); + printf("Stddev Kernel Launch time (usec) %10.5f (CPU) %10.5f (GPU)\n", varCpuUsec, varGpuUsec); // Cleanup events and stream HIP_CALL(hipStreamDestroy(stream)); HIP_CALL(hipEventDestroy(startEvent)); diff --git 
# Toolchain install locations; both can be overridden from the environment
# or on the make command line.
ROCM_PATH ?= /opt/rocm
CUDA_PATH ?= /usr/local/cuda

HIPCC=$(ROCM_PATH)/bin/hipcc
NVCC=$(CUDA_PATH)/bin/nvcc

# Build EmptyKernelTestCuda if nvcc exists at $(NVCC); otherwise build the HIP version
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
  EXE=./EmptyKernelTestCuda
else
  EXE=./EmptyKernelTest
endif

# all and clean name actions, not files, so they must run even if a file
# with the same name exists.
.PHONY: all clean

all: $(EXE)

./EmptyKernelTest: EmptyKernelTest.cpp
	$(HIPCC) $< -o $@

# -x cu makes nvcc treat the .cpp source as CUDA code.
./EmptyKernelTestCuda: EmptyKernelTest.cpp
	$(NVCC) $< -x cu -o $@

clean:
	rm -f ./EmptyKernelTest ./EmptyKernelTestCuda