From 6ef3a48e27769959cdc5365c5c88dcd0110f18b6 Mon Sep 17 00:00:00 2001 From: Sourabh Betigeri Date: Sat, 12 Sep 2020 01:00:17 -0700 Subject: [PATCH] Added performance test for Device concurrency Change-Id: I30f4e65467613152ce86b80b13e1c9f2928da801 --- perftests/stream/hipPerfDeviceConcurrency.cpp | 289 ++++++++++++++++++ 1 file changed, 289 insertions(+) create mode 100644 perftests/stream/hipPerfDeviceConcurrency.cpp diff --git a/perftests/stream/hipPerfDeviceConcurrency.cpp b/perftests/stream/hipPerfDeviceConcurrency.cpp new file mode 100644 index 0000000000..7d6699a9a2 --- /dev/null +++ b/perftests/stream/hipPerfDeviceConcurrency.cpp @@ -0,0 +1,289 @@ +/* + Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved. + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + */ + +/* HIT_START + * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * TEST: %t + * HIT_END + */ + +#include +#include +#include "test_common.h" + +typedef struct { + double x; + double y; + double width; +} coordRec; + +static coordRec coords[] = { + {0.0, 0.0, 0.00001}, // All black +}; + +static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); + +__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos, float xStep, + float yStep, uint maxIter) { + + int tid = (blockIdx.x * blockDim.x + threadIdx.x); + int i = tid % width; + int j = tid / width; + float x0 = (float)(xPos + xStep*i); + float y0 = (float)(yPos + yStep*j); + + float x = x0; + float y = y0; + + uint iter = 0; + float tmp; + for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { + tmp = x; + x = fma(-y,y,fma(x,x,x0)); + y = fma(2.0f*tmp,y,y0); + } + + out[tid] = iter; +}; + +class hipPerfDeviceConcurrency { + public: + hipPerfDeviceConcurrency(); + ~hipPerfDeviceConcurrency(); + + void setNumGpus(unsigned int num) { + numDevices = num; + } + unsigned int getNumGpus() { + return numDevices; + } + + void open(void); + void close(void); + void run(unsigned int testCase, int numGpus); + + private: + void setData(void *ptr, unsigned int value); + void checkData(uint *ptr); + + unsigned int numDevices; + unsigned int width_; + unsigned int bufSize; + unsigned int coordIdx; + unsigned long long totalIters = 0; +}; + + +hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() {} + +hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() {} + +void hipPerfDeviceConcurrency::open(void) { + + + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + setNumGpus(nGpu); + if (nGpu < 1) { + std::cout << "info: didn't find any GPU! skipping the test!\n"; + passed(); + } + + +} + + +void hipPerfDeviceConcurrency::close() { +} + +void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { + + + static int deviceId; + uint * hPtr[numGpus]; + uint * dPtr[numGpus]; + hipStream_t streams[numGpus]; + int numCUs[numGpus]; + unsigned int maxIter[numGpus]; + unsigned long long expectedIters[numGpus]; + + int threads, threads_per_block, blocks; + float xStep, yStep, xPos, yPos; + + for(int i = 0; i < numGpus; i++) { + + if(testCase != 0) { + deviceId = i; + } + + HIPCHECK(hipSetDevice(deviceId)); + + hipDeviceProp_t props = {0}; + HIPCHECK(hipGetDeviceProperties(&props, i)); + + if (testCase != 0) { + std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name + << " with " << props.multiProcessorCount << " CUs" << " and device ID: " + << i << std::endl; + } + + numCUs[i] = props.multiProcessorCount; + int clkFrequency = 0; + HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i)); + + clkFrequency =(unsigned int)clkFrequency/1000; + + // Maximum iteration count + // maxIter = 8388608 * (engine_clock / 1000).serial execution + maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128); + maxIter[i] = (maxIter[i] + 15) & ~15; + + // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once. + width_ = 256; + + bufSize = width_ * width_ * sizeof(uint); + + // Create streams for concurrency + HIPCHECK(hipStreamCreate(&streams[i])); + + // Allocate memory on the host and device + HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault)); + setData(hPtr[i], 0xdeadbeef); + HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize)) + + // Prepare kernel launch parameters + threads = (bufSize/sizeof(uint)); + threads_per_block = 64; + blocks = (threads/threads_per_block) + (threads % threads_per_block); + + coordIdx = testCase % numCoords; + xStep = (float)(coords[coordIdx].width / (double)width_); + yStep = (float)(-coords[coordIdx].width / (double)width_); + xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); + yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); + + // Copy memory from host to device + HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice)); + + } + + // Time the kernel execution + auto all_start = std::chrono::steady_clock::now(); + + for(int i = 0; i < numGpus; i++) { + + if(testCase != 0) { + deviceId = i; + } + + HIPCHECK(hipSetDevice(deviceId)); + + hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i], + dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]); + + } + + for(int i = 0; i < numGpus; i++) { + HIPCHECK(hipStreamSynchronize(0)); + } + + + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration all_kernel_time = all_end - all_start; + + for(int i = 0; i < numGpus; i++) { + + if(testCase != 0) { + deviceId = i; + } + HIPCHECK(hipSetDevice(deviceId)); + + // Copy data back from device to the host + HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost)); + + checkData(hPtr[i]); + expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i]; + + if (testCase != 0) { + checkData(hPtr[i]); + if(totalIters != expectedIters[i]) { + std::cout << "Incorrect iteration count detected" << std::endl; + } + } + + + HIPCHECK(hipStreamDestroy(streams[i])); + + // Free host and device memory + HIPCHECK(hipFree(hPtr[i])); + HIPCHECK(hipFree(dPtr[i])); + } + + if (testCase != 0) { + std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): " + << all_kernel_time.count() << " (s) " << '\n' << std::endl; + } + + if(testCase == 0) { + deviceId++; + } + + +} + + +void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) { + unsigned int *ptr2 = (unsigned int *)ptr; + for (unsigned int i = 0; i < width_ * width_ ; i++) { + ptr2[i] = value; + } +} + + +void hipPerfDeviceConcurrency::checkData(uint *ptr) { + totalIters = 0; + for (unsigned int i = 0; i < width_ * width_; i++) { + totalIters += ptr[i]; + } +} + + +int main(int argc, char* argv[]) { + hipPerfDeviceConcurrency deviceConcurrency; + + deviceConcurrency.open(); + + int nGpu = deviceConcurrency.getNumGpus(); + + // testCase = 0 refers to warmup kernel run + int testCase = 0; + + for (int i = 0; i < nGpu; i++) { + // Warm-up kernel on all devices + deviceConcurrency.run(testCase, 1); + } + + // Time for kernel on 1 device + deviceConcurrency.run(++testCase, 1); + + // Time for kernel on all available devices + deviceConcurrency.run(++testCase, nGpu); + + passed(); +}