diff --git a/perftests/compute/hipPerfMandelbrot.cpp b/perftests/compute/hipPerfMandelbrot.cpp new file mode 100644 index 0000000000..c4234d8c37 --- /dev/null +++ b/perftests/compute/hipPerfMandelbrot.cpp @@ -0,0 +1,747 @@ +/* + Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved. + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + */ + +/* HIT_START + * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * TEST: %t + * HIT_END + */ + +#include +#include +#include "test_common.h" +#include +#include +#include +#include +#include + +typedef struct { + double x; + double y; + double width; +} coordRec; + +coordRec coords[] = { + {0.0, 0.0, 4.0}, // Whole set + {0.0, 0.0, 0.00001}, // All black + {-0.0180789661868, 0.6424294066162, 0.00003824140}, // Hit detail +}; + +static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); + +template +__global__ void float_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep, + uint maxIter) { + +#pragma FP_CONTRACT ON + int tid = (blockIdx.x * blockDim.x + threadIdx.x); + int i = tid % width; + int j = tid / width; + float x0 = (float)(xPos + xStep*i); + float y0 = (float)(yPos + yStep*j); + + float x = x0; + float y = y0; + + uint iter = 0; + float tmp; + for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { + tmp = x; + x = fma(-y,y,fma(x,x,x0)); + y = fma(2.0f*tmp,y,y0); + } + + out[tid] = iter; +}; + +template +__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos, + T yPos, T xStep, T yStep, uint maxIter) { + +#pragma FP_CONTRACT ON + int tid = (blockIdx.x * blockDim.x + threadIdx.x); + int i = tid % width; + int j = tid / width; + float x0 = (float)(xPos + xStep*(float)i); + float y0 = (float)(yPos + yStep*(float)j); + + float x = x0; + float y = y0; + +#define FAST + uint iter = 0; + float tmp; + int stay; + int ccount = 0; + stay = (x*x+y*y) <= 4.0; + float savx = x; + float savy = y; +#ifdef FAST + for (iter = 0; (iter < maxIter); iter+=16) { +#else + for (iter = 0; stay && (iter < maxIter); iter+=16) { +#endif + x = savx; + y = savy; + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + stay = (x*x+y*y) <= 4.0; + savx = (stay ? x : savx); + savy = (stay ? y : savy); + ccount += stay*16; +#ifdef FAST + if (!stay) + break; +#endif + } + // Handle remainder + if (!stay) { + iter = 16; + do { + x = savx; + y = savy; + stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter); + tmp = x; + x = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*tmp,y,y0); + ccount += stay; + iter--; + savx = (stay ? x : savx); + savy = (stay ? y : savy); + } while (stay && iter); + } + + + out[tid] = (uint)ccount; + +}; + + +template +__global__ void double_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep, + uint maxIter) { + +#pragma FP_CONTRACT ON + int tid = (blockIdx.x * blockDim.x + threadIdx.x); + int i = tid % width; + int j = tid / width; + double x0 = (double)(xPos + xStep*i); + double y0 = (double)(yPos + yStep*j); + + double x = x0; + double y = y0; + + uint iter = 0; + double tmp; + for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { + tmp = x; + x = fma(-y,y,fma(x,x,x0)); + y = fma(2.0f*tmp,y,y0); + } + out[tid] = iter; +}; + + +template +__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos, + T yPos, T xStep, T yStep, uint maxIter) { + +#pragma FP_CONTRACT ON + int tid = (blockIdx.x * blockDim.x + threadIdx.x); + + int i = tid % width; + int j = tid / width; + double x0 = (double)(xPos + xStep*(double)i); + double y0 = (double)(yPos + yStep*(double)j); + + double x = x0; + double y = y0; + +#define FAST + uint iter = 0; + double tmp; + int stay; + int ccount = 0; + stay = (x*x+y*y) <= 4.0; + double savx = x; + double savy = y; +#ifdef FAST + for (iter = 0; (iter < maxIter); iter+=16) +#else + for (iter = 0; stay && (iter < maxIter); iter+=16) +#endif + { + x = savx; + y = savy; + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + // Two iterations + tmp = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*x,y,y0); + x = fma(-y,y, fma(tmp,tmp,x0)); + y = fma(2.0f*tmp,y,y0); + + stay = (x*x+y*y) <= 4.0; + savx = (stay ? x : savx); + savy = (stay ? y : savy); + ccount += stay*16; +#ifdef FAST + if (!stay) + break; +#endif + } + // Handle remainder + if (!stay) { + iter = 16; + do { + x = savx; + y = savy; + stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter); + tmp = x; + x = fma(-y,y, fma(x,x,x0)); + y = fma(2.0f*tmp,y,y0); + ccount += stay; + iter--; + savx = (stay ? x : savx); + savy = (stay ? y : savy); + } + while (stay && iter); + + } + out[tid] = (uint)ccount; +}; + +static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15; + +// Expected results for each kernel run at each coord +unsigned long long expectedIters[] = { + 203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull, + 120254651ull, 203277748ull, 2147483648ull, 120254651ull, 203315114ull, + 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull, + 203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull, + 120485704ull, 203280620ull, 2147483648ull, 120485704ull, 203315114ull, + 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull}; + +class hipPerfMandelBrot { + public: + hipPerfMandelBrot(); + ~hipPerfMandelBrot(); + + void setNumKernels(unsigned int num) { + numKernels = num; + } + + unsigned int getNumKernels() { + return numKernels; + } + + void setNumStreams(unsigned int num) { + numStreams = num; + } + unsigned int getNumStreams() { + return numStreams; + } + + void open(int deviceID); + void run(unsigned int testCase, unsigned int deviceId); + void printResults(void); + + // array of funtion pointers + typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos, float yPos, + float xStep, float yStep, uint maxIter, hipStream_t* streams, int blocks, + int threads_per_block, int kernelCnt); + + // Wrappers + void float_mad(uint *out, uint width, float xPos, float yPos, + float xStep, float yStep, uint maxIter, hipStream_t* streams, + int blocks, int threads_per_block, int kernelCnt); + + void float_mandel_unroll(uint *out, uint width, float xPos, float yPos, + float xStep, float yStep, uint maxIter, hipStream_t* streams, + int blocks, int threads_per_block, int kernelCnt); + + void double_mad(uint *out, uint width, float xPos, float yPos, float xStep, + float yStep, uint maxIter, hipStream_t* streams, int blocks, + int threads_per_block, int kernelCnt); + + void double_mandel_unroll(uint *out, uint width, float xPos, float yPos, float xStep, + float yStep, uint maxIter, hipStream_t* streams, int blocks, + int threads_per_block, int kernelCnt); + + hipStream_t streams[2]; + + private: + void setData(void *ptr, unsigned int value); + void checkData(uint *ptr); + + unsigned int numKernels; + unsigned int numStreams; + + std::map> results; + unsigned int width_; + unsigned int bufSize; + unsigned int maxIter; + unsigned int coordIdx; + volatile unsigned long long totalIters = 0; + int numCUs; + static const unsigned int numLoops = 10; +}; + + +hipPerfMandelBrot::hipPerfMandelBrot() {} + +hipPerfMandelBrot::~hipPerfMandelBrot() {} + +void hipPerfMandelBrot::open(int deviceId) { + + + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + if (nGpu < 1) { + std::cout << "info: didn't find any GPU! skipping the test!\n"; + passed(); + return; + } + + + HIPCHECK(hipSetDevice(deviceId)); + hipDeviceProp_t props = {0}; + HIPCHECK(hipGetDeviceProperties(&props, deviceId)); + std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name + << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId + << std::endl; + + numCUs = props.multiProcessorCount; +} + + +void hipPerfMandelBrot::printResults() { + + int numkernels = getNumKernels(); + int numStreams = getNumStreams(); + + std::cout << "\n" <<"Measured perf for kernels in GFLOPS on " + << numStreams << " streams (s)" << std::endl; + + std::map>:: iterator itr; + for (itr = results.begin(); itr != results.end(); itr++) { + std::cout << "\n" << std::setw(20) << itr->first << " "; + for(auto i : results[itr->first]) { + std::cout << std::setw(10) << i << " "; + } + } + results.clear(); + + std::cout << std::endl; +} + + +// Wrappers for the kernel launches +void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos, float yPos, float xStep, + float yStep, uint maxIter, hipStream_t* streams, + int blocks, int threads_per_block, int kernelCnt) { + + int streamCnt = getNumStreams(); + hipLaunchKernelGGL(float_mad_kernel, dim3(blocks), dim3(threads_per_block), 0, + streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, + maxIter); + + +} + + +void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos, float yPos, + float xStep, float yStep, uint maxIter, hipStream_t * streams, + int blocks, int threads_per_block, int kernelCnt) { + + int streamCnt = getNumStreams(); + hipLaunchKernelGGL(float_mandel_unroll_kernel, dim3(blocks), dim3(threads_per_block), 0, + streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter); + +} + + +void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos, float yPos, + float xStep, float yStep, uint maxIter, hipStream_t * streams, + int blocks, int threads_per_block, int kernelCnt) { + + int streamCnt = getNumStreams(); + hipLaunchKernelGGL(double_mad_kernel, dim3(blocks), dim3(threads_per_block), 0, + streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter); + +} + + +void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos, float yPos, + float xStep, float yStep, uint maxIter, hipStream_t * streams, + int blocks, int threads_per_block, int kernelCnt) { + + int streamCnt = getNumStreams(); + hipLaunchKernelGGL(float_mandel_unroll_kernel, dim3(blocks), dim3(threads_per_block), 0, + streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter); + +} + + +void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) { + + unsigned int numStreams = getNumStreams(); + + funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll, + &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll}; + + // Maximum iteration count + maxIter = 32768; + + uint * hPtr[numKernels]; + uint * dPtr[numKernels]; + + // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once. + width_ = 256; + + bufSize = width_ * width_ * sizeof(uint); + + // Create streams for concurrency + for (uint i = 0; i < numStreams; i++) { + HIPCHECK(hipStreamCreate(&streams[i])); + } + + + // Allocate memory on the host and device + for (uint i = 0; i < numKernels; i++) { + HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault)); + setData(hPtr[i], 0xdeadbeef); + HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize)) + } + + + // Prepare kernel launch parameters + int threads = (bufSize/sizeof(uint)); + int threads_per_block = 64; + int blocks = (threads/threads_per_block) + (threads % threads_per_block); + + float xStep = (float)(coords[coordIdx].width / (double)width_); + float yStep = (float)(-coords[coordIdx].width / (double)width_); + float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); + float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); + + // Copy memory asynchronously and concurrently from host to device + for (uint i = 0; i < numKernels; i++) { + HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice)); + } + + // Synchronize to make sure all the copies are completed + HIPCHECK(hipStreamSynchronize(0)); + + int kernelIdx; + if(testCase == 0 || testCase == 5 || testCase == 10) { + kernelIdx = 0; + } + + else if(testCase == 1 || testCase == 6 || testCase == 11) { + kernelIdx = 1; + } + else if(testCase == 2 || testCase == 7 || testCase == 12) { + kernelIdx = 2; + } + else if(testCase == 3 || testCase == 8 || testCase == 13){ + kernelIdx = 3; + } + + + double totalTime = 0.0; + + for (unsigned int k = 0; k < numLoops; k++) { + + coordIdx = testCase % numCoords; + + if ((testCase == 0 || testCase == 1 || testCase == 2 || + testCase == 5 || testCase == 6 || testCase == 7 || + testCase == 10 || testCase == 11 || testCase == 12)) { + float xStep = (float)(coords[coordIdx].width / (double)width_); + float yStep = (float)(-coords[coordIdx].width / (double)width_); + float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); + float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); + + // Time the kernel execution + auto all_start = std::chrono::steady_clock::now(); + + for (uint i = 0; i < numKernels; i++) { + (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks, + threads_per_block, i); + } + + + // Synchronize all the concurrent streams to have completed execution + HIPCHECK(hipStreamSynchronize(0)); + + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration all_kernel_time = all_end - all_start; + totalTime += all_kernel_time.count(); + + } + + + else { + double xStep = coords[coordIdx].width / (double)width_; + double yStep = -coords[coordIdx].width / (double)width_; + double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width; + double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width; + + // Time the kernel execution + auto all_start = std::chrono::steady_clock::now(); + + for (uint i = 0; i < numKernels; i++) { + (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks, + threads_per_block, i); + } + + + // Synchronize all the concurrent streams to have completed execution + HIPCHECK(hipStreamSynchronize(0)); + + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration all_kernel_time = all_end - all_start; + totalTime += all_kernel_time.count(); + } + + + } + + // Copy data back from device to the host + for(uint i = 0; i < numKernels; i++) { + HIPCHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost)); + } + + + for(uint i = 0; i < numKernels; i++) { + checkData(hPtr[i]); + + int j =0; + while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) { + j++; + } + + if(j==30) { + std::cout << "Incorrect iteration count detected. "; + } + + } + + + // Compute GFLOPS. There are 7 FLOPs per iteration + double perf = ((double)(totalIters*numKernels) * 7 * (double)(1e-09)) / + (totalTime / (double)numLoops); + + + std::vector kernelName = {"float", "float_unroll", + "double", "double_unroll"}; + + // Print results except for Warm-up kernel + if(testCase!=100) { + results[kernelName[testCase % 4]].push_back(perf); + } + + + for(uint i = 0 ; i < numStreams; i++) { + HIPCHECK(hipStreamDestroy(streams[i])); + } + + + // Free host and device memory + for (uint i = 0; i < numKernels; i++) { + HIPCHECK(hipFree(hPtr[i])); + HIPCHECK(hipFree(dPtr[i])); + } + + +} + + +void hipPerfMandelBrot::setData(void *ptr, unsigned int value) { + unsigned int *ptr2 = (unsigned int *)ptr; + for (unsigned int i = 0; i < width_ * width_; i++) { + ptr2[i] = value; + } +} + + +void hipPerfMandelBrot::checkData(uint *ptr) { + totalIters = 0; + for (unsigned int i = 0; i < width_ * width_; i++) { + totalIters += ptr[i]; + } +} + + +int main(int argc, char* argv[]) { + hipPerfMandelBrot mandelbrotCompute; + int deviceId = 0; + + mandelbrotCompute.open(deviceId); + + for (unsigned int testCase = 0; testCase < 3; testCase++) { + + + switch (testCase) { + + + case 0: { + // Warmup-kernel - default stream executes serially + mandelbrotCompute.setNumStreams(1); + mandelbrotCompute.setNumKernels(1); + mandelbrotCompute.run(100/*Random number*/, deviceId); + break; + } + + + case 1: { + // run all - sync + int i = 0; + do { + mandelbrotCompute.setNumStreams(1); + mandelbrotCompute.setNumKernels(1); + mandelbrotCompute.run(i, deviceId); + i++; + }while(i < 12); + mandelbrotCompute.printResults(); + + break; + } + + + case 2: { + // run all - async + int i = 0; + do { + mandelbrotCompute.setNumStreams(2); + mandelbrotCompute.setNumKernels(2); + mandelbrotCompute.run(i, deviceId); + i++; + }while(i < 12); + mandelbrotCompute.printResults(); + + break; + + } + + + default: { + break; + } + + + } + + + + } + + + passed(); +} diff --git a/perftests/stream/hipPerfDeviceConcurrency.cpp b/perftests/stream/hipPerfDeviceConcurrency.cpp new file mode 100644 index 0000000000..7d6699a9a2 --- /dev/null +++ b/perftests/stream/hipPerfDeviceConcurrency.cpp @@ -0,0 +1,289 @@ +/* + Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved. + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + */ + +/* HIT_START + * BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * TEST: %t + * HIT_END + */ + +#include +#include +#include "test_common.h" + +typedef struct { + double x; + double y; + double width; +} coordRec; + +static coordRec coords[] = { + {0.0, 0.0, 0.00001}, // All black +}; + +static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); + +__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos, float xStep, + float yStep, uint maxIter) { + + int tid = (blockIdx.x * blockDim.x + threadIdx.x); + int i = tid % width; + int j = tid / width; + float x0 = (float)(xPos + xStep*i); + float y0 = (float)(yPos + yStep*j); + + float x = x0; + float y = y0; + + uint iter = 0; + float tmp; + for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { + tmp = x; + x = fma(-y,y,fma(x,x,x0)); + y = fma(2.0f*tmp,y,y0); + } + + out[tid] = iter; +}; + +class hipPerfDeviceConcurrency { + public: + hipPerfDeviceConcurrency(); + ~hipPerfDeviceConcurrency(); + + void setNumGpus(unsigned int num) { + numDevices = num; + } + unsigned int getNumGpus() { + return numDevices; + } + + void open(void); + void close(void); + void run(unsigned int testCase, int numGpus); + + private: + void setData(void *ptr, unsigned int value); + void checkData(uint *ptr); + + unsigned int numDevices; + unsigned int width_; + unsigned int bufSize; + unsigned int coordIdx; + unsigned long long totalIters = 0; +}; + + +hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() {} + +hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() {} + +void hipPerfDeviceConcurrency::open(void) { + + + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + setNumGpus(nGpu); + if (nGpu < 1) { + std::cout << "info: didn't find any GPU! skipping the test!\n"; + passed(); + } + + +} + + +void hipPerfDeviceConcurrency::close() { +} + +void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { + + + static int deviceId; + uint * hPtr[numGpus]; + uint * dPtr[numGpus]; + hipStream_t streams[numGpus]; + int numCUs[numGpus]; + unsigned int maxIter[numGpus]; + unsigned long long expectedIters[numGpus]; + + int threads, threads_per_block, blocks; + float xStep, yStep, xPos, yPos; + + for(int i = 0; i < numGpus; i++) { + + if(testCase != 0) { + deviceId = i; + } + + HIPCHECK(hipSetDevice(deviceId)); + + hipDeviceProp_t props = {0}; + HIPCHECK(hipGetDeviceProperties(&props, i)); + + if (testCase != 0) { + std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name + << " with " << props.multiProcessorCount << " CUs" << " and device ID: " + << i << std::endl; + } + + numCUs[i] = props.multiProcessorCount; + int clkFrequency = 0; + HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i)); + + clkFrequency =(unsigned int)clkFrequency/1000; + + // Maximum iteration count + // maxIter = 8388608 * (engine_clock / 1000).serial execution + maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128); + maxIter[i] = (maxIter[i] + 15) & ~15; + + // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once. + width_ = 256; + + bufSize = width_ * width_ * sizeof(uint); + + // Create streams for concurrency + HIPCHECK(hipStreamCreate(&streams[i])); + + // Allocate memory on the host and device + HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault)); + setData(hPtr[i], 0xdeadbeef); + HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize)) + + // Prepare kernel launch parameters + threads = (bufSize/sizeof(uint)); + threads_per_block = 64; + blocks = (threads/threads_per_block) + (threads % threads_per_block); + + coordIdx = testCase % numCoords; + xStep = (float)(coords[coordIdx].width / (double)width_); + yStep = (float)(-coords[coordIdx].width / (double)width_); + xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); + yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); + + // Copy memory from host to device + HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice)); + + } + + // Time the kernel execution + auto all_start = std::chrono::steady_clock::now(); + + for(int i = 0; i < numGpus; i++) { + + if(testCase != 0) { + deviceId = i; + } + + HIPCHECK(hipSetDevice(deviceId)); + + hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i], + dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]); + + } + + for(int i = 0; i < numGpus; i++) { + HIPCHECK(hipStreamSynchronize(0)); + } + + + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration all_kernel_time = all_end - all_start; + + for(int i = 0; i < numGpus; i++) { + + if(testCase != 0) { + deviceId = i; + } + HIPCHECK(hipSetDevice(deviceId)); + + // Copy data back from device to the host + HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost)); + + checkData(hPtr[i]); + expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i]; + + if (testCase != 0) { + checkData(hPtr[i]); + if(totalIters != expectedIters[i]) { + std::cout << "Incorrect iteration count detected" << std::endl; + } + } + + + HIPCHECK(hipStreamDestroy(streams[i])); + + // Free host and device memory + HIPCHECK(hipFree(hPtr[i])); + HIPCHECK(hipFree(dPtr[i])); + } + + if (testCase != 0) { + std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): " + << all_kernel_time.count() << " (s) " << '\n' << std::endl; + } + + if(testCase == 0) { + deviceId++; + } + + +} + + +void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) { + unsigned int *ptr2 = (unsigned int *)ptr; + for (unsigned int i = 0; i < width_ * width_ ; i++) { + ptr2[i] = value; + } +} + + +void hipPerfDeviceConcurrency::checkData(uint *ptr) { + totalIters = 0; + for (unsigned int i = 0; i < width_ * width_; i++) { + totalIters += ptr[i]; + } +} + + +int main(int argc, char* argv[]) { + hipPerfDeviceConcurrency deviceConcurrency; + + deviceConcurrency.open(); + + int nGpu = deviceConcurrency.getNumGpus(); + + // testCase = 0 refers to warmup kernel run + int testCase = 0; + + for (int i = 0; i < nGpu; i++) { + // Warm-up kernel on all devices + deviceConcurrency.run(testCase, 1); + } + + // Time for kernel on 1 device + deviceConcurrency.run(++testCase, 1); + + // Time for kernel on all available devices + deviceConcurrency.run(++testCase, nGpu); + + passed(); +} diff --git a/samples/0_Intro/bit_extract/CMakeLists.txt b/samples/0_Intro/bit_extract/CMakeLists.txt new file mode 100644 index 0000000000..c9b13be812 --- /dev/null +++ b/samples/0_Intro/bit_extract/CMakeLists.txt @@ -0,0 +1,20 @@ +project(bit_extract) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) + +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + +# Create the excutable +add_executable(bit_extract bit_extract.cpp) + +# Link with HIP +target_link_libraries(bit_extract hip::host) \ No newline at end of file diff --git a/samples/0_Intro/bit_extract/Makefile b/samples/0_Intro/bit_extract/Makefile index 4a3a0bb4fe..3427815ffc 100644 --- a/samples/0_Intro/bit_extract/Makefile +++ b/samples/0_Intro/bit_extract/Makefile @@ -9,19 +9,15 @@ HIPCC=$(HIP_PATH)/bin/hipcc # Show how to use PLATFORM to specify different options for each compiler: ifeq (${HIP_PLATFORM}, nvcc) - HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 + HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 endif EXE=bit_extract -EXE_STATIC=bit_extract_static $(EXE): bit_extract.cpp $(HIPCC) $(HIPCC_FLAGS) $< -o $@ -$(EXE_STATIC): bit_extract.cpp - $(HIPCC) -use-staticlib $(HIPCC_FLAGS) $< -o $@ - -all: $(EXE) $(EXE_STATIC) +all: $(EXE) clean: - rm -f *.o $(EXE) $(EXE_STATIC) + rm -f *.o $(EXE) diff --git a/samples/0_Intro/module_api/CMakeLists.txt b/samples/0_Intro/module_api/CMakeLists.txt new file mode 100644 index 0000000000..0f5cc32f91 --- /dev/null +++ b/samples/0_Intro/module_api/CMakeLists.txt @@ -0,0 +1,36 @@ +project(module_api) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) + +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + +# Create the excutable +add_executable(runKernel.hip.out runKernel.cpp) +add_executable(launchKernelHcc.hip.out launchKernelHcc.cpp) +add_executable(defaultDriver.hip.out defaultDriver.cpp) + +# Generate code object +add_custom_target( + codeobj + ALL + COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../vcpy_kernel.cpp -o vcpy_kernel.code + COMMENT "codeobj generated" +) + +add_dependencies(runKernel.hip.out codeobj) +add_dependencies(launchKernelHcc.hip.out codeobj) +add_dependencies(defaultDriver.hip.out codeobj) + +# Link with HIP +target_link_libraries(runKernel.hip.out hip::host) +target_link_libraries(launchKernelHcc.hip.out hip::host) +target_link_libraries(defaultDriver.hip.out hip::host) diff --git a/samples/0_Intro/module_api_global/CMakeLists.txt b/samples/0_Intro/module_api_global/CMakeLists.txt new file mode 100644 index 0000000000..00caa79cfa --- /dev/null +++ b/samples/0_Intro/module_api_global/CMakeLists.txt @@ -0,0 +1,30 @@ +project(modile_api_global) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) + +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + +# Create the excutable +add_executable(runKernel.hip.out runKernel.cpp) + +# Generate code object +add_custom_target( + codeobj + ALL + COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../vcpy_kernel.cpp -o vcpy_kernel.code + COMMENT "codeobj generated" +) + +add_dependencies(runKernel.hip.out codeobj) + +# Link with HIP +target_link_libraries(runKernel.hip.out hip::host) \ No newline at end of file diff --git a/samples/0_Intro/square/CMakeLists.txt b/samples/0_Intro/square/CMakeLists.txt new file mode 100644 index 0000000000..845c43fd1f --- /dev/null +++ b/samples/0_Intro/square/CMakeLists.txt @@ -0,0 +1,21 @@ +#Follow "README.md" to generate square.cpp if it's missing + +project(square) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + +# Create the excutable +add_executable(square square.cpp) + +# Link with HIP +target_link_libraries(square hip::host) \ No newline at end of file diff --git a/samples/0_Intro/square/Makefile b/samples/0_Intro/square/Makefile index aa046eeaaa..9bb0dd8205 100644 --- a/samples/0_Intro/square/Makefile +++ b/samples/0_Intro/square/Makefile @@ -11,7 +11,7 @@ else SOURCES=square.cpp endif -all: square.out square.out.static +all: square.out # Step square.cpp: square.cu @@ -20,8 +20,5 @@ square.cpp: square.cu square.out: $(SOURCES) $(HIPCC) $(CXXFLAGS) $(SOURCES) -o $@ -square.out.static: $(SOURCES) - $(HIPCC) -use-staticlib $(CXXFLAGS) $(SOURCES) -o $@ - clean: - rm -f *.o *.out *.out.static square.cpp + rm -f *.o *.out square.cpp diff --git a/samples/0_Intro/square/README.md b/samples/0_Intro/square/README.md index c185903993..0bbb2f7e39 100644 --- a/samples/0_Intro/square/README.md +++ b/samples/0_Intro/square/README.md @@ -1,13 +1,39 @@ # Square.md -Simple test which shows how to use hipify-perl to port CUDA code to HIP. -See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example. +Simple test which shows how to use hipify-perl to port CUDA code to HIP. +See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example. Now it is even simpler and requires no manual modification to the hipified source code - just hipify and compile: -1. Add hip/bin path to the PATH : - export PATH=$PATH:[MYHIP]/bin +- Add hip/bin path to the PATH -2. $ make - Make runs these steps. This can be performed on either CUDA or AMD platform: - hipify-perl square.cu > square.cpp # convert cuda code to hip code - hipcc square.cpp # compile into executable +``` +$ export PATH=$PATH:[MYHIP]/bin +``` + +- Define environment variable + +``` +$ export HIP_PATH=[MYHIP] +``` + +- Build executible file + +``` +$ cd ~/hip/samples/0_Intro/square +$ make +/home/user/hip/bin/hipify-perl square.cu > square.cpp +/home/user/hip/bin/hipcc square.cpp -o square.out +/home/user/hip/bin/hipcc -use-staticlib square.cpp -o square.out.static +``` +- Execute file +``` +$ ./square.out +info: running on device Navi 14 [Radeon Pro W5500] +info: allocate host mem ( 7.63 MB) +info: allocate device mem ( 7.63 MB) +info: copy Host2Device +info: launch 'vector_square' kernel +info: copy Device2Host +info: check result +PASSED! +``` diff --git a/samples/1_Utils/hipBusBandwidth/CMakeLists.txt b/samples/1_Utils/hipBusBandwidth/CMakeLists.txt new file mode 100644 index 0000000000..df01c31d97 --- /dev/null +++ b/samples/1_Utils/hipBusBandwidth/CMakeLists.txt @@ -0,0 +1,20 @@ +project(hipBusBandwidth) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the excutable +add_executable(hipBusBandwidth hipBusBandwidth.cpp ResultDatabase.cpp) + +# Link with HIP +target_link_libraries(hipBusBandwidth hip::host) \ No newline at end of file diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index 6181c49afe..8032bd0a20 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -12,7 +12,7 @@ enum MallocMode { MallocPinned, MallocUnpinned, MallocRegistered }; bool p_verbose = false; MallocMode p_malloc_mode = MallocPinned; int p_numa_ctl = -1; -int p_iterations = 10; +int p_iterations = 0; int p_beatsperiteration = 1; int p_device = 0; int p_detailed = 0; @@ -89,7 +89,9 @@ hipError_t memcopy(void* dst, const void* src, size_t sizeBytes, enum hipMemcpyK int sizes[] = {-64, -256, -512, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288}; int nSizes = sizeof(sizes) / sizeof(int); - +// iterations to be run for the corresponding sizes, less number as the size increases +int iterations[] = {1000, 1000, 1000, 1000, 500, 500, 500, 500, 500, 200, 200, 200, + 200, 200, 100, 100, 100, 100, 50, 50, 50, 20, 20}; // **************************************************************************** // Function: RunBenchmark_H2D @@ -174,53 +176,48 @@ void RunBenchmark_H2D(ResultDatabase& resultDB) { hipEventCreate(&stop); CHECK_HIP_ERROR(); - // Three passes, forward and backward both - for (int pass = 0; pass < p_iterations; pass++) { - // store the times temporarily to estimate latency - // float times[nSizes]; - // Step through sizes forward on even passes and backward on odd - for (int i = 0; i < nSizes; i++) { - int sizeIndex; - if ((pass % 2) == 0) - sizeIndex = i; - else - sizeIndex = (nSizes - 1) - i; + // store the times temporarily to estimate latency + // float times[nSizes]; + for (int i = 0; i < nSizes; i++) { + int sizeIndex, iterIndex; + sizeIndex = i; + iterIndex = i; - const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; - const int nbytes = sizeToBytes(thisSize); + const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; + const int nbytes = sizeToBytes(thisSize); + const int niter = p_iterations ? p_iterations : iterations[iterIndex]; + for (int pass = 0; pass < niter; pass++) { - hipEventRecord(start, 0); - for (int j = 0; j < p_beatsperiteration; j++) { - memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice); - } - hipEventRecord(stop, 0); - hipEventSynchronize(stop); - float t = 0; - hipEventElapsedTime(&t, start, stop); - // times[sizeIndex] = t; - - // Convert to GB/sec - if (p_verbose) { - std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; - } - - double speed = - (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t; - char sizeStr[256]; - if (p_beatsperiteration > 1) { - sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); - } else { - sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); - } - resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode), - sizeStr, "GB/sec", speed); - resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr, - "ms", t); - - if (p_onesize) { - break; - } + hipEventRecord(start, 0); + for (int j = 0; j < p_beatsperiteration; j++) { + memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice); } + hipEventRecord(stop, 0); + hipEventSynchronize(stop); + float t = 0; + hipEventElapsedTime(&t, start, stop); + // times[sizeIndex] = t; + // Convert to GB/sec + if (p_verbose) { + std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; + } + + double speed = + (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t; + char sizeStr[256]; + if (p_beatsperiteration > 1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } + resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode), + sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr, "ms", t); + + } + if (p_onesize) { + break; + } } if (p_onesize) { @@ -347,53 +344,50 @@ void RunBenchmark_D2H(ResultDatabase& resultDB) { hipEventCreate(&stop); CHECK_HIP_ERROR(); - // Three passes, forward and backward both - for (int pass = 0; pass < p_iterations; pass++) { - // store the times temporarily to estimate latency - // float times[nSizes]; - // Step through sizes forward on even passes and backward on odd - for (int i = 0; i < nSizes; i++) { - int sizeIndex; - if ((pass % 2) == 0) - sizeIndex = i; - else - sizeIndex = (nSizes - 1) - i; + // store the times temporarily to estimate latency + // float times[nSizes]; + for (int i = 0; i < nSizes; i++) { + int sizeIndex, iterIndex; + sizeIndex = i; + iterIndex = i; - const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; - const int nbytes = sizeToBytes(thisSize); + const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; + const int nbytes = sizeToBytes(thisSize); + const int niter = p_iterations ? p_iterations : iterations[iterIndex]; + for (int pass = 0; pass < niter; pass++) { - hipEventRecord(start, 0); - for (int j = 0; j < p_beatsperiteration; j++) { - memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost); - } - hipEventRecord(stop, 0); - hipEventSynchronize(stop); - float t = 0; - hipEventElapsedTime(&t, start, stop); - // times[sizeIndex] = t; - - // Convert to GB/sec - if (p_verbose) { - std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; - } - - double speed = - (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t; - char sizeStr[256]; - sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); - if (p_beatsperiteration > 1) { - sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); - } else { - sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); - } - resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode), - sizeStr, "GB/sec", speed); - resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode), - sizeStr, "ms", t); - if (p_onesize) { - break; - } + hipEventRecord(start, 0); + for (int j = 0; j < p_beatsperiteration; j++) { + memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost); } + hipEventRecord(stop, 0); + hipEventSynchronize(stop); + float t = 0; + hipEventElapsedTime(&t, start, stop); + // times[sizeIndex] = t; + // Convert to GB/sec + if (p_verbose) { + std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; + } + + double speed = + (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t; + char sizeStr[256]; + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + if (p_beatsperiteration > 1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } + resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode), + sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode), + sizeStr, "ms", t); + + } + if (p_onesize) { + break; + } } if (p_onesize) { @@ -522,43 +516,43 @@ void RunBenchmark_Bidir(ResultDatabase& resultDB) { hipStreamCreate(&stream[0]); hipStreamCreate(&stream[1]); - // Three passes, forward and backward both - for (int pass = 0; pass < p_iterations; pass++) { - // store the times temporarily to estimate latency - // float times[nSizes]; - // Step through sizes forward on even passes and backward on odd - for (int i = 0; i < nSizes; i++) { - int sizeIndex; - if ((pass % 2) == 0) - sizeIndex = i; - else - sizeIndex = (nSizes - 1) - i; + // store the times temporarily to estimate latency + // float times[nSizes]; + for (int i = 0; i < nSizes; i++) { + int sizeIndex, iterIndex; + sizeIndex = i; + iterIndex = i; - const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; - const int nbytes = sizeToBytes(thisSize); + const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; + const int nbytes = sizeToBytes(thisSize); + const int niter = p_iterations ? p_iterations : iterations[iterIndex]; + for (int pass = 0; pass < niter; pass++) { - hipEventRecord(start, 0); - hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]); - hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]); - hipEventRecord(stop, 0); - hipEventSynchronize(stop); - float t = 0; - hipEventElapsedTime(&t, start, stop); + hipEventRecord(start, 0); + hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]); + hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]); + hipEventRecord(stop, 0); + hipEventSynchronize(stop); + float t = 0; + hipEventElapsedTime(&t, start, stop); - // Convert to GB/sec - if (p_verbose) { - std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; - } - - double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t; - char sizeStr[256]; - sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); - resultDB.AddResult( - std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr, - "GB/sec", speed); - resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode), - sizeStr, "ms", t); + // Convert to GB/sec + if (p_verbose) { + std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; } + + double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t; + char sizeStr[256]; + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + resultDB.AddResult( + std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr, + "GB/sec", speed); + resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode), + sizeStr, "ms", t); + } + if (p_onesize) { + break; + } } // Cleanup @@ -708,66 +702,63 @@ void RunBenchmark_P2P_Unidir(ResultDatabase& resultDB) { hipEventCreate(&stop); CHECK_HIP_ERROR(); - // Three passes, forward and backward both - for (int pass = 0; pass < p_iterations; pass++) { - // store the times temporarily to estimate latency - // float times[nSizes]; - // Step through sizes forward on even passes and backward on odd - for (int i = 0; i < nSizes; i++) { - int sizeIndex; - if ((pass % 2) == 0) - sizeIndex = i; - else - sizeIndex = (nSizes - 1) - i; + // store the times temporarily to estimate latency + // float times[nSizes]; + for (int i = 0; i < nSizes; i++) { + int sizeIndex, iterIndex; + sizeIndex = i; + iterIndex = i; - const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; - const int nbytes = sizeToBytes(thisSize); + const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; + const int nbytes = sizeToBytes(thisSize); + const int niter = p_iterations ? p_iterations : iterations[iterIndex]; + for (int pass = 0; pass < niter; pass++) { - hipDeviceSynchronize(); + hipDeviceSynchronize(); - hipEventRecord(start, 0); + hipEventRecord(start, 0); - for (int j = 0; j < p_beatsperiteration; j++) { - hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice); - } + for (int j = 0; j < p_beatsperiteration; j++) { + hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice); + } - hipEventRecord(stop, 0); + hipEventRecord(stop, 0); - hipEventSynchronize(stop); + hipEventSynchronize(stop); - float t = 0; - hipEventElapsedTime(&t, start, stop); - // times[sizeIndex] = t; + float t = 0; + hipEventElapsedTime(&t, start, stop); + // times[sizeIndex] = t; - // Convert to GB/sec - if (p_verbose) { - std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; - } + // Convert to GB/sec + if (p_verbose) { + std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; + } - double speed = - (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t; - char sizeStr[256]; - if (p_beatsperiteration > 1) { - sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), - p_beatsperiteration); - } else { - sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); - } + double speed = + (double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t; + char sizeStr[256]; + if (p_beatsperiteration > 1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), + p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } - string cGpu, pGpu; - cGpu = gpuIDToString(currentGpu); - pGpu = gpuIDToString(peerGpu); + string cGpu, pGpu; + cGpu = gpuIDToString(currentGpu); + pGpu = gpuIDToString(peerGpu); - resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) + - "_gpu" + std::string(pGpu), + resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) + + "_gpu" + std::string(pGpu), sizeStr, "GB/sec", speed); - resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) + - "_gpu" + std::string(pGpu), + resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) + + "_gpu" + std::string(pGpu), sizeStr, "ms", t); - if (p_onesize) { - break; - } + } + if (p_onesize) { + break; } } @@ -829,71 +820,68 @@ void RunBenchmark_P2P_Bidir(ResultDatabase& resultDB) { hipStreamCreate(&stream[0]); hipStreamCreate(&stream[1]); - // Three passes, forward and backward both - for (int pass = 0; pass < p_iterations; pass++) { - // store the times temporarily to estimate latency - // float times[nSizes]; - // Step through sizes forward on even passes and backward on odd - for (int i = 0; i < nSizes; i++) { - int sizeIndex; - if ((pass % 2) == 0) - sizeIndex = i; - else - sizeIndex = (nSizes - 1) - i; + // store the times temporarily to estimate latency + // float times[nSizes]; + for (int i = 0; i < nSizes; i++) { + int sizeIndex, iterIndex; + sizeIndex = i; + iterIndex = i; - const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; - const int nbytes = sizeToBytes(thisSize); + const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; + const int nbytes = sizeToBytes(thisSize); + const int niter = p_iterations ? p_iterations : iterations[iterIndex]; + for (int pass = 0; pass < niter; pass++) { - hipDeviceSynchronize(); + hipDeviceSynchronize(); - hipEventRecord(start, 0); + hipEventRecord(start, 0); - for (int j = 0; j < p_beatsperiteration; j++) { - hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes, - hipMemcpyDeviceToDevice, stream[0]); - hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes, - hipMemcpyDeviceToDevice, stream[1]); - } - - hipEventRecord(stop, 0); - - hipEventSynchronize(stop); - - float t = 0; - hipEventElapsedTime(&t, start, stop); - // times[sizeIndex] = t; - - // Convert to GB/sec - if (p_verbose) { - std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; - } - - double speed = - (double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) / - t; - char sizeStr[256]; - if (p_beatsperiteration > 1) { - sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), - p_beatsperiteration); - } else { - sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); - } - - string cGpu, pGpu; - cGpu = gpuIDToString(currentGpu); - pGpu = gpuIDToString(peerGpu); - - resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" + - std::string(pGpu), - sizeStr, "GB/sec", speed); - resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" + - std::string(pGpu), - sizeStr, "ms", t); - - if (p_onesize) { - break; - } + for (int j = 0; j < p_beatsperiteration; j++) { + hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes, + hipMemcpyDeviceToDevice, stream[0]); + hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes, + hipMemcpyDeviceToDevice, stream[1]); } + + hipEventRecord(stop, 0); + + hipEventSynchronize(stop); + + float t = 0; + hipEventElapsedTime(&t, start, stop); + // times[sizeIndex] = t; + + // Convert to GB/sec + if (p_verbose) { + std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; + } + + double speed = + (double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) / + t; + char sizeStr[256]; + if (p_beatsperiteration > 1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), + p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } + + string cGpu, pGpu; + cGpu = gpuIDToString(currentGpu); + pGpu = gpuIDToString(peerGpu); + + resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" + + std::string(pGpu), + sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" + + std::string(pGpu), + sizeStr, "ms", t); + + } + if (p_onesize) { + break; + } } if (p_onesize) { diff --git a/samples/1_Utils/hipCommander/CMakeLists.txt b/samples/1_Utils/hipCommander/CMakeLists.txt new file mode 100644 index 0000000000..2592020c66 --- /dev/null +++ b/samples/1_Utils/hipCommander/CMakeLists.txt @@ -0,0 +1,31 @@ +project(hipCommander) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the excutable +add_executable(hipCommander hipCommander.cpp) + +# Generate code object +add_custom_target( + codeobj + ALL + COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../nullkernel.hip.cpp -o nullkernel.hsaco + COMMENT "codeobj generated" +) + +add_dependencies(hipCommander codeobj) + +# Link with HIP +target_link_libraries(hipCommander hip::host) +set_property(TARGET hipCommander PROPERTY CXX_STANDARD 11) diff --git a/samples/1_Utils/hipDispatchLatency/CMakeLists.txt b/samples/1_Utils/hipDispatchLatency/CMakeLists.txt new file mode 100644 index 0000000000..b267f91905 --- /dev/null +++ b/samples/1_Utils/hipDispatchLatency/CMakeLists.txt @@ -0,0 +1,35 @@ +project(hipDispatchLatency) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the excutable +add_executable(hipDispatchLatency hipDispatchLatency.cpp) +add_executable(hipDispatchEnqueueRateMT hipDispatchEnqueueRateMT.cpp) + +# Generate code object +add_custom_target( + codeobj + ALL + COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../test_kernel.cpp -o test_kernel.code + COMMENT "codeobj generated" +) + +add_dependencies(hipDispatchLatency codeobj) +add_dependencies(hipDispatchEnqueueRateMT codeobj) + +# Link with HIP +target_link_libraries(hipDispatchLatency hip::host) +target_link_libraries(hipDispatchEnqueueRateMT hip::host) +set_property(TARGET hipDispatchLatency PROPERTY CXX_STANDARD 11) +set_property(TARGET hipDispatchEnqueueRateMT PROPERTY CXX_STANDARD 11) diff --git a/samples/1_Utils/hipInfo/CMakeLists.txt b/samples/1_Utils/hipInfo/CMakeLists.txt new file mode 100644 index 0000000000..f3678d3160 --- /dev/null +++ b/samples/1_Utils/hipInfo/CMakeLists.txt @@ -0,0 +1,20 @@ +project(hipInfo) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the excutable +add_executable(hipInfo hipInfo.cpp) + +# Link with HIP +target_link_libraries(hipInfo hip::host) diff --git a/samples/2_Cookbook/0_MatrixTranspose/CMakeLists.txt b/samples/2_Cookbook/0_MatrixTranspose/CMakeLists.txt new file mode 100644 index 0000000000..de5bb0b5ea --- /dev/null +++ b/samples/2_Cookbook/0_MatrixTranspose/CMakeLists.txt @@ -0,0 +1,20 @@ +project(MatrixTranspose) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the excutable +add_executable(MatrixTranspose MatrixTranspose.cpp) + +# Link with HIP +target_link_libraries(MatrixTranspose hip::host) diff --git a/samples/2_Cookbook/10_inline_asm/CMakeLists.txt b/samples/2_Cookbook/10_inline_asm/CMakeLists.txt new file mode 100644 index 0000000000..7adb51f5de --- /dev/null +++ b/samples/2_Cookbook/10_inline_asm/CMakeLists.txt @@ -0,0 +1,20 @@ +project(inline_asm) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the excutable +add_executable(inline_asm inline_asm.cpp) + +# Link with HIP +target_link_libraries(inline_asm hip::host) diff --git a/samples/2_Cookbook/11_texture_driver/CMakeLists.txt b/samples/2_Cookbook/11_texture_driver/CMakeLists.txt new file mode 100644 index 0000000000..8ff242c993 --- /dev/null +++ b/samples/2_Cookbook/11_texture_driver/CMakeLists.txt @@ -0,0 +1,30 @@ +project(texture2dDrv) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the excutable +add_executable(texture2dDrv texture2dDrv.cpp) + +# Generate code object +add_custom_target( + codeobj + ALL + COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../tex2dKernel.cpp -o tex2dKernel.code + COMMENT "codeobj generated" +) + +add_dependencies(texture2dDrv codeobj) + +# Link with HIP +target_link_libraries(texture2dDrv hip::host) diff --git a/samples/2_Cookbook/13_occupancy/CMakeLists.txt b/samples/2_Cookbook/13_occupancy/CMakeLists.txt new file mode 100644 index 0000000000..6cad76a395 --- /dev/null +++ b/samples/2_Cookbook/13_occupancy/CMakeLists.txt @@ -0,0 +1,20 @@ +project(occupancy) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the excutable +add_executable(occupancy occupancy.cpp) + +# Link with HIP +target_link_libraries(occupancy hip::host) diff --git a/samples/2_Cookbook/1_hipEvent/CMakeLists.txt b/samples/2_Cookbook/1_hipEvent/CMakeLists.txt new file mode 100644 index 0000000000..6f6ee4e050 --- /dev/null +++ b/samples/2_Cookbook/1_hipEvent/CMakeLists.txt @@ -0,0 +1,20 @@ +project(hipEvent) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the excutable +add_executable(hipEvent hipEvent.cpp) + +# Link with HIP +target_link_libraries(hipEvent hip::host) diff --git a/samples/2_Cookbook/3_shared_memory/CMakeLists.txt b/samples/2_Cookbook/3_shared_memory/CMakeLists.txt new file mode 100644 index 0000000000..6401488628 --- /dev/null +++ b/samples/2_Cookbook/3_shared_memory/CMakeLists.txt @@ -0,0 +1,20 @@ +project(sharedMemory) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the excutable +add_executable(sharedMemory sharedMemory.cpp) + +# Link with HIP +target_link_libraries(sharedMemory hip::host) diff --git a/samples/2_Cookbook/4_shfl/CMakeLists.txt b/samples/2_Cookbook/4_shfl/CMakeLists.txt new file mode 100644 index 0000000000..9d142eeb02 --- /dev/null +++ b/samples/2_Cookbook/4_shfl/CMakeLists.txt @@ -0,0 +1,20 @@ +project(shfl) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_BUILD_TYPE Release) + +# Create the excutable +add_executable(shfl shfl.cpp) + +# Link with HIP +target_link_libraries(shfl hip::host) diff --git a/samples/2_Cookbook/5_2dshfl/CMakeLists.txt b/samples/2_Cookbook/5_2dshfl/CMakeLists.txt new file mode 100644 index 0000000000..adc0e3595d --- /dev/null +++ b/samples/2_Cookbook/5_2dshfl/CMakeLists.txt @@ -0,0 +1,19 @@ +project(2dshfl) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + +# Create the excutable +add_executable(2dshfl 2dshfl.cpp) + +# Link with HIP +target_link_libraries(2dshfl hip::host) diff --git a/samples/2_Cookbook/6_dynamic_shared/CMakeLists.txt b/samples/2_Cookbook/6_dynamic_shared/CMakeLists.txt new file mode 100644 index 0000000000..f177952d5a --- /dev/null +++ b/samples/2_Cookbook/6_dynamic_shared/CMakeLists.txt @@ -0,0 +1,19 @@ +project(dynamic_shared) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + +# Create the excutable +add_executable(dynamic_shared dynamic_shared.cpp) + +# Link with HIP +target_link_libraries(dynamic_shared hip::host) diff --git a/samples/2_Cookbook/7_streams/CMakeLists.txt b/samples/2_Cookbook/7_streams/CMakeLists.txt new file mode 100644 index 0000000000..fac4187b47 --- /dev/null +++ b/samples/2_Cookbook/7_streams/CMakeLists.txt @@ -0,0 +1,19 @@ +project(stream) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + +# Create the excutable +add_executable(stream stream.cpp) + +# Link with HIP +target_link_libraries(stream hip::host) diff --git a/samples/2_Cookbook/8_peer2peer/CMakeLists.txt b/samples/2_Cookbook/8_peer2peer/CMakeLists.txt new file mode 100644 index 0000000000..7c38373911 --- /dev/null +++ b/samples/2_Cookbook/8_peer2peer/CMakeLists.txt @@ -0,0 +1,19 @@ +project(peer2peer) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + +# Create the excutable +add_executable(peer2peer peer2peer.cpp) + +# Link with HIP +target_link_libraries(peer2peer hip::host) diff --git a/samples/2_Cookbook/9_unroll/CMakeLists.txt b/samples/2_Cookbook/9_unroll/CMakeLists.txt new file mode 100644 index 0000000000..fc1b740e33 --- /dev/null +++ b/samples/2_Cookbook/9_unroll/CMakeLists.txt @@ -0,0 +1,19 @@ +project(unroll) + +cmake_minimum_required(VERSION 3.10) + +# Search for rocm in common locations +list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm) + +# Find hip +find_package(hip) + +# Set compiler and linker +set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) +set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) + +# Create the excutable +add_executable(unroll unroll.cpp) + +# Link with HIP +target_link_libraries(unroll hip::host) diff --git a/samples/README.md b/samples/README.md new file mode 100644 index 0000000000..739045382e --- /dev/null +++ b/samples/README.md @@ -0,0 +1,27 @@ +Build procedure + +We provide Makefile and CMakeLists.txt to build the samples seperately. + +1.Makefile supports shared lib of hip-rocclr runtime and nvcc. + +To build a sample, just type in sample folder, + +make + + + +2.CMakeLists.txt can support shared and static libs of hip-rocclr runtime. + +To build a sample, type in sample folder, + +mkdir build (if build folder is missing) + +cd build + +cmake .. + +make + +If you want debug version, follow, + +cmake -DCMAKE_BUILD_TYPE=Debug .. \ No newline at end of file