Revert "Merge branch 'amd-master-next' into amd-npi-next"
This reverts commit873a2e33d3. Reason for revert: <INSERT REASONING HERE> Change-Id: I53322718dadde2c98f96140b8e260ec7ee9ef721 [ROCm/hip-tests commit:051c84bba3]
Этот коммит содержится в:
@@ -1,747 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include "test_common.h"
|
||||
#include <hip/hip_vector_types.h>
|
||||
#include <hip/math_functions.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <map>
|
||||
|
||||
typedef struct {
|
||||
double x;
|
||||
double y;
|
||||
double width;
|
||||
} coordRec;
|
||||
|
||||
coordRec coords[] = {
|
||||
{0.0, 0.0, 4.0}, // Whole set
|
||||
{0.0, 0.0, 0.00001}, // All black
|
||||
{-0.0180789661868, 0.6424294066162, 0.00003824140}, // Hit detail
|
||||
};
|
||||
|
||||
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
|
||||
|
||||
template <typename T>
|
||||
__global__ void float_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep,
|
||||
uint maxIter) {
|
||||
|
||||
#pragma FP_CONTRACT ON
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
float x0 = (float)(xPos + xStep*i);
|
||||
float y0 = (float)(yPos + yStep*j);
|
||||
|
||||
float x = x0;
|
||||
float y = y0;
|
||||
|
||||
uint iter = 0;
|
||||
float tmp;
|
||||
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
|
||||
tmp = x;
|
||||
x = fma(-y,y,fma(x,x,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
}
|
||||
|
||||
out[tid] = iter;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
|
||||
T yPos, T xStep, T yStep, uint maxIter) {
|
||||
|
||||
#pragma FP_CONTRACT ON
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
float x0 = (float)(xPos + xStep*(float)i);
|
||||
float y0 = (float)(yPos + yStep*(float)j);
|
||||
|
||||
float x = x0;
|
||||
float y = y0;
|
||||
|
||||
#define FAST
|
||||
uint iter = 0;
|
||||
float tmp;
|
||||
int stay;
|
||||
int ccount = 0;
|
||||
stay = (x*x+y*y) <= 4.0;
|
||||
float savx = x;
|
||||
float savy = y;
|
||||
#ifdef FAST
|
||||
for (iter = 0; (iter < maxIter); iter+=16) {
|
||||
#else
|
||||
for (iter = 0; stay && (iter < maxIter); iter+=16) {
|
||||
#endif
|
||||
x = savx;
|
||||
y = savy;
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
stay = (x*x+y*y) <= 4.0;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
ccount += stay*16;
|
||||
#ifdef FAST
|
||||
if (!stay)
|
||||
break;
|
||||
#endif
|
||||
}
|
||||
// Handle remainder
|
||||
if (!stay) {
|
||||
iter = 16;
|
||||
do {
|
||||
x = savx;
|
||||
y = savy;
|
||||
stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
|
||||
tmp = x;
|
||||
x = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
ccount += stay;
|
||||
iter--;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
} while (stay && iter);
|
||||
}
|
||||
|
||||
|
||||
out[tid] = (uint)ccount;
|
||||
|
||||
};
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void double_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep,
|
||||
uint maxIter) {
|
||||
|
||||
#pragma FP_CONTRACT ON
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
double x0 = (double)(xPos + xStep*i);
|
||||
double y0 = (double)(yPos + yStep*j);
|
||||
|
||||
double x = x0;
|
||||
double y = y0;
|
||||
|
||||
uint iter = 0;
|
||||
double tmp;
|
||||
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
|
||||
tmp = x;
|
||||
x = fma(-y,y,fma(x,x,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
}
|
||||
out[tid] = iter;
|
||||
};
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
|
||||
T yPos, T xStep, T yStep, uint maxIter) {
|
||||
|
||||
#pragma FP_CONTRACT ON
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
double x0 = (double)(xPos + xStep*(double)i);
|
||||
double y0 = (double)(yPos + yStep*(double)j);
|
||||
|
||||
double x = x0;
|
||||
double y = y0;
|
||||
|
||||
#define FAST
|
||||
uint iter = 0;
|
||||
double tmp;
|
||||
int stay;
|
||||
int ccount = 0;
|
||||
stay = (x*x+y*y) <= 4.0;
|
||||
double savx = x;
|
||||
double savy = y;
|
||||
#ifdef FAST
|
||||
for (iter = 0; (iter < maxIter); iter+=16)
|
||||
#else
|
||||
for (iter = 0; stay && (iter < maxIter); iter+=16)
|
||||
#endif
|
||||
{
|
||||
x = savx;
|
||||
y = savy;
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
stay = (x*x+y*y) <= 4.0;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
ccount += stay*16;
|
||||
#ifdef FAST
|
||||
if (!stay)
|
||||
break;
|
||||
#endif
|
||||
}
|
||||
// Handle remainder
|
||||
if (!stay) {
|
||||
iter = 16;
|
||||
do {
|
||||
x = savx;
|
||||
y = savy;
|
||||
stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
|
||||
tmp = x;
|
||||
x = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
ccount += stay;
|
||||
iter--;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
}
|
||||
while (stay && iter);
|
||||
|
||||
}
|
||||
out[tid] = (uint)ccount;
|
||||
};
|
||||
|
||||
static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15;
|
||||
|
||||
// Expected results for each kernel run at each coord
|
||||
unsigned long long expectedIters[] = {
|
||||
203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull,
|
||||
120254651ull, 203277748ull, 2147483648ull, 120254651ull, 203315114ull,
|
||||
2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull,
|
||||
203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull,
|
||||
120485704ull, 203280620ull, 2147483648ull, 120485704ull, 203315114ull,
|
||||
2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull};
|
||||
|
||||
class hipPerfMandelBrot {
|
||||
public:
|
||||
hipPerfMandelBrot();
|
||||
~hipPerfMandelBrot();
|
||||
|
||||
void setNumKernels(unsigned int num) {
|
||||
numKernels = num;
|
||||
}
|
||||
|
||||
unsigned int getNumKernels() {
|
||||
return numKernels;
|
||||
}
|
||||
|
||||
void setNumStreams(unsigned int num) {
|
||||
numStreams = num;
|
||||
}
|
||||
unsigned int getNumStreams() {
|
||||
return numStreams;
|
||||
}
|
||||
|
||||
void open(int deviceID);
|
||||
void run(unsigned int testCase, unsigned int deviceId);
|
||||
void printResults(void);
|
||||
|
||||
// array of funtion pointers
|
||||
typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t* streams, int blocks,
|
||||
int threads_per_block, int kernelCnt);
|
||||
|
||||
// Wrappers
|
||||
void float_mad(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t* streams,
|
||||
int blocks, int threads_per_block, int kernelCnt);
|
||||
|
||||
void float_mandel_unroll(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t* streams,
|
||||
int blocks, int threads_per_block, int kernelCnt);
|
||||
|
||||
void double_mad(uint *out, uint width, float xPos, float yPos, float xStep,
|
||||
float yStep, uint maxIter, hipStream_t* streams, int blocks,
|
||||
int threads_per_block, int kernelCnt);
|
||||
|
||||
void double_mandel_unroll(uint *out, uint width, float xPos, float yPos, float xStep,
|
||||
float yStep, uint maxIter, hipStream_t* streams, int blocks,
|
||||
int threads_per_block, int kernelCnt);
|
||||
|
||||
hipStream_t streams[2];
|
||||
|
||||
private:
|
||||
void setData(void *ptr, unsigned int value);
|
||||
void checkData(uint *ptr);
|
||||
|
||||
unsigned int numKernels;
|
||||
unsigned int numStreams;
|
||||
|
||||
std::map<std::string, std::vector<double>> results;
|
||||
unsigned int width_;
|
||||
unsigned int bufSize;
|
||||
unsigned int maxIter;
|
||||
unsigned int coordIdx;
|
||||
volatile unsigned long long totalIters = 0;
|
||||
int numCUs;
|
||||
static const unsigned int numLoops = 10;
|
||||
};
|
||||
|
||||
|
||||
hipPerfMandelBrot::hipPerfMandelBrot() {}
|
||||
|
||||
hipPerfMandelBrot::~hipPerfMandelBrot() {}
|
||||
|
||||
void hipPerfMandelBrot::open(int deviceId) {
|
||||
|
||||
|
||||
int nGpu = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&nGpu));
|
||||
if (nGpu < 1) {
|
||||
std::cout << "info: didn't find any GPU! skipping the test!\n";
|
||||
passed();
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
HIPCHECK(hipSetDevice(deviceId));
|
||||
hipDeviceProp_t props = {0};
|
||||
HIPCHECK(hipGetDeviceProperties(&props, deviceId));
|
||||
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
|
||||
<< " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
|
||||
<< std::endl;
|
||||
|
||||
numCUs = props.multiProcessorCount;
|
||||
}
|
||||
|
||||
|
||||
void hipPerfMandelBrot::printResults() {
|
||||
|
||||
int numkernels = getNumKernels();
|
||||
int numStreams = getNumStreams();
|
||||
|
||||
std::cout << "\n" <<"Measured perf for kernels in GFLOPS on "
|
||||
<< numStreams << " streams (s)" << std::endl;
|
||||
|
||||
std::map<std::string, std::vector<double>>:: iterator itr;
|
||||
for (itr = results.begin(); itr != results.end(); itr++) {
|
||||
std::cout << "\n" << std::setw(20) << itr->first << " ";
|
||||
for(auto i : results[itr->first]) {
|
||||
std::cout << std::setw(10) << i << " ";
|
||||
}
|
||||
}
|
||||
results.clear();
|
||||
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
|
||||
// Wrappers for the kernel launches
|
||||
void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos, float yPos, float xStep,
|
||||
float yStep, uint maxIter, hipStream_t* streams,
|
||||
int blocks, int threads_per_block, int kernelCnt) {
|
||||
|
||||
int streamCnt = getNumStreams();
|
||||
hipLaunchKernelGGL(float_mad_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
|
||||
streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep,
|
||||
maxIter);
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t * streams,
|
||||
int blocks, int threads_per_block, int kernelCnt) {
|
||||
|
||||
int streamCnt = getNumStreams();
|
||||
hipLaunchKernelGGL(float_mandel_unroll_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
|
||||
streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
|
||||
|
||||
}
|
||||
|
||||
|
||||
void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t * streams,
|
||||
int blocks, int threads_per_block, int kernelCnt) {
|
||||
|
||||
int streamCnt = getNumStreams();
|
||||
hipLaunchKernelGGL(double_mad_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
|
||||
streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
|
||||
|
||||
}
|
||||
|
||||
|
||||
void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t * streams,
|
||||
int blocks, int threads_per_block, int kernelCnt) {
|
||||
|
||||
int streamCnt = getNumStreams();
|
||||
hipLaunchKernelGGL(float_mandel_unroll_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
|
||||
streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
|
||||
|
||||
}
|
||||
|
||||
|
||||
void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
|
||||
|
||||
unsigned int numStreams = getNumStreams();
|
||||
|
||||
funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
|
||||
&hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
|
||||
|
||||
// Maximum iteration count
|
||||
maxIter = 32768;
|
||||
|
||||
uint * hPtr[numKernels];
|
||||
uint * dPtr[numKernels];
|
||||
|
||||
// Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
|
||||
width_ = 256;
|
||||
|
||||
bufSize = width_ * width_ * sizeof(uint);
|
||||
|
||||
// Create streams for concurrency
|
||||
for (uint i = 0; i < numStreams; i++) {
|
||||
HIPCHECK(hipStreamCreate(&streams[i]));
|
||||
}
|
||||
|
||||
|
||||
// Allocate memory on the host and device
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
|
||||
setData(hPtr[i], 0xdeadbeef);
|
||||
HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
|
||||
}
|
||||
|
||||
|
||||
// Prepare kernel launch parameters
|
||||
int threads = (bufSize/sizeof(uint));
|
||||
int threads_per_block = 64;
|
||||
int blocks = (threads/threads_per_block) + (threads % threads_per_block);
|
||||
|
||||
float xStep = (float)(coords[coordIdx].width / (double)width_);
|
||||
float yStep = (float)(-coords[coordIdx].width / (double)width_);
|
||||
float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
|
||||
float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
|
||||
|
||||
// Copy memory asynchronously and concurrently from host to device
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
// Synchronize to make sure all the copies are completed
|
||||
HIPCHECK(hipStreamSynchronize(0));
|
||||
|
||||
int kernelIdx;
|
||||
if(testCase == 0 || testCase == 5 || testCase == 10) {
|
||||
kernelIdx = 0;
|
||||
}
|
||||
|
||||
else if(testCase == 1 || testCase == 6 || testCase == 11) {
|
||||
kernelIdx = 1;
|
||||
}
|
||||
else if(testCase == 2 || testCase == 7 || testCase == 12) {
|
||||
kernelIdx = 2;
|
||||
}
|
||||
else if(testCase == 3 || testCase == 8 || testCase == 13){
|
||||
kernelIdx = 3;
|
||||
}
|
||||
|
||||
|
||||
double totalTime = 0.0;
|
||||
|
||||
for (unsigned int k = 0; k < numLoops; k++) {
|
||||
|
||||
coordIdx = testCase % numCoords;
|
||||
|
||||
if ((testCase == 0 || testCase == 1 || testCase == 2 ||
|
||||
testCase == 5 || testCase == 6 || testCase == 7 ||
|
||||
testCase == 10 || testCase == 11 || testCase == 12)) {
|
||||
float xStep = (float)(coords[coordIdx].width / (double)width_);
|
||||
float yStep = (float)(-coords[coordIdx].width / (double)width_);
|
||||
float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
|
||||
float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
|
||||
|
||||
// Time the kernel execution
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
|
||||
threads_per_block, i);
|
||||
}
|
||||
|
||||
|
||||
// Synchronize all the concurrent streams to have completed execution
|
||||
HIPCHECK(hipStreamSynchronize(0));
|
||||
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
totalTime += all_kernel_time.count();
|
||||
|
||||
}
|
||||
|
||||
|
||||
else {
|
||||
double xStep = coords[coordIdx].width / (double)width_;
|
||||
double yStep = -coords[coordIdx].width / (double)width_;
|
||||
double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
|
||||
double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
|
||||
|
||||
// Time the kernel execution
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
|
||||
threads_per_block, i);
|
||||
}
|
||||
|
||||
|
||||
// Synchronize all the concurrent streams to have completed execution
|
||||
HIPCHECK(hipStreamSynchronize(0));
|
||||
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
totalTime += all_kernel_time.count();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
// Copy data back from device to the host
|
||||
for(uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
|
||||
for(uint i = 0; i < numKernels; i++) {
|
||||
checkData(hPtr[i]);
|
||||
|
||||
int j =0;
|
||||
while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
|
||||
j++;
|
||||
}
|
||||
|
||||
if(j==30) {
|
||||
std::cout << "Incorrect iteration count detected. ";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Compute GFLOPS. There are 7 FLOPs per iteration
|
||||
double perf = ((double)(totalIters*numKernels) * 7 * (double)(1e-09)) /
|
||||
(totalTime / (double)numLoops);
|
||||
|
||||
|
||||
std::vector<std::string> kernelName = {"float", "float_unroll",
|
||||
"double", "double_unroll"};
|
||||
|
||||
// Print results except for Warm-up kernel
|
||||
if(testCase!=100) {
|
||||
results[kernelName[testCase % 4]].push_back(perf);
|
||||
}
|
||||
|
||||
|
||||
for(uint i = 0 ; i < numStreams; i++) {
|
||||
HIPCHECK(hipStreamDestroy(streams[i]));
|
||||
}
|
||||
|
||||
|
||||
// Free host and device memory
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipFree(hPtr[i]));
|
||||
HIPCHECK(hipFree(dPtr[i]));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
void hipPerfMandelBrot::setData(void *ptr, unsigned int value) {
|
||||
unsigned int *ptr2 = (unsigned int *)ptr;
|
||||
for (unsigned int i = 0; i < width_ * width_; i++) {
|
||||
ptr2[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void hipPerfMandelBrot::checkData(uint *ptr) {
|
||||
totalIters = 0;
|
||||
for (unsigned int i = 0; i < width_ * width_; i++) {
|
||||
totalIters += ptr[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
hipPerfMandelBrot mandelbrotCompute;
|
||||
int deviceId = 0;
|
||||
|
||||
mandelbrotCompute.open(deviceId);
|
||||
|
||||
for (unsigned int testCase = 0; testCase < 3; testCase++) {
|
||||
|
||||
|
||||
switch (testCase) {
|
||||
|
||||
|
||||
case 0: {
|
||||
// Warmup-kernel - default stream executes serially
|
||||
mandelbrotCompute.setNumStreams(1);
|
||||
mandelbrotCompute.setNumKernels(1);
|
||||
mandelbrotCompute.run(100/*Random number*/, deviceId);
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
case 1: {
|
||||
// run all - sync
|
||||
int i = 0;
|
||||
do {
|
||||
mandelbrotCompute.setNumStreams(1);
|
||||
mandelbrotCompute.setNumKernels(1);
|
||||
mandelbrotCompute.run(i, deviceId);
|
||||
i++;
|
||||
}while(i < 12);
|
||||
mandelbrotCompute.printResults();
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
case 2: {
|
||||
// run all - async
|
||||
int i = 0;
|
||||
do {
|
||||
mandelbrotCompute.setNumStreams(2);
|
||||
mandelbrotCompute.setNumKernels(2);
|
||||
mandelbrotCompute.run(i, deviceId);
|
||||
i++;
|
||||
}while(i < 12);
|
||||
mandelbrotCompute.printResults();
|
||||
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
|
||||
default: {
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
passed();
|
||||
}
|
||||
@@ -1,289 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include "test_common.h"
|
||||
|
||||
typedef struct {
|
||||
double x;
|
||||
double y;
|
||||
double width;
|
||||
} coordRec;
|
||||
|
||||
static coordRec coords[] = {
|
||||
{0.0, 0.0, 0.00001}, // All black
|
||||
};
|
||||
|
||||
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
|
||||
|
||||
__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos, float xStep,
|
||||
float yStep, uint maxIter) {
|
||||
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
float x0 = (float)(xPos + xStep*i);
|
||||
float y0 = (float)(yPos + yStep*j);
|
||||
|
||||
float x = x0;
|
||||
float y = y0;
|
||||
|
||||
uint iter = 0;
|
||||
float tmp;
|
||||
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
|
||||
tmp = x;
|
||||
x = fma(-y,y,fma(x,x,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
}
|
||||
|
||||
out[tid] = iter;
|
||||
};
|
||||
|
||||
class hipPerfDeviceConcurrency {
|
||||
public:
|
||||
hipPerfDeviceConcurrency();
|
||||
~hipPerfDeviceConcurrency();
|
||||
|
||||
void setNumGpus(unsigned int num) {
|
||||
numDevices = num;
|
||||
}
|
||||
unsigned int getNumGpus() {
|
||||
return numDevices;
|
||||
}
|
||||
|
||||
void open(void);
|
||||
void close(void);
|
||||
void run(unsigned int testCase, int numGpus);
|
||||
|
||||
private:
|
||||
void setData(void *ptr, unsigned int value);
|
||||
void checkData(uint *ptr);
|
||||
|
||||
unsigned int numDevices;
|
||||
unsigned int width_;
|
||||
unsigned int bufSize;
|
||||
unsigned int coordIdx;
|
||||
unsigned long long totalIters = 0;
|
||||
};
|
||||
|
||||
|
||||
hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() {}
|
||||
|
||||
hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() {}
|
||||
|
||||
void hipPerfDeviceConcurrency::open(void) {
|
||||
|
||||
|
||||
int nGpu = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&nGpu));
|
||||
setNumGpus(nGpu);
|
||||
if (nGpu < 1) {
|
||||
std::cout << "info: didn't find any GPU! skipping the test!\n";
|
||||
passed();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
void hipPerfDeviceConcurrency::close() {
|
||||
}
|
||||
|
||||
void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
|
||||
|
||||
|
||||
static int deviceId;
|
||||
uint * hPtr[numGpus];
|
||||
uint * dPtr[numGpus];
|
||||
hipStream_t streams[numGpus];
|
||||
int numCUs[numGpus];
|
||||
unsigned int maxIter[numGpus];
|
||||
unsigned long long expectedIters[numGpus];
|
||||
|
||||
int threads, threads_per_block, blocks;
|
||||
float xStep, yStep, xPos, yPos;
|
||||
|
||||
for(int i = 0; i < numGpus; i++) {
|
||||
|
||||
if(testCase != 0) {
|
||||
deviceId = i;
|
||||
}
|
||||
|
||||
HIPCHECK(hipSetDevice(deviceId));
|
||||
|
||||
hipDeviceProp_t props = {0};
|
||||
HIPCHECK(hipGetDeviceProperties(&props, i));
|
||||
|
||||
if (testCase != 0) {
|
||||
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
|
||||
<< " with " << props.multiProcessorCount << " CUs" << " and device ID: "
|
||||
<< i << std::endl;
|
||||
}
|
||||
|
||||
numCUs[i] = props.multiProcessorCount;
|
||||
int clkFrequency = 0;
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i));
|
||||
|
||||
clkFrequency =(unsigned int)clkFrequency/1000;
|
||||
|
||||
// Maximum iteration count
|
||||
// maxIter = 8388608 * (engine_clock / 1000).serial execution
|
||||
maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
|
||||
maxIter[i] = (maxIter[i] + 15) & ~15;
|
||||
|
||||
// Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
|
||||
width_ = 256;
|
||||
|
||||
bufSize = width_ * width_ * sizeof(uint);
|
||||
|
||||
// Create streams for concurrency
|
||||
HIPCHECK(hipStreamCreate(&streams[i]));
|
||||
|
||||
// Allocate memory on the host and device
|
||||
HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
|
||||
setData(hPtr[i], 0xdeadbeef);
|
||||
HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
|
||||
|
||||
// Prepare kernel launch parameters
|
||||
threads = (bufSize/sizeof(uint));
|
||||
threads_per_block = 64;
|
||||
blocks = (threads/threads_per_block) + (threads % threads_per_block);
|
||||
|
||||
coordIdx = testCase % numCoords;
|
||||
xStep = (float)(coords[coordIdx].width / (double)width_);
|
||||
yStep = (float)(-coords[coordIdx].width / (double)width_);
|
||||
xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
|
||||
yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
|
||||
|
||||
// Copy memory from host to device
|
||||
HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
|
||||
|
||||
}
|
||||
|
||||
// Time the kernel execution
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
|
||||
for(int i = 0; i < numGpus; i++) {
|
||||
|
||||
if(testCase != 0) {
|
||||
deviceId = i;
|
||||
}
|
||||
|
||||
HIPCHECK(hipSetDevice(deviceId));
|
||||
|
||||
hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i],
|
||||
dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]);
|
||||
|
||||
}
|
||||
|
||||
for(int i = 0; i < numGpus; i++) {
|
||||
HIPCHECK(hipStreamSynchronize(0));
|
||||
}
|
||||
|
||||
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
|
||||
for(int i = 0; i < numGpus; i++) {
|
||||
|
||||
if(testCase != 0) {
|
||||
deviceId = i;
|
||||
}
|
||||
HIPCHECK(hipSetDevice(deviceId));
|
||||
|
||||
// Copy data back from device to the host
|
||||
HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
|
||||
|
||||
checkData(hPtr[i]);
|
||||
expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
|
||||
|
||||
if (testCase != 0) {
|
||||
checkData(hPtr[i]);
|
||||
if(totalIters != expectedIters[i]) {
|
||||
std::cout << "Incorrect iteration count detected" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
HIPCHECK(hipStreamDestroy(streams[i]));
|
||||
|
||||
// Free host and device memory
|
||||
HIPCHECK(hipFree(hPtr[i]));
|
||||
HIPCHECK(hipFree(dPtr[i]));
|
||||
}
|
||||
|
||||
if (testCase != 0) {
|
||||
std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): "
|
||||
<< all_kernel_time.count() << " (s) " << '\n' << std::endl;
|
||||
}
|
||||
|
||||
if(testCase == 0) {
|
||||
deviceId++;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
|
||||
unsigned int *ptr2 = (unsigned int *)ptr;
|
||||
for (unsigned int i = 0; i < width_ * width_ ; i++) {
|
||||
ptr2[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void hipPerfDeviceConcurrency::checkData(uint *ptr) {
|
||||
totalIters = 0;
|
||||
for (unsigned int i = 0; i < width_ * width_; i++) {
|
||||
totalIters += ptr[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
hipPerfDeviceConcurrency deviceConcurrency;
|
||||
|
||||
deviceConcurrency.open();
|
||||
|
||||
int nGpu = deviceConcurrency.getNumGpus();
|
||||
|
||||
// testCase = 0 refers to warmup kernel run
|
||||
int testCase = 0;
|
||||
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
// Warm-up kernel on all devices
|
||||
deviceConcurrency.run(testCase, 1);
|
||||
}
|
||||
|
||||
// Time for kernel on 1 device
|
||||
deviceConcurrency.run(++testCase, 1);
|
||||
|
||||
// Time for kernel on all available devices
|
||||
deviceConcurrency.run(++testCase, nGpu);
|
||||
|
||||
passed();
|
||||
}
|
||||
@@ -1,20 +0,0 @@
|
||||
project(bit_extract)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
|
||||
# Create the excutable
|
||||
add_executable(bit_extract bit_extract.cpp)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(bit_extract hip::host)
|
||||
@@ -9,15 +9,19 @@ HIPCC=$(HIP_PATH)/bin/hipcc
|
||||
|
||||
# Show how to use PLATFORM to specify different options for each compiler:
|
||||
ifeq (${HIP_PLATFORM}, nvcc)
|
||||
HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20
|
||||
HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20
|
||||
endif
|
||||
|
||||
EXE=bit_extract
|
||||
EXE_STATIC=bit_extract_static
|
||||
|
||||
$(EXE): bit_extract.cpp
|
||||
$(HIPCC) $(HIPCC_FLAGS) $< -o $@
|
||||
|
||||
all: $(EXE)
|
||||
$(EXE_STATIC): bit_extract.cpp
|
||||
$(HIPCC) -use-staticlib $(HIPCC_FLAGS) $< -o $@
|
||||
|
||||
all: $(EXE) $(EXE_STATIC)
|
||||
|
||||
clean:
|
||||
rm -f *.o $(EXE)
|
||||
rm -f *.o $(EXE) $(EXE_STATIC)
|
||||
|
||||
@@ -1,36 +0,0 @@
|
||||
project(module_api)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
|
||||
# Create the excutable
|
||||
add_executable(runKernel.hip.out runKernel.cpp)
|
||||
add_executable(launchKernelHcc.hip.out launchKernelHcc.cpp)
|
||||
add_executable(defaultDriver.hip.out defaultDriver.cpp)
|
||||
|
||||
# Generate code object
|
||||
add_custom_target(
|
||||
codeobj
|
||||
ALL
|
||||
COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../vcpy_kernel.cpp -o vcpy_kernel.code
|
||||
COMMENT "codeobj generated"
|
||||
)
|
||||
|
||||
add_dependencies(runKernel.hip.out codeobj)
|
||||
add_dependencies(launchKernelHcc.hip.out codeobj)
|
||||
add_dependencies(defaultDriver.hip.out codeobj)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(runKernel.hip.out hip::host)
|
||||
target_link_libraries(launchKernelHcc.hip.out hip::host)
|
||||
target_link_libraries(defaultDriver.hip.out hip::host)
|
||||
@@ -1,30 +0,0 @@
|
||||
project(modile_api_global)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
|
||||
# Create the excutable
|
||||
add_executable(runKernel.hip.out runKernel.cpp)
|
||||
|
||||
# Generate code object
|
||||
add_custom_target(
|
||||
codeobj
|
||||
ALL
|
||||
COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../vcpy_kernel.cpp -o vcpy_kernel.code
|
||||
COMMENT "codeobj generated"
|
||||
)
|
||||
|
||||
add_dependencies(runKernel.hip.out codeobj)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(runKernel.hip.out hip::host)
|
||||
@@ -1,21 +0,0 @@
|
||||
#Follow "README.md" to generate square.cpp if it's missing
|
||||
|
||||
project(square)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
|
||||
# Create the excutable
|
||||
add_executable(square square.cpp)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(square hip::host)
|
||||
@@ -11,7 +11,7 @@ else
|
||||
SOURCES=square.cpp
|
||||
endif
|
||||
|
||||
all: square.out
|
||||
all: square.out square.out.static
|
||||
|
||||
# Step
|
||||
square.cpp: square.cu
|
||||
@@ -20,5 +20,8 @@ square.cpp: square.cu
|
||||
square.out: $(SOURCES)
|
||||
$(HIPCC) $(CXXFLAGS) $(SOURCES) -o $@
|
||||
|
||||
square.out.static: $(SOURCES)
|
||||
$(HIPCC) -use-staticlib $(CXXFLAGS) $(SOURCES) -o $@
|
||||
|
||||
clean:
|
||||
rm -f *.o *.out square.cpp
|
||||
rm -f *.o *.out *.out.static square.cpp
|
||||
|
||||
@@ -1,39 +1,13 @@
|
||||
# Square.md
|
||||
|
||||
Simple test which shows how to use hipify-perl to port CUDA code to HIP.
|
||||
See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example.
|
||||
Simple test which shows how to use hipify-perl to port CUDA code to HIP.
|
||||
See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example.
|
||||
Now it is even simpler and requires no manual modification to the hipified source code - just hipify and compile:
|
||||
|
||||
- Add hip/bin path to the PATH
|
||||
1. Add hip/bin path to the PATH :
|
||||
<code>export PATH=$PATH:[MYHIP]/bin</code>
|
||||
|
||||
```
|
||||
$ export PATH=$PATH:[MYHIP]/bin
|
||||
```
|
||||
|
||||
- Define environment variable
|
||||
|
||||
```
|
||||
$ export HIP_PATH=[MYHIP]
|
||||
```
|
||||
|
||||
- Build executible file
|
||||
|
||||
```
|
||||
$ cd ~/hip/samples/0_Intro/square
|
||||
$ make
|
||||
/home/user/hip/bin/hipify-perl square.cu > square.cpp
|
||||
/home/user/hip/bin/hipcc square.cpp -o square.out
|
||||
/home/user/hip/bin/hipcc -use-staticlib square.cpp -o square.out.static
|
||||
```
|
||||
- Execute file
|
||||
```
|
||||
$ ./square.out
|
||||
info: running on device Navi 14 [Radeon Pro W5500]
|
||||
info: allocate host mem ( 7.63 MB)
|
||||
info: allocate device mem ( 7.63 MB)
|
||||
info: copy Host2Device
|
||||
info: launch 'vector_square' kernel
|
||||
info: copy Device2Host
|
||||
info: check result
|
||||
PASSED!
|
||||
```
|
||||
2. <code>$ make </code>
|
||||
Make runs these steps. This can be performed on either CUDA or AMD platform:
|
||||
<code>hipify-perl square.cu > square.cpp </code> # convert cuda code to hip code
|
||||
<code>hipcc square.cpp</code> # compile into executable
|
||||
|
||||
@@ -1,20 +0,0 @@
|
||||
project(hipBusBandwidth)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
|
||||
# Create the excutable
|
||||
add_executable(hipBusBandwidth hipBusBandwidth.cpp ResultDatabase.cpp)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(hipBusBandwidth hip::host)
|
||||
@@ -12,7 +12,7 @@ enum MallocMode { MallocPinned, MallocUnpinned, MallocRegistered };
|
||||
bool p_verbose = false;
|
||||
MallocMode p_malloc_mode = MallocPinned;
|
||||
int p_numa_ctl = -1;
|
||||
int p_iterations = 0;
|
||||
int p_iterations = 10;
|
||||
int p_beatsperiteration = 1;
|
||||
int p_device = 0;
|
||||
int p_detailed = 0;
|
||||
@@ -89,9 +89,7 @@ hipError_t memcopy(void* dst, const void* src, size_t sizeBytes, enum hipMemcpyK
|
||||
int sizes[] = {-64, -256, -512, 1, 2, 4, 8, 16, 32, 64, 128, 256,
|
||||
512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288};
|
||||
int nSizes = sizeof(sizes) / sizeof(int);
|
||||
// iterations to be run for the corresponding sizes, less number as the size increases
|
||||
int iterations[] = {1000, 1000, 1000, 1000, 500, 500, 500, 500, 500, 200, 200, 200,
|
||||
200, 200, 100, 100, 100, 100, 50, 50, 50, 20, 20};
|
||||
|
||||
|
||||
// ****************************************************************************
|
||||
// Function: RunBenchmark_H2D
|
||||
@@ -176,48 +174,53 @@ void RunBenchmark_H2D(ResultDatabase& resultDB) {
|
||||
hipEventCreate(&stop);
|
||||
CHECK_HIP_ERROR();
|
||||
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex, iterIndex;
|
||||
sizeIndex = i;
|
||||
iterIndex = i;
|
||||
// Three passes, forward and backward both
|
||||
for (int pass = 0; pass < p_iterations; pass++) {
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
// Step through sizes forward on even passes and backward on odd
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex;
|
||||
if ((pass % 2) == 0)
|
||||
sizeIndex = i;
|
||||
else
|
||||
sizeIndex = (nSizes - 1) - i;
|
||||
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
|
||||
for (int pass = 0; pass < niter; pass++) {
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
|
||||
hipEventRecord(start, 0);
|
||||
for (int j = 0; j < p_beatsperiteration; j++) {
|
||||
memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice);
|
||||
hipEventRecord(start, 0);
|
||||
for (int j = 0; j < p_beatsperiteration; j++) {
|
||||
memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice);
|
||||
}
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
// times[sizeIndex] = t;
|
||||
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed =
|
||||
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
|
||||
char sizeStr[256];
|
||||
if (p_beatsperiteration > 1) {
|
||||
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
|
||||
sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr,
|
||||
"ms", t);
|
||||
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
// times[sizeIndex] = t;
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed =
|
||||
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
|
||||
char sizeStr[256];
|
||||
if (p_beatsperiteration > 1) {
|
||||
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
|
||||
sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr, "ms", t);
|
||||
|
||||
}
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (p_onesize) {
|
||||
@@ -344,50 +347,53 @@ void RunBenchmark_D2H(ResultDatabase& resultDB) {
|
||||
hipEventCreate(&stop);
|
||||
CHECK_HIP_ERROR();
|
||||
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex, iterIndex;
|
||||
sizeIndex = i;
|
||||
iterIndex = i;
|
||||
// Three passes, forward and backward both
|
||||
for (int pass = 0; pass < p_iterations; pass++) {
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
// Step through sizes forward on even passes and backward on odd
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex;
|
||||
if ((pass % 2) == 0)
|
||||
sizeIndex = i;
|
||||
else
|
||||
sizeIndex = (nSizes - 1) - i;
|
||||
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
|
||||
for (int pass = 0; pass < niter; pass++) {
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
|
||||
hipEventRecord(start, 0);
|
||||
for (int j = 0; j < p_beatsperiteration; j++) {
|
||||
memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost);
|
||||
hipEventRecord(start, 0);
|
||||
for (int j = 0; j < p_beatsperiteration; j++) {
|
||||
memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost);
|
||||
}
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
// times[sizeIndex] = t;
|
||||
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed =
|
||||
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
|
||||
char sizeStr[256];
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
if (p_beatsperiteration > 1) {
|
||||
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
|
||||
sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode),
|
||||
sizeStr, "ms", t);
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
// times[sizeIndex] = t;
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed =
|
||||
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
|
||||
char sizeStr[256];
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
if (p_beatsperiteration > 1) {
|
||||
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
|
||||
sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode),
|
||||
sizeStr, "ms", t);
|
||||
|
||||
}
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (p_onesize) {
|
||||
@@ -516,43 +522,43 @@ void RunBenchmark_Bidir(ResultDatabase& resultDB) {
|
||||
hipStreamCreate(&stream[0]);
|
||||
hipStreamCreate(&stream[1]);
|
||||
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex, iterIndex;
|
||||
sizeIndex = i;
|
||||
iterIndex = i;
|
||||
// Three passes, forward and backward both
|
||||
for (int pass = 0; pass < p_iterations; pass++) {
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
// Step through sizes forward on even passes and backward on odd
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex;
|
||||
if ((pass % 2) == 0)
|
||||
sizeIndex = i;
|
||||
else
|
||||
sizeIndex = (nSizes - 1) - i;
|
||||
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
|
||||
for (int pass = 0; pass < niter; pass++) {
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
|
||||
hipEventRecord(start, 0);
|
||||
hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]);
|
||||
hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]);
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
hipEventRecord(start, 0);
|
||||
hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]);
|
||||
hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]);
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventSynchronize(stop);
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t;
|
||||
char sizeStr[256];
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
resultDB.AddResult(
|
||||
std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr,
|
||||
"GB/sec", speed);
|
||||
resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode),
|
||||
sizeStr, "ms", t);
|
||||
}
|
||||
|
||||
double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t;
|
||||
char sizeStr[256];
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
resultDB.AddResult(
|
||||
std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr,
|
||||
"GB/sec", speed);
|
||||
resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode),
|
||||
sizeStr, "ms", t);
|
||||
}
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
@@ -702,63 +708,66 @@ void RunBenchmark_P2P_Unidir(ResultDatabase& resultDB) {
|
||||
hipEventCreate(&stop);
|
||||
CHECK_HIP_ERROR();
|
||||
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex, iterIndex;
|
||||
sizeIndex = i;
|
||||
iterIndex = i;
|
||||
// Three passes, forward and backward both
|
||||
for (int pass = 0; pass < p_iterations; pass++) {
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
// Step through sizes forward on even passes and backward on odd
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex;
|
||||
if ((pass % 2) == 0)
|
||||
sizeIndex = i;
|
||||
else
|
||||
sizeIndex = (nSizes - 1) - i;
|
||||
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
|
||||
for (int pass = 0; pass < niter; pass++) {
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
|
||||
hipDeviceSynchronize();
|
||||
hipDeviceSynchronize();
|
||||
|
||||
hipEventRecord(start, 0);
|
||||
hipEventRecord(start, 0);
|
||||
|
||||
for (int j = 0; j < p_beatsperiteration; j++) {
|
||||
hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
|
||||
}
|
||||
for (int j = 0; j < p_beatsperiteration; j++) {
|
||||
hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
|
||||
}
|
||||
|
||||
hipEventRecord(stop, 0);
|
||||
hipEventRecord(stop, 0);
|
||||
|
||||
hipEventSynchronize(stop);
|
||||
hipEventSynchronize(stop);
|
||||
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
// times[sizeIndex] = t;
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
// times[sizeIndex] = t;
|
||||
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed =
|
||||
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
|
||||
char sizeStr[256];
|
||||
if (p_beatsperiteration > 1) {
|
||||
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
|
||||
p_beatsperiteration);
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
double speed =
|
||||
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
|
||||
char sizeStr[256];
|
||||
if (p_beatsperiteration > 1) {
|
||||
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
|
||||
p_beatsperiteration);
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
|
||||
string cGpu, pGpu;
|
||||
cGpu = gpuIDToString(currentGpu);
|
||||
pGpu = gpuIDToString(peerGpu);
|
||||
string cGpu, pGpu;
|
||||
cGpu = gpuIDToString(currentGpu);
|
||||
pGpu = gpuIDToString(peerGpu);
|
||||
|
||||
resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) +
|
||||
"_gpu" + std::string(pGpu),
|
||||
resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) +
|
||||
"_gpu" + std::string(pGpu),
|
||||
sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) +
|
||||
"_gpu" + std::string(pGpu),
|
||||
resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) +
|
||||
"_gpu" + std::string(pGpu),
|
||||
sizeStr, "ms", t);
|
||||
|
||||
}
|
||||
if (p_onesize) {
|
||||
break;
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -820,68 +829,71 @@ void RunBenchmark_P2P_Bidir(ResultDatabase& resultDB) {
|
||||
hipStreamCreate(&stream[0]);
|
||||
hipStreamCreate(&stream[1]);
|
||||
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex, iterIndex;
|
||||
sizeIndex = i;
|
||||
iterIndex = i;
|
||||
// Three passes, forward and backward both
|
||||
for (int pass = 0; pass < p_iterations; pass++) {
|
||||
// store the times temporarily to estimate latency
|
||||
// float times[nSizes];
|
||||
// Step through sizes forward on even passes and backward on odd
|
||||
for (int i = 0; i < nSizes; i++) {
|
||||
int sizeIndex;
|
||||
if ((pass % 2) == 0)
|
||||
sizeIndex = i;
|
||||
else
|
||||
sizeIndex = (nSizes - 1) - i;
|
||||
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
|
||||
for (int pass = 0; pass < niter; pass++) {
|
||||
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
|
||||
const int nbytes = sizeToBytes(thisSize);
|
||||
|
||||
hipDeviceSynchronize();
|
||||
hipDeviceSynchronize();
|
||||
|
||||
hipEventRecord(start, 0);
|
||||
hipEventRecord(start, 0);
|
||||
|
||||
for (int j = 0; j < p_beatsperiteration; j++) {
|
||||
hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes,
|
||||
hipMemcpyDeviceToDevice, stream[0]);
|
||||
hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes,
|
||||
hipMemcpyDeviceToDevice, stream[1]);
|
||||
for (int j = 0; j < p_beatsperiteration; j++) {
|
||||
hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes,
|
||||
hipMemcpyDeviceToDevice, stream[0]);
|
||||
hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes,
|
||||
hipMemcpyDeviceToDevice, stream[1]);
|
||||
}
|
||||
|
||||
hipEventRecord(stop, 0);
|
||||
|
||||
hipEventSynchronize(stop);
|
||||
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
// times[sizeIndex] = t;
|
||||
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed =
|
||||
(double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) /
|
||||
t;
|
||||
char sizeStr[256];
|
||||
if (p_beatsperiteration > 1) {
|
||||
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
|
||||
p_beatsperiteration);
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
|
||||
string cGpu, pGpu;
|
||||
cGpu = gpuIDToString(currentGpu);
|
||||
pGpu = gpuIDToString(peerGpu);
|
||||
|
||||
resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
|
||||
std::string(pGpu),
|
||||
sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
|
||||
std::string(pGpu),
|
||||
sizeStr, "ms", t);
|
||||
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
hipEventRecord(stop, 0);
|
||||
|
||||
hipEventSynchronize(stop);
|
||||
|
||||
float t = 0;
|
||||
hipEventElapsedTime(&t, start, stop);
|
||||
// times[sizeIndex] = t;
|
||||
|
||||
// Convert to GB/sec
|
||||
if (p_verbose) {
|
||||
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
|
||||
}
|
||||
|
||||
double speed =
|
||||
(double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) /
|
||||
t;
|
||||
char sizeStr[256];
|
||||
if (p_beatsperiteration > 1) {
|
||||
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
|
||||
p_beatsperiteration);
|
||||
} else {
|
||||
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
|
||||
}
|
||||
|
||||
string cGpu, pGpu;
|
||||
cGpu = gpuIDToString(currentGpu);
|
||||
pGpu = gpuIDToString(peerGpu);
|
||||
|
||||
resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
|
||||
std::string(pGpu),
|
||||
sizeStr, "GB/sec", speed);
|
||||
resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
|
||||
std::string(pGpu),
|
||||
sizeStr, "ms", t);
|
||||
|
||||
}
|
||||
if (p_onesize) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (p_onesize) {
|
||||
|
||||
@@ -1,31 +0,0 @@
|
||||
project(hipCommander)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
|
||||
# Create the excutable
|
||||
add_executable(hipCommander hipCommander.cpp)
|
||||
|
||||
# Generate code object
|
||||
add_custom_target(
|
||||
codeobj
|
||||
ALL
|
||||
COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../nullkernel.hip.cpp -o nullkernel.hsaco
|
||||
COMMENT "codeobj generated"
|
||||
)
|
||||
|
||||
add_dependencies(hipCommander codeobj)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(hipCommander hip::host)
|
||||
set_property(TARGET hipCommander PROPERTY CXX_STANDARD 11)
|
||||
@@ -1,35 +0,0 @@
|
||||
project(hipDispatchLatency)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
|
||||
# Create the excutable
|
||||
add_executable(hipDispatchLatency hipDispatchLatency.cpp)
|
||||
add_executable(hipDispatchEnqueueRateMT hipDispatchEnqueueRateMT.cpp)
|
||||
|
||||
# Generate code object
|
||||
add_custom_target(
|
||||
codeobj
|
||||
ALL
|
||||
COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../test_kernel.cpp -o test_kernel.code
|
||||
COMMENT "codeobj generated"
|
||||
)
|
||||
|
||||
add_dependencies(hipDispatchLatency codeobj)
|
||||
add_dependencies(hipDispatchEnqueueRateMT codeobj)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(hipDispatchLatency hip::host)
|
||||
target_link_libraries(hipDispatchEnqueueRateMT hip::host)
|
||||
set_property(TARGET hipDispatchLatency PROPERTY CXX_STANDARD 11)
|
||||
set_property(TARGET hipDispatchEnqueueRateMT PROPERTY CXX_STANDARD 11)
|
||||
@@ -1,20 +0,0 @@
|
||||
project(hipInfo)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
|
||||
# Create the excutable
|
||||
add_executable(hipInfo hipInfo.cpp)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(hipInfo hip::host)
|
||||
@@ -1,20 +0,0 @@
|
||||
project(MatrixTranspose)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
|
||||
# Create the excutable
|
||||
add_executable(MatrixTranspose MatrixTranspose.cpp)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(MatrixTranspose hip::host)
|
||||
@@ -1,20 +0,0 @@
|
||||
project(inline_asm)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
|
||||
# Create the excutable
|
||||
add_executable(inline_asm inline_asm.cpp)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(inline_asm hip::host)
|
||||
@@ -1,30 +0,0 @@
|
||||
project(texture2dDrv)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
|
||||
# Create the excutable
|
||||
add_executable(texture2dDrv texture2dDrv.cpp)
|
||||
|
||||
# Generate code object
|
||||
add_custom_target(
|
||||
codeobj
|
||||
ALL
|
||||
COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../tex2dKernel.cpp -o tex2dKernel.code
|
||||
COMMENT "codeobj generated"
|
||||
)
|
||||
|
||||
add_dependencies(texture2dDrv codeobj)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(texture2dDrv hip::host)
|
||||
@@ -1,20 +0,0 @@
|
||||
project(occupancy)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
|
||||
# Create the excutable
|
||||
add_executable(occupancy occupancy.cpp)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(occupancy hip::host)
|
||||
@@ -1,20 +0,0 @@
|
||||
project(hipEvent)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
|
||||
# Create the excutable
|
||||
add_executable(hipEvent hipEvent.cpp)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(hipEvent hip::host)
|
||||
@@ -1,20 +0,0 @@
|
||||
project(sharedMemory)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
|
||||
# Create the excutable
|
||||
add_executable(sharedMemory sharedMemory.cpp)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(sharedMemory hip::host)
|
||||
@@ -1,20 +0,0 @@
|
||||
project(shfl)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
|
||||
# Create the excutable
|
||||
add_executable(shfl shfl.cpp)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(shfl hip::host)
|
||||
@@ -1,19 +0,0 @@
|
||||
project(2dshfl)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
|
||||
# Create the excutable
|
||||
add_executable(2dshfl 2dshfl.cpp)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(2dshfl hip::host)
|
||||
@@ -1,19 +0,0 @@
|
||||
project(dynamic_shared)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
|
||||
# Create the excutable
|
||||
add_executable(dynamic_shared dynamic_shared.cpp)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(dynamic_shared hip::host)
|
||||
@@ -1,19 +0,0 @@
|
||||
project(stream)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
|
||||
# Create the excutable
|
||||
add_executable(stream stream.cpp)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(stream hip::host)
|
||||
@@ -1,19 +0,0 @@
|
||||
project(peer2peer)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
|
||||
# Create the excutable
|
||||
add_executable(peer2peer peer2peer.cpp)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(peer2peer hip::host)
|
||||
@@ -1,19 +0,0 @@
|
||||
project(unroll)
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Search for rocm in common locations
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
|
||||
|
||||
# Find hip
|
||||
find_package(hip)
|
||||
|
||||
# Set compiler and linker
|
||||
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
|
||||
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
|
||||
|
||||
# Create the excutable
|
||||
add_executable(unroll unroll.cpp)
|
||||
|
||||
# Link with HIP
|
||||
target_link_libraries(unroll hip::host)
|
||||
@@ -1,27 +0,0 @@
|
||||
Build procedure
|
||||
|
||||
We provide Makefile and CMakeLists.txt to build the samples seperately.
|
||||
|
||||
1.Makefile supports shared lib of hip-rocclr runtime and nvcc.
|
||||
|
||||
To build a sample, just type in sample folder,
|
||||
|
||||
make
|
||||
|
||||
|
||||
|
||||
2.CMakeLists.txt can support shared and static libs of hip-rocclr runtime.
|
||||
|
||||
To build a sample, type in sample folder,
|
||||
|
||||
mkdir build (if build folder is missing)
|
||||
|
||||
cd build
|
||||
|
||||
cmake ..
|
||||
|
||||
make
|
||||
|
||||
If you want debug version, follow,
|
||||
|
||||
cmake -DCMAKE_BUILD_TYPE=Debug ..
|
||||
Ссылка в новой задаче
Block a user