Revert "Revert "Merge branch 'amd-master-next' into amd-npi-next""

This reverts commit 5bc903a0cd.

Reason for revert: <INSERT REASONING HERE>

Change-Id: I92ceb171e31026ed1864704cef2fc1497b883ef9


[ROCm/hip-tests commit: 08c35854c0]
Этот коммит содержится в:
Vladislav Sytchenko
2020-10-05 13:20:58 -04:00
родитель 5bc903a0cd
Коммит 64bc0bc57f
27 изменённых файлов: 1777 добавлений и 249 удалений
+747
Просмотреть файл
@@ -0,0 +1,747 @@
/*
Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
* TEST: %t
* HIT_END
*/
#include <iostream>
#include <chrono>
#include "test_common.h"
#include <hip/hip_vector_types.h>
#include <hip/math_functions.h>
#include <vector>
#include <string>
#include <map>
typedef struct {
double x;
double y;
double width;
} coordRec;
coordRec coords[] = {
{0.0, 0.0, 4.0}, // Whole set
{0.0, 0.0, 0.00001}, // All black
{-0.0180789661868, 0.6424294066162, 0.00003824140}, // Hit detail
};
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
template <typename T>
__global__ void float_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep,
uint maxIter) {
#pragma FP_CONTRACT ON
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
float x0 = (float)(xPos + xStep*i);
float y0 = (float)(yPos + yStep*j);
float x = x0;
float y = y0;
uint iter = 0;
float tmp;
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
tmp = x;
x = fma(-y,y,fma(x,x,x0));
y = fma(2.0f*tmp,y,y0);
}
out[tid] = iter;
};
template <typename T>
__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
T yPos, T xStep, T yStep, uint maxIter) {
#pragma FP_CONTRACT ON
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
float x0 = (float)(xPos + xStep*(float)i);
float y0 = (float)(yPos + yStep*(float)j);
float x = x0;
float y = y0;
#define FAST
uint iter = 0;
float tmp;
int stay;
int ccount = 0;
stay = (x*x+y*y) <= 4.0;
float savx = x;
float savy = y;
#ifdef FAST
for (iter = 0; (iter < maxIter); iter+=16) {
#else
for (iter = 0; stay && (iter < maxIter); iter+=16) {
#endif
x = savx;
y = savy;
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
stay = (x*x+y*y) <= 4.0;
savx = (stay ? x : savx);
savy = (stay ? y : savy);
ccount += stay*16;
#ifdef FAST
if (!stay)
break;
#endif
}
// Handle remainder
if (!stay) {
iter = 16;
do {
x = savx;
y = savy;
stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
tmp = x;
x = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*tmp,y,y0);
ccount += stay;
iter--;
savx = (stay ? x : savx);
savy = (stay ? y : savy);
} while (stay && iter);
}
out[tid] = (uint)ccount;
};
template <typename T>
__global__ void double_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep,
uint maxIter) {
#pragma FP_CONTRACT ON
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
double x0 = (double)(xPos + xStep*i);
double y0 = (double)(yPos + yStep*j);
double x = x0;
double y = y0;
uint iter = 0;
double tmp;
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
tmp = x;
x = fma(-y,y,fma(x,x,x0));
y = fma(2.0f*tmp,y,y0);
}
out[tid] = iter;
};
template <typename T>
__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
T yPos, T xStep, T yStep, uint maxIter) {
#pragma FP_CONTRACT ON
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
double x0 = (double)(xPos + xStep*(double)i);
double y0 = (double)(yPos + yStep*(double)j);
double x = x0;
double y = y0;
#define FAST
uint iter = 0;
double tmp;
int stay;
int ccount = 0;
stay = (x*x+y*y) <= 4.0;
double savx = x;
double savy = y;
#ifdef FAST
for (iter = 0; (iter < maxIter); iter+=16)
#else
for (iter = 0; stay && (iter < maxIter); iter+=16)
#endif
{
x = savx;
y = savy;
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
stay = (x*x+y*y) <= 4.0;
savx = (stay ? x : savx);
savy = (stay ? y : savy);
ccount += stay*16;
#ifdef FAST
if (!stay)
break;
#endif
}
// Handle remainder
if (!stay) {
iter = 16;
do {
x = savx;
y = savy;
stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
tmp = x;
x = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*tmp,y,y0);
ccount += stay;
iter--;
savx = (stay ? x : savx);
savy = (stay ? y : savy);
}
while (stay && iter);
}
out[tid] = (uint)ccount;
};
static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15;
// Expected results for each kernel run at each coord
unsigned long long expectedIters[] = {
203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull,
120254651ull, 203277748ull, 2147483648ull, 120254651ull, 203315114ull,
2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull,
203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull,
120485704ull, 203280620ull, 2147483648ull, 120485704ull, 203315114ull,
2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull};
class hipPerfMandelBrot {
public:
hipPerfMandelBrot();
~hipPerfMandelBrot();
void setNumKernels(unsigned int num) {
numKernels = num;
}
unsigned int getNumKernels() {
return numKernels;
}
void setNumStreams(unsigned int num) {
numStreams = num;
}
unsigned int getNumStreams() {
return numStreams;
}
void open(int deviceID);
void run(unsigned int testCase, unsigned int deviceId);
void printResults(void);
// array of funtion pointers
typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter, hipStream_t* streams, int blocks,
int threads_per_block, int kernelCnt);
// Wrappers
void float_mad(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter, hipStream_t* streams,
int blocks, int threads_per_block, int kernelCnt);
void float_mandel_unroll(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter, hipStream_t* streams,
int blocks, int threads_per_block, int kernelCnt);
void double_mad(uint *out, uint width, float xPos, float yPos, float xStep,
float yStep, uint maxIter, hipStream_t* streams, int blocks,
int threads_per_block, int kernelCnt);
void double_mandel_unroll(uint *out, uint width, float xPos, float yPos, float xStep,
float yStep, uint maxIter, hipStream_t* streams, int blocks,
int threads_per_block, int kernelCnt);
hipStream_t streams[2];
private:
void setData(void *ptr, unsigned int value);
void checkData(uint *ptr);
unsigned int numKernels;
unsigned int numStreams;
std::map<std::string, std::vector<double>> results;
unsigned int width_;
unsigned int bufSize;
unsigned int maxIter;
unsigned int coordIdx;
volatile unsigned long long totalIters = 0;
int numCUs;
static const unsigned int numLoops = 10;
};
hipPerfMandelBrot::hipPerfMandelBrot() {}
hipPerfMandelBrot::~hipPerfMandelBrot() {}
void hipPerfMandelBrot::open(int deviceId) {
int nGpu = 0;
HIPCHECK(hipGetDeviceCount(&nGpu));
if (nGpu < 1) {
std::cout << "info: didn't find any GPU! skipping the test!\n";
passed();
return;
}
HIPCHECK(hipSetDevice(deviceId));
hipDeviceProp_t props = {0};
HIPCHECK(hipGetDeviceProperties(&props, deviceId));
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
<< " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
<< std::endl;
numCUs = props.multiProcessorCount;
}
void hipPerfMandelBrot::printResults() {
int numkernels = getNumKernels();
int numStreams = getNumStreams();
std::cout << "\n" <<"Measured perf for kernels in GFLOPS on "
<< numStreams << " streams (s)" << std::endl;
std::map<std::string, std::vector<double>>:: iterator itr;
for (itr = results.begin(); itr != results.end(); itr++) {
std::cout << "\n" << std::setw(20) << itr->first << " ";
for(auto i : results[itr->first]) {
std::cout << std::setw(10) << i << " ";
}
}
results.clear();
std::cout << std::endl;
}
// Wrappers for the kernel launches
void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos, float yPos, float xStep,
float yStep, uint maxIter, hipStream_t* streams,
int blocks, int threads_per_block, int kernelCnt) {
int streamCnt = getNumStreams();
hipLaunchKernelGGL(float_mad_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep,
maxIter);
}
void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter, hipStream_t * streams,
int blocks, int threads_per_block, int kernelCnt) {
int streamCnt = getNumStreams();
hipLaunchKernelGGL(float_mandel_unroll_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
}
void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter, hipStream_t * streams,
int blocks, int threads_per_block, int kernelCnt) {
int streamCnt = getNumStreams();
hipLaunchKernelGGL(double_mad_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
}
void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter, hipStream_t * streams,
int blocks, int threads_per_block, int kernelCnt) {
int streamCnt = getNumStreams();
hipLaunchKernelGGL(float_mandel_unroll_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
}
void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
unsigned int numStreams = getNumStreams();
funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
&hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
// Maximum iteration count
maxIter = 32768;
uint * hPtr[numKernels];
uint * dPtr[numKernels];
// Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
width_ = 256;
bufSize = width_ * width_ * sizeof(uint);
// Create streams for concurrency
for (uint i = 0; i < numStreams; i++) {
HIPCHECK(hipStreamCreate(&streams[i]));
}
// Allocate memory on the host and device
for (uint i = 0; i < numKernels; i++) {
HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
setData(hPtr[i], 0xdeadbeef);
HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
}
// Prepare kernel launch parameters
int threads = (bufSize/sizeof(uint));
int threads_per_block = 64;
int blocks = (threads/threads_per_block) + (threads % threads_per_block);
float xStep = (float)(coords[coordIdx].width / (double)width_);
float yStep = (float)(-coords[coordIdx].width / (double)width_);
float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
// Copy memory asynchronously and concurrently from host to device
for (uint i = 0; i < numKernels; i++) {
HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
}
// Synchronize to make sure all the copies are completed
HIPCHECK(hipStreamSynchronize(0));
int kernelIdx;
if(testCase == 0 || testCase == 5 || testCase == 10) {
kernelIdx = 0;
}
else if(testCase == 1 || testCase == 6 || testCase == 11) {
kernelIdx = 1;
}
else if(testCase == 2 || testCase == 7 || testCase == 12) {
kernelIdx = 2;
}
else if(testCase == 3 || testCase == 8 || testCase == 13){
kernelIdx = 3;
}
double totalTime = 0.0;
for (unsigned int k = 0; k < numLoops; k++) {
coordIdx = testCase % numCoords;
if ((testCase == 0 || testCase == 1 || testCase == 2 ||
testCase == 5 || testCase == 6 || testCase == 7 ||
testCase == 10 || testCase == 11 || testCase == 12)) {
float xStep = (float)(coords[coordIdx].width / (double)width_);
float yStep = (float)(-coords[coordIdx].width / (double)width_);
float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
// Time the kernel execution
auto all_start = std::chrono::steady_clock::now();
for (uint i = 0; i < numKernels; i++) {
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
threads_per_block, i);
}
// Synchronize all the concurrent streams to have completed execution
HIPCHECK(hipStreamSynchronize(0));
auto all_end = std::chrono::steady_clock::now();
std::chrono::duration<double> all_kernel_time = all_end - all_start;
totalTime += all_kernel_time.count();
}
else {
double xStep = coords[coordIdx].width / (double)width_;
double yStep = -coords[coordIdx].width / (double)width_;
double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
// Time the kernel execution
auto all_start = std::chrono::steady_clock::now();
for (uint i = 0; i < numKernels; i++) {
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
threads_per_block, i);
}
// Synchronize all the concurrent streams to have completed execution
HIPCHECK(hipStreamSynchronize(0));
auto all_end = std::chrono::steady_clock::now();
std::chrono::duration<double> all_kernel_time = all_end - all_start;
totalTime += all_kernel_time.count();
}
}
// Copy data back from device to the host
for(uint i = 0; i < numKernels; i++) {
HIPCHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost));
}
for(uint i = 0; i < numKernels; i++) {
checkData(hPtr[i]);
int j =0;
while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
j++;
}
if(j==30) {
std::cout << "Incorrect iteration count detected. ";
}
}
// Compute GFLOPS. There are 7 FLOPs per iteration
double perf = ((double)(totalIters*numKernels) * 7 * (double)(1e-09)) /
(totalTime / (double)numLoops);
std::vector<std::string> kernelName = {"float", "float_unroll",
"double", "double_unroll"};
// Print results except for Warm-up kernel
if(testCase!=100) {
results[kernelName[testCase % 4]].push_back(perf);
}
for(uint i = 0 ; i < numStreams; i++) {
HIPCHECK(hipStreamDestroy(streams[i]));
}
// Free host and device memory
for (uint i = 0; i < numKernels; i++) {
HIPCHECK(hipFree(hPtr[i]));
HIPCHECK(hipFree(dPtr[i]));
}
}
void hipPerfMandelBrot::setData(void *ptr, unsigned int value) {
unsigned int *ptr2 = (unsigned int *)ptr;
for (unsigned int i = 0; i < width_ * width_; i++) {
ptr2[i] = value;
}
}
void hipPerfMandelBrot::checkData(uint *ptr) {
totalIters = 0;
for (unsigned int i = 0; i < width_ * width_; i++) {
totalIters += ptr[i];
}
}
int main(int argc, char* argv[]) {
hipPerfMandelBrot mandelbrotCompute;
int deviceId = 0;
mandelbrotCompute.open(deviceId);
for (unsigned int testCase = 0; testCase < 3; testCase++) {
switch (testCase) {
case 0: {
// Warmup-kernel - default stream executes serially
mandelbrotCompute.setNumStreams(1);
mandelbrotCompute.setNumKernels(1);
mandelbrotCompute.run(100/*Random number*/, deviceId);
break;
}
case 1: {
// run all - sync
int i = 0;
do {
mandelbrotCompute.setNumStreams(1);
mandelbrotCompute.setNumKernels(1);
mandelbrotCompute.run(i, deviceId);
i++;
}while(i < 12);
mandelbrotCompute.printResults();
break;
}
case 2: {
// run all - async
int i = 0;
do {
mandelbrotCompute.setNumStreams(2);
mandelbrotCompute.setNumKernels(2);
mandelbrotCompute.run(i, deviceId);
i++;
}while(i < 12);
mandelbrotCompute.printResults();
break;
}
default: {
break;
}
}
}
passed();
}
+289
Просмотреть файл
@@ -0,0 +1,289 @@
/*
Copyright (c) 2015-2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
* TEST: %t
* HIT_END
*/
#include <iostream>
#include <chrono>
#include "test_common.h"
typedef struct {
double x;
double y;
double width;
} coordRec;
static coordRec coords[] = {
{0.0, 0.0, 0.00001}, // All black
};
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos, float xStep,
float yStep, uint maxIter) {
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
float x0 = (float)(xPos + xStep*i);
float y0 = (float)(yPos + yStep*j);
float x = x0;
float y = y0;
uint iter = 0;
float tmp;
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
tmp = x;
x = fma(-y,y,fma(x,x,x0));
y = fma(2.0f*tmp,y,y0);
}
out[tid] = iter;
};
class hipPerfDeviceConcurrency {
public:
hipPerfDeviceConcurrency();
~hipPerfDeviceConcurrency();
void setNumGpus(unsigned int num) {
numDevices = num;
}
unsigned int getNumGpus() {
return numDevices;
}
void open(void);
void close(void);
void run(unsigned int testCase, int numGpus);
private:
void setData(void *ptr, unsigned int value);
void checkData(uint *ptr);
unsigned int numDevices;
unsigned int width_;
unsigned int bufSize;
unsigned int coordIdx;
unsigned long long totalIters = 0;
};
hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() {}
hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() {}
void hipPerfDeviceConcurrency::open(void) {
int nGpu = 0;
HIPCHECK(hipGetDeviceCount(&nGpu));
setNumGpus(nGpu);
if (nGpu < 1) {
std::cout << "info: didn't find any GPU! skipping the test!\n";
passed();
}
}
void hipPerfDeviceConcurrency::close() {
}
void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
static int deviceId;
uint * hPtr[numGpus];
uint * dPtr[numGpus];
hipStream_t streams[numGpus];
int numCUs[numGpus];
unsigned int maxIter[numGpus];
unsigned long long expectedIters[numGpus];
int threads, threads_per_block, blocks;
float xStep, yStep, xPos, yPos;
for(int i = 0; i < numGpus; i++) {
if(testCase != 0) {
deviceId = i;
}
HIPCHECK(hipSetDevice(deviceId));
hipDeviceProp_t props = {0};
HIPCHECK(hipGetDeviceProperties(&props, i));
if (testCase != 0) {
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
<< " with " << props.multiProcessorCount << " CUs" << " and device ID: "
<< i << std::endl;
}
numCUs[i] = props.multiProcessorCount;
int clkFrequency = 0;
HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i));
clkFrequency =(unsigned int)clkFrequency/1000;
// Maximum iteration count
// maxIter = 8388608 * (engine_clock / 1000).serial execution
maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
maxIter[i] = (maxIter[i] + 15) & ~15;
// Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
width_ = 256;
bufSize = width_ * width_ * sizeof(uint);
// Create streams for concurrency
HIPCHECK(hipStreamCreate(&streams[i]));
// Allocate memory on the host and device
HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
setData(hPtr[i], 0xdeadbeef);
HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
// Prepare kernel launch parameters
threads = (bufSize/sizeof(uint));
threads_per_block = 64;
blocks = (threads/threads_per_block) + (threads % threads_per_block);
coordIdx = testCase % numCoords;
xStep = (float)(coords[coordIdx].width / (double)width_);
yStep = (float)(-coords[coordIdx].width / (double)width_);
xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
// Copy memory from host to device
HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
}
// Time the kernel execution
auto all_start = std::chrono::steady_clock::now();
for(int i = 0; i < numGpus; i++) {
if(testCase != 0) {
deviceId = i;
}
HIPCHECK(hipSetDevice(deviceId));
hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i],
dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]);
}
for(int i = 0; i < numGpus; i++) {
HIPCHECK(hipStreamSynchronize(0));
}
auto all_end = std::chrono::steady_clock::now();
std::chrono::duration<double> all_kernel_time = all_end - all_start;
for(int i = 0; i < numGpus; i++) {
if(testCase != 0) {
deviceId = i;
}
HIPCHECK(hipSetDevice(deviceId));
// Copy data back from device to the host
HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
checkData(hPtr[i]);
expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
if (testCase != 0) {
checkData(hPtr[i]);
if(totalIters != expectedIters[i]) {
std::cout << "Incorrect iteration count detected" << std::endl;
}
}
HIPCHECK(hipStreamDestroy(streams[i]));
// Free host and device memory
HIPCHECK(hipFree(hPtr[i]));
HIPCHECK(hipFree(dPtr[i]));
}
if (testCase != 0) {
std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): "
<< all_kernel_time.count() << " (s) " << '\n' << std::endl;
}
if(testCase == 0) {
deviceId++;
}
}
void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
unsigned int *ptr2 = (unsigned int *)ptr;
for (unsigned int i = 0; i < width_ * width_ ; i++) {
ptr2[i] = value;
}
}
void hipPerfDeviceConcurrency::checkData(uint *ptr) {
totalIters = 0;
for (unsigned int i = 0; i < width_ * width_; i++) {
totalIters += ptr[i];
}
}
int main(int argc, char* argv[]) {
hipPerfDeviceConcurrency deviceConcurrency;
deviceConcurrency.open();
int nGpu = deviceConcurrency.getNumGpus();
// testCase = 0 refers to warmup kernel run
int testCase = 0;
for (int i = 0; i < nGpu; i++) {
// Warm-up kernel on all devices
deviceConcurrency.run(testCase, 1);
}
// Time for kernel on 1 device
deviceConcurrency.run(++testCase, 1);
// Time for kernel on all available devices
deviceConcurrency.run(++testCase, nGpu);
passed();
}
+20
Просмотреть файл
@@ -0,0 +1,20 @@
project(bit_extract)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
# Create the excutable
add_executable(bit_extract bit_extract.cpp)
# Link with HIP
target_link_libraries(bit_extract hip::host)
+3 -7
Просмотреть файл
@@ -9,19 +9,15 @@ HIPCC=$(HIP_PATH)/bin/hipcc
# Show how to use PLATFORM to specify different options for each compiler:
ifeq (${HIP_PLATFORM}, nvcc)
HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20
HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20
endif
EXE=bit_extract
EXE_STATIC=bit_extract_static
$(EXE): bit_extract.cpp
$(HIPCC) $(HIPCC_FLAGS) $< -o $@
$(EXE_STATIC): bit_extract.cpp
$(HIPCC) -use-staticlib $(HIPCC_FLAGS) $< -o $@
all: $(EXE) $(EXE_STATIC)
all: $(EXE)
clean:
rm -f *.o $(EXE) $(EXE_STATIC)
rm -f *.o $(EXE)
+36
Просмотреть файл
@@ -0,0 +1,36 @@
project(module_api)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
# Create the excutable
add_executable(runKernel.hip.out runKernel.cpp)
add_executable(launchKernelHcc.hip.out launchKernelHcc.cpp)
add_executable(defaultDriver.hip.out defaultDriver.cpp)
# Generate code object
add_custom_target(
codeobj
ALL
COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../vcpy_kernel.cpp -o vcpy_kernel.code
COMMENT "codeobj generated"
)
add_dependencies(runKernel.hip.out codeobj)
add_dependencies(launchKernelHcc.hip.out codeobj)
add_dependencies(defaultDriver.hip.out codeobj)
# Link with HIP
target_link_libraries(runKernel.hip.out hip::host)
target_link_libraries(launchKernelHcc.hip.out hip::host)
target_link_libraries(defaultDriver.hip.out hip::host)
+30
Просмотреть файл
@@ -0,0 +1,30 @@
project(modile_api_global)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
# Create the excutable
add_executable(runKernel.hip.out runKernel.cpp)
# Generate code object
add_custom_target(
codeobj
ALL
COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../vcpy_kernel.cpp -o vcpy_kernel.code
COMMENT "codeobj generated"
)
add_dependencies(runKernel.hip.out codeobj)
# Link with HIP
target_link_libraries(runKernel.hip.out hip::host)
+21
Просмотреть файл
@@ -0,0 +1,21 @@
#Follow "README.md" to generate square.cpp if it's missing
project(square)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
# Create the excutable
add_executable(square square.cpp)
# Link with HIP
target_link_libraries(square hip::host)
+2 -5
Просмотреть файл
@@ -11,7 +11,7 @@ else
SOURCES=square.cpp
endif
all: square.out square.out.static
all: square.out
# Step
square.cpp: square.cu
@@ -20,8 +20,5 @@ square.cpp: square.cu
square.out: $(SOURCES)
$(HIPCC) $(CXXFLAGS) $(SOURCES) -o $@
square.out.static: $(SOURCES)
$(HIPCC) -use-staticlib $(CXXFLAGS) $(SOURCES) -o $@
clean:
rm -f *.o *.out *.out.static square.cpp
rm -f *.o *.out square.cpp
+34 -8
Просмотреть файл
@@ -1,13 +1,39 @@
# Square.md
Simple test which shows how to use hipify-perl to port CUDA code to HIP.
See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example.
Simple test which shows how to use hipify-perl to port CUDA code to HIP.
See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example.
Now it is even simpler and requires no manual modification to the hipified source code - just hipify and compile:
1. Add hip/bin path to the PATH :
<code>export PATH=$PATH:[MYHIP]/bin</code>
- Add hip/bin path to the PATH
2. <code>$ make </code>
Make runs these steps. This can be performed on either CUDA or AMD platform:
<code>hipify-perl square.cu > square.cpp </code> # convert cuda code to hip code
<code>hipcc square.cpp</code> # compile into executable
```
$ export PATH=$PATH:[MYHIP]/bin
```
- Define environment variable
```
$ export HIP_PATH=[MYHIP]
```
- Build executible file
```
$ cd ~/hip/samples/0_Intro/square
$ make
/home/user/hip/bin/hipify-perl square.cu > square.cpp
/home/user/hip/bin/hipcc square.cpp -o square.out
/home/user/hip/bin/hipcc -use-staticlib square.cpp -o square.out.static
```
- Execute file
```
$ ./square.out
info: running on device Navi 14 [Radeon Pro W5500]
info: allocate host mem ( 7.63 MB)
info: allocate device mem ( 7.63 MB)
info: copy Host2Device
info: launch 'vector_square' kernel
info: copy Device2Host
info: check result
PASSED!
```
+20
Просмотреть файл
@@ -0,0 +1,20 @@
project(hipBusBandwidth)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_BUILD_TYPE Release)
# Create the excutable
add_executable(hipBusBandwidth hipBusBandwidth.cpp ResultDatabase.cpp)
# Link with HIP
target_link_libraries(hipBusBandwidth hip::host)
+217 -229
Просмотреть файл
@@ -12,7 +12,7 @@ enum MallocMode { MallocPinned, MallocUnpinned, MallocRegistered };
bool p_verbose = false;
MallocMode p_malloc_mode = MallocPinned;
int p_numa_ctl = -1;
int p_iterations = 10;
int p_iterations = 0;
int p_beatsperiteration = 1;
int p_device = 0;
int p_detailed = 0;
@@ -89,7 +89,9 @@ hipError_t memcopy(void* dst, const void* src, size_t sizeBytes, enum hipMemcpyK
int sizes[] = {-64, -256, -512, 1, 2, 4, 8, 16, 32, 64, 128, 256,
512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288};
int nSizes = sizeof(sizes) / sizeof(int);
// iterations to be run for the corresponding sizes, less number as the size increases
int iterations[] = {1000, 1000, 1000, 1000, 500, 500, 500, 500, 500, 200, 200, 200,
200, 200, 100, 100, 100, 100, 50, 50, 50, 20, 20};
// ****************************************************************************
// Function: RunBenchmark_H2D
@@ -174,53 +176,48 @@ void RunBenchmark_H2D(ResultDatabase& resultDB) {
hipEventCreate(&stop);
CHECK_HIP_ERROR();
// Three passes, forward and backward both
for (int pass = 0; pass < p_iterations; pass++) {
// store the times temporarily to estimate latency
// float times[nSizes];
// Step through sizes forward on even passes and backward on odd
for (int i = 0; i < nSizes; i++) {
int sizeIndex;
if ((pass % 2) == 0)
sizeIndex = i;
else
sizeIndex = (nSizes - 1) - i;
// store the times temporarily to estimate latency
// float times[nSizes];
for (int i = 0; i < nSizes; i++) {
int sizeIndex, iterIndex;
sizeIndex = i;
iterIndex = i;
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
for (int pass = 0; pass < niter; pass++) {
hipEventRecord(start, 0);
for (int j = 0; j < p_beatsperiteration; j++) {
memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice);
}
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
// times[sizeIndex] = t;
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed =
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
char sizeStr[256];
if (p_beatsperiteration > 1) {
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr,
"ms", t);
if (p_onesize) {
break;
}
hipEventRecord(start, 0);
for (int j = 0; j < p_beatsperiteration; j++) {
memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice);
}
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
// times[sizeIndex] = t;
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed =
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
char sizeStr[256];
if (p_beatsperiteration > 1) {
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr, "ms", t);
}
if (p_onesize) {
break;
}
}
if (p_onesize) {
@@ -347,53 +344,50 @@ void RunBenchmark_D2H(ResultDatabase& resultDB) {
hipEventCreate(&stop);
CHECK_HIP_ERROR();
// Three passes, forward and backward both
for (int pass = 0; pass < p_iterations; pass++) {
// store the times temporarily to estimate latency
// float times[nSizes];
// Step through sizes forward on even passes and backward on odd
for (int i = 0; i < nSizes; i++) {
int sizeIndex;
if ((pass % 2) == 0)
sizeIndex = i;
else
sizeIndex = (nSizes - 1) - i;
// store the times temporarily to estimate latency
// float times[nSizes];
for (int i = 0; i < nSizes; i++) {
int sizeIndex, iterIndex;
sizeIndex = i;
iterIndex = i;
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
for (int pass = 0; pass < niter; pass++) {
hipEventRecord(start, 0);
for (int j = 0; j < p_beatsperiteration; j++) {
memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost);
}
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
// times[sizeIndex] = t;
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed =
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
char sizeStr[256];
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
if (p_beatsperiteration > 1) {
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode),
sizeStr, "ms", t);
if (p_onesize) {
break;
}
hipEventRecord(start, 0);
for (int j = 0; j < p_beatsperiteration; j++) {
memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost);
}
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
// times[sizeIndex] = t;
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed =
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
char sizeStr[256];
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
if (p_beatsperiteration > 1) {
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration);
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
resultDB.AddResult(std::string("D2H_Bandwidth") + "_" + mallocModeString(p_malloc_mode),
sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("D2H_Time") + "_" + mallocModeString(p_malloc_mode),
sizeStr, "ms", t);
}
if (p_onesize) {
break;
}
}
if (p_onesize) {
@@ -522,43 +516,43 @@ void RunBenchmark_Bidir(ResultDatabase& resultDB) {
hipStreamCreate(&stream[0]);
hipStreamCreate(&stream[1]);
// Three passes, forward and backward both
for (int pass = 0; pass < p_iterations; pass++) {
// store the times temporarily to estimate latency
// float times[nSizes];
// Step through sizes forward on even passes and backward on odd
for (int i = 0; i < nSizes; i++) {
int sizeIndex;
if ((pass % 2) == 0)
sizeIndex = i;
else
sizeIndex = (nSizes - 1) - i;
// store the times temporarily to estimate latency
// float times[nSizes];
for (int i = 0; i < nSizes; i++) {
int sizeIndex, iterIndex;
sizeIndex = i;
iterIndex = i;
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
for (int pass = 0; pass < niter; pass++) {
hipEventRecord(start, 0);
hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]);
hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]);
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
hipEventRecord(start, 0);
hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]);
hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]);
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t;
char sizeStr[256];
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
resultDB.AddResult(
std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr,
"GB/sec", speed);
resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode),
sizeStr, "ms", t);
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed = (double(sizeToBytes(2 * thisSize)) / (1000 * 1000)) / t;
char sizeStr[256];
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
resultDB.AddResult(
std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr,
"GB/sec", speed);
resultDB.AddResult(std::string("Bidir_Time") + "_" + mallocModeString(p_malloc_mode),
sizeStr, "ms", t);
}
if (p_onesize) {
break;
}
}
// Cleanup
@@ -708,66 +702,63 @@ void RunBenchmark_P2P_Unidir(ResultDatabase& resultDB) {
hipEventCreate(&stop);
CHECK_HIP_ERROR();
// Three passes, forward and backward both
for (int pass = 0; pass < p_iterations; pass++) {
// store the times temporarily to estimate latency
// float times[nSizes];
// Step through sizes forward on even passes and backward on odd
for (int i = 0; i < nSizes; i++) {
int sizeIndex;
if ((pass % 2) == 0)
sizeIndex = i;
else
sizeIndex = (nSizes - 1) - i;
// store the times temporarily to estimate latency
// float times[nSizes];
for (int i = 0; i < nSizes; i++) {
int sizeIndex, iterIndex;
sizeIndex = i;
iterIndex = i;
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
for (int pass = 0; pass < niter; pass++) {
hipDeviceSynchronize();
hipDeviceSynchronize();
hipEventRecord(start, 0);
hipEventRecord(start, 0);
for (int j = 0; j < p_beatsperiteration; j++) {
hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
}
for (int j = 0; j < p_beatsperiteration; j++) {
hipMemcpy(peerGpuMem, currentGpuMem, nbytes, hipMemcpyDeviceToDevice);
}
hipEventRecord(stop, 0);
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
// times[sizeIndex] = t;
float t = 0;
hipEventElapsedTime(&t, start, stop);
// times[sizeIndex] = t;
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed =
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
char sizeStr[256];
if (p_beatsperiteration > 1) {
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
p_beatsperiteration);
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
double speed =
(double(double(sizeToBytes(thisSize)/1000) * p_beatsperiteration) / 1000) / t;
char sizeStr[256];
if (p_beatsperiteration > 1) {
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
p_beatsperiteration);
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
string cGpu, pGpu;
cGpu = gpuIDToString(currentGpu);
pGpu = gpuIDToString(peerGpu);
string cGpu, pGpu;
cGpu = gpuIDToString(currentGpu);
pGpu = gpuIDToString(peerGpu);
resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) +
"_gpu" + std::string(pGpu),
resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu) +
"_gpu" + std::string(pGpu),
sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) +
"_gpu" + std::string(pGpu),
resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu) +
"_gpu" + std::string(pGpu),
sizeStr, "ms", t);
if (p_onesize) {
break;
}
}
if (p_onesize) {
break;
}
}
@@ -829,71 +820,68 @@ void RunBenchmark_P2P_Bidir(ResultDatabase& resultDB) {
hipStreamCreate(&stream[0]);
hipStreamCreate(&stream[1]);
// Three passes, forward and backward both
for (int pass = 0; pass < p_iterations; pass++) {
// store the times temporarily to estimate latency
// float times[nSizes];
// Step through sizes forward on even passes and backward on odd
for (int i = 0; i < nSizes; i++) {
int sizeIndex;
if ((pass % 2) == 0)
sizeIndex = i;
else
sizeIndex = (nSizes - 1) - i;
// store the times temporarily to estimate latency
// float times[nSizes];
for (int i = 0; i < nSizes; i++) {
int sizeIndex, iterIndex;
sizeIndex = i;
iterIndex = i;
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex];
const int nbytes = sizeToBytes(thisSize);
const int niter = p_iterations ? p_iterations : iterations[iterIndex];
for (int pass = 0; pass < niter; pass++) {
hipDeviceSynchronize();
hipDeviceSynchronize();
hipEventRecord(start, 0);
hipEventRecord(start, 0);
for (int j = 0; j < p_beatsperiteration; j++) {
hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes,
hipMemcpyDeviceToDevice, stream[0]);
hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes,
hipMemcpyDeviceToDevice, stream[1]);
}
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
// times[sizeIndex] = t;
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed =
(double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) /
t;
char sizeStr[256];
if (p_beatsperiteration > 1) {
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
p_beatsperiteration);
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
string cGpu, pGpu;
cGpu = gpuIDToString(currentGpu);
pGpu = gpuIDToString(peerGpu);
resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
std::string(pGpu),
sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
std::string(pGpu),
sizeStr, "ms", t);
if (p_onesize) {
break;
}
for (int j = 0; j < p_beatsperiteration; j++) {
hipMemcpyAsync(peerGpuMem[0], currentGpuMem[0], nbytes,
hipMemcpyDeviceToDevice, stream[0]);
hipMemcpyAsync(currentGpuMem[1], peerGpuMem[1], nbytes,
hipMemcpyDeviceToDevice, stream[1]);
}
hipEventRecord(stop, 0);
hipEventSynchronize(stop);
float t = 0;
hipEventElapsedTime(&t, start, stop);
// times[sizeIndex] = t;
// Convert to GB/sec
if (p_verbose) {
std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n";
}
double speed =
(double(double(sizeToBytes(2 * thisSize)/1000) * p_beatsperiteration) / 1000) /
t;
char sizeStr[256];
if (p_beatsperiteration > 1) {
sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(),
p_beatsperiteration);
} else {
sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str());
}
string cGpu, pGpu;
cGpu = gpuIDToString(currentGpu);
pGpu = gpuIDToString(peerGpu);
resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
std::string(pGpu),
sizeStr, "GB/sec", speed);
resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu) + "_gpu" +
std::string(pGpu),
sizeStr, "ms", t);
}
if (p_onesize) {
break;
}
}
if (p_onesize) {
+31
Просмотреть файл
@@ -0,0 +1,31 @@
project(hipCommander)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_BUILD_TYPE Release)
# Create the excutable
add_executable(hipCommander hipCommander.cpp)
# Generate code object
add_custom_target(
codeobj
ALL
COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../nullkernel.hip.cpp -o nullkernel.hsaco
COMMENT "codeobj generated"
)
add_dependencies(hipCommander codeobj)
# Link with HIP
target_link_libraries(hipCommander hip::host)
set_property(TARGET hipCommander PROPERTY CXX_STANDARD 11)
+35
Просмотреть файл
@@ -0,0 +1,35 @@
project(hipDispatchLatency)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_BUILD_TYPE Release)
# Create the excutable
add_executable(hipDispatchLatency hipDispatchLatency.cpp)
add_executable(hipDispatchEnqueueRateMT hipDispatchEnqueueRateMT.cpp)
# Generate code object
add_custom_target(
codeobj
ALL
COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../test_kernel.cpp -o test_kernel.code
COMMENT "codeobj generated"
)
add_dependencies(hipDispatchLatency codeobj)
add_dependencies(hipDispatchEnqueueRateMT codeobj)
# Link with HIP
target_link_libraries(hipDispatchLatency hip::host)
target_link_libraries(hipDispatchEnqueueRateMT hip::host)
set_property(TARGET hipDispatchLatency PROPERTY CXX_STANDARD 11)
set_property(TARGET hipDispatchEnqueueRateMT PROPERTY CXX_STANDARD 11)
+20
Просмотреть файл
@@ -0,0 +1,20 @@
project(hipInfo)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_BUILD_TYPE Release)
# Create the excutable
add_executable(hipInfo hipInfo.cpp)
# Link with HIP
target_link_libraries(hipInfo hip::host)
+20
Просмотреть файл
@@ -0,0 +1,20 @@
project(MatrixTranspose)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_BUILD_TYPE Release)
# Create the excutable
add_executable(MatrixTranspose MatrixTranspose.cpp)
# Link with HIP
target_link_libraries(MatrixTranspose hip::host)
+20
Просмотреть файл
@@ -0,0 +1,20 @@
project(inline_asm)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_BUILD_TYPE Release)
# Create the excutable
add_executable(inline_asm inline_asm.cpp)
# Link with HIP
target_link_libraries(inline_asm hip::host)
+30
Просмотреть файл
@@ -0,0 +1,30 @@
project(texture2dDrv)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_BUILD_TYPE Release)
# Create the excutable
add_executable(texture2dDrv texture2dDrv.cpp)
# Generate code object
add_custom_target(
codeobj
ALL
COMMAND ${HIP_HIPCC_EXECUTABLE} --genco ../tex2dKernel.cpp -o tex2dKernel.code
COMMENT "codeobj generated"
)
add_dependencies(texture2dDrv codeobj)
# Link with HIP
target_link_libraries(texture2dDrv hip::host)
+20
Просмотреть файл
@@ -0,0 +1,20 @@
project(occupancy)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_BUILD_TYPE Release)
# Create the excutable
add_executable(occupancy occupancy.cpp)
# Link with HIP
target_link_libraries(occupancy hip::host)
+20
Просмотреть файл
@@ -0,0 +1,20 @@
project(hipEvent)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_BUILD_TYPE Release)
# Create the excutable
add_executable(hipEvent hipEvent.cpp)
# Link with HIP
target_link_libraries(hipEvent hip::host)
+20
Просмотреть файл
@@ -0,0 +1,20 @@
project(sharedMemory)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_BUILD_TYPE Release)
# Create the excutable
add_executable(sharedMemory sharedMemory.cpp)
# Link with HIP
target_link_libraries(sharedMemory hip::host)
+20
Просмотреть файл
@@ -0,0 +1,20 @@
project(shfl)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_BUILD_TYPE Release)
# Create the excutable
add_executable(shfl shfl.cpp)
# Link with HIP
target_link_libraries(shfl hip::host)
+19
Просмотреть файл
@@ -0,0 +1,19 @@
project(2dshfl)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
# Create the excutable
add_executable(2dshfl 2dshfl.cpp)
# Link with HIP
target_link_libraries(2dshfl hip::host)
+19
Просмотреть файл
@@ -0,0 +1,19 @@
project(dynamic_shared)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
# Create the excutable
add_executable(dynamic_shared dynamic_shared.cpp)
# Link with HIP
target_link_libraries(dynamic_shared hip::host)
+19
Просмотреть файл
@@ -0,0 +1,19 @@
project(stream)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
# Create the excutable
add_executable(stream stream.cpp)
# Link with HIP
target_link_libraries(stream hip::host)
+19
Просмотреть файл
@@ -0,0 +1,19 @@
project(peer2peer)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
# Create the excutable
add_executable(peer2peer peer2peer.cpp)
# Link with HIP
target_link_libraries(peer2peer hip::host)
+19
Просмотреть файл
@@ -0,0 +1,19 @@
project(unroll)
cmake_minimum_required(VERSION 3.10)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
# Find hip
find_package(hip)
# Set compiler and linker
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
# Create the excutable
add_executable(unroll unroll.cpp)
# Link with HIP
target_link_libraries(unroll hip::host)
+27
Просмотреть файл
@@ -0,0 +1,27 @@
Build procedure
We provide Makefile and CMakeLists.txt to build the samples seperately.
1.Makefile supports shared lib of hip-rocclr runtime and nvcc.
To build a sample, just type in sample folder,
make
2.CMakeLists.txt can support shared and static libs of hip-rocclr runtime.
To build a sample, type in sample folder,
mkdir build (if build folder is missing)
cd build
cmake ..
make
If you want debug version, follow,
cmake -DCMAKE_BUILD_TYPE=Debug ..