Modify vcopy to enable multi iteration and multi kernel for CI
Signed-off-by: colramos-amd <colramos@amd.com>
[ROCm/rocprofiler-compute commit: 7cfaf27ece]
Šī revīzija ir iekļauta:
@@ -5,157 +5,187 @@
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
|
||||
|
||||
#define HIP_ASSERT(x) (assert((x)==hipSuccess))
|
||||
|
||||
// HIP kernel. Each thread takes care of one element of c
|
||||
__global__ void vecCopy(double *a, double *b, double *c, int n,int stride)
|
||||
{
|
||||
__global__ void vecCopy(double *a, double *b, double *c, int n, int stride) {
|
||||
// Get our global thread ID
|
||||
int id = blockIdx.x*blockDim.x+threadIdx.x;
|
||||
|
||||
if (id < n) {
|
||||
c[id] = a[id];
|
||||
}
|
||||
if (id < n)
|
||||
c[id] = a[id];
|
||||
}
|
||||
|
||||
void usage()
|
||||
{
|
||||
printf("\nUsage: vcopy [n] [blocksize] {dev}\n\n");
|
||||
// Duplicate of vecCopy kernel. Included for testing purposes
|
||||
__global__ void vecCopy_2(double *a, double *b, double *c, int n, int stride) {
|
||||
// Get our global thread ID
|
||||
int id = blockIdx.x*blockDim.x+threadIdx.x;
|
||||
if (id < n)
|
||||
c[id] = a[id];
|
||||
}
|
||||
|
||||
void usage() {
|
||||
std::cout << "Usage: vcopy [OPTIONS]\n";
|
||||
std::cout << "Required:\n";
|
||||
std::cout << " -n/--numThreads <value> Set the num of threads\n";
|
||||
std::cout << " -b/--blockSize <value> Set the block size\n";
|
||||
std::cout << "Optional:\n";
|
||||
std::cout << " -d/--dev <value> Set the device ID [Default: 0]\n";
|
||||
std::cout << " -i/--iter <value> Set the num of iterations [Default: 1]\n";
|
||||
std::cout << " -h/--help Display this help message\n";
|
||||
exit(1);
|
||||
return;
|
||||
}
|
||||
|
||||
int main( int argc, char* argv[] )
|
||||
{
|
||||
// Size of vectors
|
||||
int n; //64 MB
|
||||
int blockSize, gridSize;
|
||||
|
||||
// Host input vectors
|
||||
double *h_a;
|
||||
double *h_b;
|
||||
//Host output vector
|
||||
double *h_c;
|
||||
//Host output vector for verification
|
||||
double *h_verify_c;
|
||||
|
||||
// Device input vectors
|
||||
double *d_a;
|
||||
double *d_b;
|
||||
//Device output vector
|
||||
double *d_c;
|
||||
int main(int argc, char* argv[]) {
|
||||
// Size of vectors
|
||||
int n; //64 MB
|
||||
int blockSize, gridSize;
|
||||
|
||||
int stride = 1;
|
||||
int devId = 0;
|
||||
// Launch multiple kernels
|
||||
bool multiKernel = false;
|
||||
|
||||
if(argc < 3)
|
||||
usage();
|
||||
if(argc > 3)
|
||||
devId = atoi(argv[3]);
|
||||
// Host input vectors
|
||||
double *h_a;
|
||||
double *h_b;
|
||||
//Host output vector
|
||||
double *h_c;
|
||||
//Host output vector for verification
|
||||
double *h_verify_c;
|
||||
|
||||
n = atoi(argv[1]);
|
||||
blockSize = atoi(argv[2]);
|
||||
// Device input vectors
|
||||
double *d_a;
|
||||
double *d_b;
|
||||
// Device output vector
|
||||
double *d_c;
|
||||
|
||||
int numGpuDevices;
|
||||
HIP_ASSERT(hipGetDeviceCount(&numGpuDevices));
|
||||
if(devId >= numGpuDevices)
|
||||
devId = 0;
|
||||
HIP_ASSERT(hipSetDevice(devId));
|
||||
int stride = 1;
|
||||
int devId = 0;
|
||||
int numIter = 1;
|
||||
|
||||
printf("vcopy testing on GCD %d\n", devId);
|
||||
|
||||
assert(n > 0);
|
||||
assert(blockSize > 0);
|
||||
|
||||
// Size, in bytes, of each vector
|
||||
size_t bytes = n*sizeof(double)*stride;
|
||||
|
||||
// Allocate memory for each vector on host
|
||||
h_a = (double*)malloc(bytes);
|
||||
h_b = (double*)malloc(bytes);
|
||||
h_c = (double*)malloc(bytes);
|
||||
h_verify_c = (double*)malloc(bytes);
|
||||
|
||||
printf("Finished allocating vectors on the CPU\n");
|
||||
// Allocate memory for each vector on GPU
|
||||
HIP_ASSERT(hipMalloc(&d_a, bytes));
|
||||
HIP_ASSERT(hipMalloc(&d_b, bytes));
|
||||
HIP_ASSERT(hipMalloc(&d_c, bytes));
|
||||
|
||||
printf("Finished allocating vectors on the GPU\n");
|
||||
|
||||
int i;
|
||||
// Initialize vectors on host
|
||||
for( i = 0; i < n; i++ ) {
|
||||
h_a[i] = i;
|
||||
h_b[i] = i;
|
||||
}
|
||||
|
||||
|
||||
// Copy host vectors to device
|
||||
HIP_ASSERT(hipMemcpy( d_a, h_a, bytes, hipMemcpyHostToDevice));
|
||||
HIP_ASSERT(hipMemcpy(d_b, h_b, bytes, hipMemcpyHostToDevice));
|
||||
|
||||
printf("Finished copying vectors to the GPU\n");
|
||||
|
||||
|
||||
// Number of thread blocks in grid
|
||||
gridSize = (int)ceil((float)n/blockSize);
|
||||
//gridSize = 1;
|
||||
for (int i = 0; i < argc; i++){
|
||||
std::string arg = argv[i];
|
||||
if ((arg == "--blockSize" || arg == "-b") && i+1 < argc)
|
||||
blockSize = std::atoi(argv[i+1]);
|
||||
|
||||
int tot_waves = (blockSize*gridSize)/64;
|
||||
float num_bytes_kb = ((sizeof(double))*n)/(1024);
|
||||
float num_bytes_wave = (1.0*num_bytes_kb)/(1.0*tot_waves);
|
||||
else if ((arg == "--vec" || arg == "-n") && i+1 < argc)
|
||||
n = std::atoi(argv[i+1]);
|
||||
|
||||
else if ((arg == "--device" || arg == "-d") && i+1 < argc)
|
||||
devId = std::atoi(argv[i+1]);
|
||||
|
||||
else if ((arg == "--iter" || arg == "-i") && i+1 < argc)
|
||||
numIter = std::atoi(argv[i+1]);
|
||||
|
||||
else if (arg == "--multikernel")
|
||||
multiKernel = true;
|
||||
|
||||
printf("sw thinks it moved %f KB per wave \n", (2.0*num_bytes_wave));
|
||||
|
||||
printf("Total threads: %d, Grid Size: %d block Size:%d, Wavefronts:%d:\n", n, gridSize, blockSize, tot_waves);
|
||||
printf("Launching the kernel on the GPU\n");
|
||||
// Execute the kernel
|
||||
hipLaunchKernelGGL(vecCopy, dim3(gridSize), dim3(blockSize), 0, 0, d_a, d_b, d_c, n,stride);
|
||||
hipDeviceSynchronize( );
|
||||
else if (arg == "--help" || arg == "-h")
|
||||
usage();
|
||||
}
|
||||
|
||||
if (blockSize == 0)
|
||||
usage();
|
||||
|
||||
if (n == 0)
|
||||
usage();
|
||||
|
||||
|
||||
int numGpuDevices;
|
||||
HIP_ASSERT(hipGetDeviceCount(&numGpuDevices));
|
||||
if(devId >= numGpuDevices)
|
||||
devId = 0;
|
||||
HIP_ASSERT(hipSetDevice(devId));
|
||||
|
||||
printf("vcopy testing on GCD %d\n", devId);
|
||||
|
||||
assert(n > 0);
|
||||
assert(blockSize > 0);
|
||||
|
||||
// Size, in bytes, of each vector
|
||||
size_t bytes = n*sizeof(double)*stride;
|
||||
|
||||
// Allocate memory for each vector on host
|
||||
h_a = (double*)malloc(bytes);
|
||||
h_b = (double*)malloc(bytes);
|
||||
h_c = (double*)malloc(bytes);
|
||||
h_verify_c = (double*)malloc(bytes);
|
||||
|
||||
printf("Finished allocating vectors on the CPU\n");
|
||||
|
||||
// Allocate memory for each vector on GPU
|
||||
HIP_ASSERT(hipMalloc(&d_a, bytes));
|
||||
HIP_ASSERT(hipMalloc(&d_b, bytes));
|
||||
HIP_ASSERT(hipMalloc(&d_c, bytes));
|
||||
|
||||
printf("Finished allocating vectors on the GPU\n");
|
||||
|
||||
// Initialize vectors on host
|
||||
for(int i = 0; i < n; i++) {
|
||||
h_a[i] = i;
|
||||
h_b[i] = i;
|
||||
}
|
||||
|
||||
// Copy host vectors to device
|
||||
HIP_ASSERT(hipMemcpy(d_a, h_a, bytes, hipMemcpyHostToDevice));
|
||||
HIP_ASSERT(hipMemcpy(d_b, h_b, bytes, hipMemcpyHostToDevice));
|
||||
|
||||
printf("Finished copying vectors to the GPU\n");
|
||||
|
||||
// Number of thread blocks in grid
|
||||
gridSize = (int)ceil((float)n/blockSize);
|
||||
int tot_waves = (blockSize*gridSize)/64;
|
||||
float num_bytes_kb = ((sizeof(double))*n)/(1024);
|
||||
float num_bytes_wave = (1.0*num_bytes_kb)/(1.0*tot_waves);
|
||||
|
||||
printf("sw thinks it moved %f KB per wave \n", (2.0*num_bytes_wave));
|
||||
printf("Total threads: %d, Grid Size: %d block Size:%d, Wavefronts:%d:\n", n, gridSize, blockSize, tot_waves);
|
||||
printf("Launching the kernel on the GPU\n");
|
||||
|
||||
// Execute the kernel
|
||||
for(int i = 0; i < numIter; i++){
|
||||
hipLaunchKernelGGL(vecCopy, dim3(gridSize), dim3(blockSize), 0, 0, d_a, d_b, d_c, n, stride);
|
||||
hipDeviceSynchronize();
|
||||
printf("Finished executing kernel\n");
|
||||
// Copy array back to host
|
||||
HIP_ASSERT(hipMemcpy( h_c, d_c, bytes, hipMemcpyDeviceToHost));
|
||||
printf("Finished copying the output vector from the GPU to the CPU\n");
|
||||
|
||||
//Compute for CPU
|
||||
for(i=0; i <n; i++)
|
||||
{
|
||||
// h_verify_c[i*stride] = h_a[i*stride] + h_b[i*stride];
|
||||
h_verify_c[i*stride] = h_a[i*stride] ;
|
||||
}
|
||||
|
||||
|
||||
//Verfiy results
|
||||
for(i=0; i <n; i++)
|
||||
{
|
||||
if (abs(h_verify_c[i*stride] - h_c[i*stride]) > 1e-5)
|
||||
{
|
||||
printf("Error at position i %d, Expected: %f, Found: %f \n", i, h_c[i], d_c[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// printf("Printing few elements from the output vector\n");
|
||||
|
||||
for(i=0; i < 20; i++)
|
||||
{
|
||||
// printf("Output[%d]:%f\n",i, h_c[i]);
|
||||
// Optionally, launch a second kernel. Only here for testing purposes
|
||||
if (multiKernel){
|
||||
hipLaunchKernelGGL(vecCopy_2, dim3(gridSize), dim3(blockSize), 0, 0, d_a, d_b, d_c, n, stride);
|
||||
hipDeviceSynchronize();
|
||||
printf("Finished executing kernel\n");
|
||||
}
|
||||
}
|
||||
|
||||
// Copy array back to host
|
||||
HIP_ASSERT(hipMemcpy( h_c, d_c, bytes, hipMemcpyDeviceToHost));
|
||||
printf("Finished copying the output vector from the GPU to the CPU\n");
|
||||
|
||||
printf("Releasing GPU memory\n");
|
||||
|
||||
// Release device memory
|
||||
HIP_ASSERT(hipFree(d_a));
|
||||
HIP_ASSERT(hipFree(d_b));
|
||||
HIP_ASSERT(hipFree(d_c));
|
||||
|
||||
// Release host memory
|
||||
printf("Releasing CPU memory\n");
|
||||
free(h_a);
|
||||
free(h_b);
|
||||
free(h_c);
|
||||
|
||||
return 0;
|
||||
// Compute for CPU
|
||||
for(int i=0; i<n; i++) {
|
||||
// h_verify_c[i*stride] = h_a[i*stride] + h_b[i*stride];
|
||||
h_verify_c[i*stride] = h_a[i*stride] ;
|
||||
}
|
||||
|
||||
// Verfiy results
|
||||
for(int i = 0; i < n; i++) {
|
||||
if (abs(h_verify_c[i*stride] - h_c[i*stride]) > 1e-5)
|
||||
printf("Error at position i %d, Expected: %f, Found: %f \n", i, h_c[i], d_c[i]);
|
||||
}
|
||||
//printf("Printing few elements from the output vector\n");
|
||||
for(int i = 0; i < 20; i++) {
|
||||
//printf("Output[%d]:%f\n",i, h_c[i]);
|
||||
}
|
||||
|
||||
printf("Releasing GPU memory\n");
|
||||
|
||||
// Release device memory
|
||||
HIP_ASSERT(hipFree(d_a));
|
||||
HIP_ASSERT(hipFree(d_b));
|
||||
HIP_ASSERT(hipFree(d_c));
|
||||
|
||||
// Release host memory
|
||||
printf("Releasing CPU memory\n");
|
||||
free(h_a);
|
||||
free(h_b);
|
||||
free(h_c);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Atsaukties uz šo jaunā problēmā
Block a user