diff --git a/sample/vcopy.cpp b/sample/vcopy.cpp index 0eed48711a..7f4620eacf 100644 --- a/sample/vcopy.cpp +++ b/sample/vcopy.cpp @@ -5,157 +5,187 @@ #include using namespace std; - #define HIP_ASSERT(x) (assert((x)==hipSuccess)) // HIP kernel. Each thread takes care of one element of c -__global__ void vecCopy(double *a, double *b, double *c, int n,int stride) -{ +__global__ void vecCopy(double *a, double *b, double *c, int n, int stride) { // Get our global thread ID int id = blockIdx.x*blockDim.x+threadIdx.x; - - if (id < n) { - c[id] = a[id]; - } + if (id < n) + c[id] = a[id]; } -void usage() -{ - printf("\nUsage: vcopy [n] [blocksize] {dev}\n\n"); +// Duplicate of vecCopy kernel. Included for testing purposes +__global__ void vecCopy_2(double *a, double *b, double *c, int n, int stride) { + // Get our global thread ID + int id = blockIdx.x*blockDim.x+threadIdx.x; + if (id < n) + c[id] = a[id]; +} + +void usage() { + std::cout << "Usage: vcopy [OPTIONS]\n"; + std::cout << "Required:\n"; + std::cout << " -n/--numThreads Set the num of threads\n"; + std::cout << " -b/--blockSize Set the block size\n"; + std::cout << "Optional:\n"; + std::cout << " -d/--dev Set the device ID [Default: 0]\n"; + std::cout << " -i/--iter Set the num of iterations [Default: 1]\n"; + std::cout << " -h/--help Display this help message\n"; exit(1); return; } -int main( int argc, char* argv[] ) -{ - // Size of vectors - int n; //64 MB - int blockSize, gridSize; - - // Host input vectors - double *h_a; - double *h_b; - //Host output vector - double *h_c; - //Host output vector for verification - double *h_verify_c; - - // Device input vectors - double *d_a; - double *d_b; - //Device output vector - double *d_c; +int main(int argc, char* argv[]) { + // Size of vectors + int n; //64 MB + int blockSize, gridSize; - int stride = 1; - int devId = 0; + // Launch multiple kernels + bool multiKernel = false; - if(argc < 3) - usage(); - if(argc > 3) - devId = atoi(argv[3]); + // Host input vectors + double *h_a; + double *h_b; + //Host output vector + double *h_c; + //Host output vector for verification + double *h_verify_c; - n = atoi(argv[1]); - blockSize = atoi(argv[2]); + // Device input vectors + double *d_a; + double *d_b; + // Device output vector + double *d_c; - int numGpuDevices; - HIP_ASSERT(hipGetDeviceCount(&numGpuDevices)); - if(devId >= numGpuDevices) - devId = 0; - HIP_ASSERT(hipSetDevice(devId)); + int stride = 1; + int devId = 0; + int numIter = 1; - printf("vcopy testing on GCD %d\n", devId); - - assert(n > 0); - assert(blockSize > 0); - - // Size, in bytes, of each vector - size_t bytes = n*sizeof(double)*stride; - - // Allocate memory for each vector on host - h_a = (double*)malloc(bytes); - h_b = (double*)malloc(bytes); - h_c = (double*)malloc(bytes); - h_verify_c = (double*)malloc(bytes); - - printf("Finished allocating vectors on the CPU\n"); - // Allocate memory for each vector on GPU - HIP_ASSERT(hipMalloc(&d_a, bytes)); - HIP_ASSERT(hipMalloc(&d_b, bytes)); - HIP_ASSERT(hipMalloc(&d_c, bytes)); - - printf("Finished allocating vectors on the GPU\n"); - - int i; - // Initialize vectors on host - for( i = 0; i < n; i++ ) { - h_a[i] = i; - h_b[i] = i; - } - - - // Copy host vectors to device - HIP_ASSERT(hipMemcpy( d_a, h_a, bytes, hipMemcpyHostToDevice)); - HIP_ASSERT(hipMemcpy(d_b, h_b, bytes, hipMemcpyHostToDevice)); - - printf("Finished copying vectors to the GPU\n"); - - - // Number of thread blocks in grid - gridSize = (int)ceil((float)n/blockSize); - //gridSize = 1; + for (int i = 0; i < argc; i++){ + std::string arg = argv[i]; + if ((arg == "--blockSize" || arg == "-b") && i+1 < argc) + blockSize = std::atoi(argv[i+1]); - int tot_waves = (blockSize*gridSize)/64; - float num_bytes_kb = ((sizeof(double))*n)/(1024); - float num_bytes_wave = (1.0*num_bytes_kb)/(1.0*tot_waves); + else if ((arg == "--vec" || arg == "-n") && i+1 < argc) + n = std::atoi(argv[i+1]); + + else if ((arg == "--device" || arg == "-d") && i+1 < argc) + devId = std::atoi(argv[i+1]); + + else if ((arg == "--iter" || arg == "-i") && i+1 < argc) + numIter = std::atoi(argv[i+1]); + + else if (arg == "--multikernel") + multiKernel = true; - printf("sw thinks it moved %f KB per wave \n", (2.0*num_bytes_wave)); - - printf("Total threads: %d, Grid Size: %d block Size:%d, Wavefronts:%d:\n", n, gridSize, blockSize, tot_waves); - printf("Launching the kernel on the GPU\n"); - // Execute the kernel - hipLaunchKernelGGL(vecCopy, dim3(gridSize), dim3(blockSize), 0, 0, d_a, d_b, d_c, n,stride); - hipDeviceSynchronize( ); + else if (arg == "--help" || arg == "-h") + usage(); + } + + if (blockSize == 0) + usage(); + + if (n == 0) + usage(); + + + int numGpuDevices; + HIP_ASSERT(hipGetDeviceCount(&numGpuDevices)); + if(devId >= numGpuDevices) + devId = 0; + HIP_ASSERT(hipSetDevice(devId)); + + printf("vcopy testing on GCD %d\n", devId); + + assert(n > 0); + assert(blockSize > 0); + + // Size, in bytes, of each vector + size_t bytes = n*sizeof(double)*stride; + + // Allocate memory for each vector on host + h_a = (double*)malloc(bytes); + h_b = (double*)malloc(bytes); + h_c = (double*)malloc(bytes); + h_verify_c = (double*)malloc(bytes); + + printf("Finished allocating vectors on the CPU\n"); + + // Allocate memory for each vector on GPU + HIP_ASSERT(hipMalloc(&d_a, bytes)); + HIP_ASSERT(hipMalloc(&d_b, bytes)); + HIP_ASSERT(hipMalloc(&d_c, bytes)); + + printf("Finished allocating vectors on the GPU\n"); + + // Initialize vectors on host + for(int i = 0; i < n; i++) { + h_a[i] = i; + h_b[i] = i; + } + + // Copy host vectors to device + HIP_ASSERT(hipMemcpy(d_a, h_a, bytes, hipMemcpyHostToDevice)); + HIP_ASSERT(hipMemcpy(d_b, h_b, bytes, hipMemcpyHostToDevice)); + + printf("Finished copying vectors to the GPU\n"); + + // Number of thread blocks in grid + gridSize = (int)ceil((float)n/blockSize); + int tot_waves = (blockSize*gridSize)/64; + float num_bytes_kb = ((sizeof(double))*n)/(1024); + float num_bytes_wave = (1.0*num_bytes_kb)/(1.0*tot_waves); + + printf("sw thinks it moved %f KB per wave \n", (2.0*num_bytes_wave)); + printf("Total threads: %d, Grid Size: %d block Size:%d, Wavefronts:%d:\n", n, gridSize, blockSize, tot_waves); + printf("Launching the kernel on the GPU\n"); + + // Execute the kernel + for(int i = 0; i < numIter; i++){ + hipLaunchKernelGGL(vecCopy, dim3(gridSize), dim3(blockSize), 0, 0, d_a, d_b, d_c, n, stride); + hipDeviceSynchronize(); printf("Finished executing kernel\n"); - // Copy array back to host - HIP_ASSERT(hipMemcpy( h_c, d_c, bytes, hipMemcpyDeviceToHost)); - printf("Finished copying the output vector from the GPU to the CPU\n"); - - //Compute for CPU - for(i=0; i 1e-5) - { - printf("Error at position i %d, Expected: %f, Found: %f \n", i, h_c[i], d_c[i]); - } - } - - // printf("Printing few elements from the output vector\n"); - - for(i=0; i < 20; i++) - { - // printf("Output[%d]:%f\n",i, h_c[i]); + // Optionally, launch a second kernel. Only here for testing purposes + if (multiKernel){ + hipLaunchKernelGGL(vecCopy_2, dim3(gridSize), dim3(blockSize), 0, 0, d_a, d_b, d_c, n, stride); + hipDeviceSynchronize(); + printf("Finished executing kernel\n"); } + } + + // Copy array back to host + HIP_ASSERT(hipMemcpy( h_c, d_c, bytes, hipMemcpyDeviceToHost)); + printf("Finished copying the output vector from the GPU to the CPU\n"); - printf("Releasing GPU memory\n"); - - // Release device memory - HIP_ASSERT(hipFree(d_a)); - HIP_ASSERT(hipFree(d_b)); - HIP_ASSERT(hipFree(d_c)); - - // Release host memory - printf("Releasing CPU memory\n"); - free(h_a); - free(h_b); - free(h_c); - - return 0; + // Compute for CPU + for(int i=0; i 1e-5) + printf("Error at position i %d, Expected: %f, Found: %f \n", i, h_c[i], d_c[i]); + } + //printf("Printing few elements from the output vector\n"); + for(int i = 0; i < 20; i++) { + //printf("Output[%d]:%f\n",i, h_c[i]); + } + + printf("Releasing GPU memory\n"); + + // Release device memory + HIP_ASSERT(hipFree(d_a)); + HIP_ASSERT(hipFree(d_b)); + HIP_ASSERT(hipFree(d_c)); + + // Release host memory + printf("Releasing CPU memory\n"); + free(h_a); + free(h_b); + free(h_c); + + return 0; }